#clear objects rm(list = ls()) #clear memory gc() #Load libraries library("sandwich") # for use with Sandwhich estimator library("lmtest") # for use with linear models library("magrittr") # so we can use commands like %>% which pipes the result into the next command library("dplyr") # so we can use commands like mutate and summarize library("data.table") # so we can use data tables #Set working directory. Remember to put the slashes as forward slashes: setwd('C:/Users/Benja/Downloads') #Read Data loadedData <- read.csv("DataSetWithManyColumns.csv" , header=TRUE, sep=",") #Get Variables namelist<-names(loadedData) #Create a table to accept the results df2 = data.table(coefName=rep(c("x"),each=1) , coefValue = rnorm(1) , coefPValue = rnorm(1) , coefSPValue = rnorm(1) , iterationCounter = rnorm(1)) # delete the first row which was just a placeholder df2 <- df2[-c(1),] ######################################## #Begin Loop ######################################## #Set time ptm <- proc.time() iteratorCount = 1 #Create loop that loops through various regressions for (i in 191:7){ #print where we are so one can monitor the progress print(i) #show memory so that one can monitor if it is about to run out. print(memory.size()) #gc stands for garbage collection and it means we are freeing memory gc() #print a status of how long the iteration took print ( proc.time() - ptm ) #reset the timer ptm <- proc.time() #perform 2nd, inner, loop for (j in 7:191){ # create a table that is temporarily used within the 2nd inner loop; this was necessary to manage performance of the machine. df1 = data.table(coefName=rep(c("x"),each=1) , coefValue = rnorm(1) , coefPValue = rnorm(1) , coefSPValue = rnorm(1) , iterationCounter = rnorm(1)) # delete the first row which was just a placeholder df1 <- df1[-c(1),] #Create 3rd, inner, loop for (k in 7:191){ if(i==j | i==k | j==k | j==i) { #do nothing because there are duplicate fields in the regression } else { # Run regression lmOutput <- lm(reformulate(namelist[c(i,j, k)],"Difference"), weight = TotalVotes, data = loadedData) tester <- summary(lmOutput, complete=TRUE) tester2 <- coeftest(lmOutput, vcov = sandwich) # Obtain values from regression df1 <- df1 %>% add_row( coefName = namelist[i],coefValue=tester$coefficients[2,1], coefPValue=tester$coefficients[2,4], coefSPValue=tester2[2,4], iterationCounter = iteratorCount) df1 <- df1 %>% add_row( coefName = namelist[j],coefValue=tester$coefficients[3,1], coefPValue=tester$coefficients[3,4], coefSPValue=tester2[3,4], iterationCounter = iteratorCount) df1 <- df1 %>% add_row( coefName = namelist[k],coefValue=tester$coefficients[4,1], coefPValue=tester$coefficients[4,4], coefSPValue=tester2[4,4], iterationCounter = iteratorCount) iteratorCount = iteratorCount + 1 } #end else } # end loop 3 # bind the results of loop 2 to the big table with all of the results # I found this to be the best way to manage the memory/throughput of the machine df2 <-rbind(df2, df1) } # end loop 2 } # end loop 3 ######################################## #end loop ######################################## #Let user know the loop has completed; it took my computer 24 hours print("Loop of regressions completed.") #next save giant dataset as an RDA file. save(df2,file="df2.Rda") #Then if you wish to load it at a later time, execute the following: #load("df2.Rda") # Create a table indicating whether the coefficient is positive or negative. Note this is all one statement because the %>% pipes one result to the next. See library magrittr for more info. df3 <- df2 %>% mutate(coefPositive = ifelse(coefValue >= 0, 1, 0) , coefNegative = ifelse(coefValue < 0, 1, 0)) %>% group_by(coefName) %>% summarise( coefPositive = sum(coefPositive) , coefNegative = sum(coefNegative) ) %>% mutate(coefSign = ifelse(coefPositive>coefNegative,1, -1 )) #create right function right = function(text, num_char) { substr(text, nchar(text) - (num_char-1), nchar(text)) } #Create broad category field for display df3$broadCategory = ifelse(nchar(df3$coefName) ==2 , "Random Variable", ifelse(right(df3$coefName,4)=="Flag","Election Machine","Demographic Field")) #Create specific category field for display df3$specificCategory = ifelse(nchar(df3$coefName) ==2 , substr(df3$coefName, 1, 2), ifelse(right(df3$coefName,4)=="Flag", df3$coefName ,"Demographic Field")) #left join coefficient signs and categories to the dataset df2 <- merge(x=df2,y=df3,by="coefName",all.x=TRUE) # provide a summary of results. output <- df2 %>% mutate(coefExists = ifelse(coefValue != 0, 1, 0) , coefPositive2 = ifelse(coefValue > 0, 1, 0) , inconsistentSign = ifelse(coefValue*coefSign < 0 , 1, 0) , confidentAndConsistentSign = ifelse(coefValue*coefSign > 0 & coefPValue<.05 , 1, 0) , confidentAndConsistentSSign = ifelse(coefValue*coefSign > 0 & coefSPValue<.05 , 1, 0) , pValueSignificantAt5 = ifelse(coefPValue<.05,1,0) , sPValueSignificantAt5 = ifelse(coefSPValue<.05,1,0) , pValueExists = ifelse(is.numeric(coefPValue), 1, 0 )) %>% group_by(coefName) %>% summarise( signOfCoefficient = median(coefSign) , numberTimesInARegression = sum(coefExists) , percentInconsistentSign = sum(inconsistentSign) / sum(coefExists) , percentPValueSignificantAt5 = sum(pValueSignificantAt5)/sum(pValueExists) , percentSPValueSignificantAt5 = sum(sPValueSignificantAt5)/sum(pValueExists) , percentSignificantAt5AndConsistentSign = sum(confidentAndConsistentSign) / sum(coefExists) , percentSSignificantAt5AndConsistentSign = sum(confidentAndConsistentSSign) / sum(coefExists) , maxPValue = max(coefPValue) , maxSPValue = max(coefSPValue) , minPValue = min(coefPValue) , minSPValue = min(coefSPValue) , meanPValue = mean(coefPValue) , meanPSValue = mean(coefSPValue) , medianPValue = median(coefPValue) , medianSPValue = median(coefSPValue) ) #View results View(output) #write this output to CSV (it is too big for a clipboard) #execute code to output the "output" to csv. write.csv(output,"outputSummary.csv", row.names = FALSE)