#clear objects
rm(list = ls())
#clear memory 
gc()

#Load libraries
library("sandwich") # for use with Sandwhich estimator
library("lmtest")  # for use with linear models
library("magrittr") # so we can use commands like %>% which pipes the result into the next command
library("dplyr") # so we can use commands like mutate and summarize
library("data.table") # so we can use data tables

#Set working directory. Remember to put the slashes as forward slashes:
setwd('C:/Users/Benja/Downloads')

#Read Data
loadedData <- read.csv("DataSetWithManyColumns.csv" , header=TRUE, sep=",")

#Get Variables
namelist<-names(loadedData)

#Create a table to accept the results
df2 = data.table(coefName=rep(c("x"),each=1)
                 , coefValue = rnorm(1)
                 , coefPValue = rnorm(1)
                 , coefSPValue = rnorm(1)
                 , iterationCounter = rnorm(1))

# delete the first row which was just a placeholder
df2 <- df2[-c(1),] 

########################################
#Begin Loop
########################################

#Set time
ptm <- proc.time()
iteratorCount = 1
#Create loop that loops through various regressions
for (i in 191:7){
  #print where we are so one can monitor the progress
  print(i)
  #show memory so that one can monitor if it is about to run out.
  print(memory.size())
  #gc stands for garbage collection and it means we are freeing memory
  gc()
  #print a status of how long the iteration took 
  print ( proc.time() - ptm ) 
  #reset the timer
  ptm <- proc.time()
  #perform 2nd, inner, loop
  for (j in 7:191){
    # create a table that is temporarily used within the 2nd inner loop; this was necessary to manage performance of the machine.
    df1 = data.table(coefName=rep(c("x"),each=1)
                     , coefValue = rnorm(1)
                     , coefPValue  = rnorm(1)
                     , coefSPValue = rnorm(1)
                     , iterationCounter = rnorm(1))
    # delete the first row which was just a placeholder
    df1 <- df1[-c(1),] 
    
    #Create 3rd, inner, loop
    for (k in 7:191){
      if(i==j | i==k | j==k | j==i) {
        #do nothing because there are duplicate fields in the regression
      } 
      else {
        # Run regression
        lmOutput <- lm(reformulate(namelist[c(i,j, k)],"Difference"), weight = TotalVotes, data = loadedData)
        tester <- summary(lmOutput, complete=TRUE)
        tester2 <- coeftest(lmOutput, vcov = sandwich)
        
        # Obtain values from regression
        df1 <- df1 %>% add_row( coefName = namelist[i],coefValue=tester$coefficients[2,1], coefPValue=tester$coefficients[2,4], coefSPValue=tester2[2,4], iterationCounter = iteratorCount)
        df1 <- df1 %>% add_row( coefName = namelist[j],coefValue=tester$coefficients[3,1], coefPValue=tester$coefficients[3,4], coefSPValue=tester2[3,4], iterationCounter = iteratorCount)
        df1 <- df1 %>% add_row( coefName = namelist[k],coefValue=tester$coefficients[4,1], coefPValue=tester$coefficients[4,4], coefSPValue=tester2[4,4], iterationCounter = iteratorCount)
        iteratorCount = iteratorCount + 1               
      } #end else
    } # end loop 3
    # bind the results of loop 2 to the big table with all of the results
    # I found this to be the best way to manage the memory/throughput of the machine
    df2 <-rbind(df2, df1)  
  } # end loop 2
} # end loop 3

########################################
#end loop
########################################

#Let user know the loop has completed; it took my computer 24 hours
print("Loop of regressions completed.") 

#next save giant dataset as an RDA file.
save(df2,file="df2.Rda")

#Then if you wish to load it at a later time, execute the following:
#load("df2.Rda")

# Create a table indicating whether the coefficient is positive or negative. Note this is all one statement because the %>% pipes one result to the next. See library magrittr for more info.
df3 <- df2 %>% 
  mutate(coefPositive = ifelse(coefValue >=  0, 1, 0)
         , coefNegative = ifelse(coefValue <  0, 1, 0)) %>%
  group_by(coefName) %>% 
  summarise( coefPositive = sum(coefPositive)
             , coefNegative = sum(coefNegative) ) %>%
  mutate(coefSign  = ifelse(coefPositive>coefNegative,1, -1 ))

#create right function
right = function(text, num_char) {
  substr(text, nchar(text) - (num_char-1), nchar(text))
}

#Create broad category field for display
df3$broadCategory = ifelse(nchar(df3$coefName) ==2 , "Random Variable", 
                           ifelse(right(df3$coefName,4)=="Flag","Election Machine","Demographic Field"))

#Create specific category field for display
df3$specificCategory = ifelse(nchar(df3$coefName) ==2 , substr(df3$coefName, 1, 2), 
                              ifelse(right(df3$coefName,4)=="Flag", df3$coefName ,"Demographic Field"))

#left join coefficient signs and categories to the dataset
df2 <- merge(x=df2,y=df3,by="coefName",all.x=TRUE)

# provide a summary of results.
output <- df2 %>% 
  mutate(coefExists = ifelse(coefValue != 0, 1, 0)
         , coefPositive2 = ifelse(coefValue >  0, 1, 0)
         , inconsistentSign = ifelse(coefValue*coefSign < 0 , 1, 0) 
         , confidentAndConsistentSign = ifelse(coefValue*coefSign > 0 & coefPValue<.05 , 1, 0) 
         , confidentAndConsistentSSign = ifelse(coefValue*coefSign > 0 & coefSPValue<.05 , 1, 0) 
         , pValueSignificantAt5 = ifelse(coefPValue<.05,1,0)
         , sPValueSignificantAt5 = ifelse(coefSPValue<.05,1,0)
         , pValueExists = ifelse(is.numeric(coefPValue), 1, 0 ))  %>%
  group_by(coefName) %>% 
  summarise(
    signOfCoefficient = median(coefSign)
    , numberTimesInARegression = sum(coefExists)
    , percentInconsistentSign = sum(inconsistentSign) / sum(coefExists)
    , percentPValueSignificantAt5 = sum(pValueSignificantAt5)/sum(pValueExists)
    , percentSPValueSignificantAt5 = sum(sPValueSignificantAt5)/sum(pValueExists)
    , percentSignificantAt5AndConsistentSign = sum(confidentAndConsistentSign) / sum(coefExists)
    , percentSSignificantAt5AndConsistentSign = sum(confidentAndConsistentSSign) / sum(coefExists)
    , maxPValue = max(coefPValue)
    , maxSPValue = max(coefSPValue)
    , minPValue = min(coefPValue)             
    , minSPValue = min(coefSPValue)             
    , meanPValue = mean(coefPValue)
    , meanPSValue = mean(coefSPValue)
    , medianPValue = median(coefPValue)
    , medianSPValue = median(coefSPValue)
  )

#View results
View(output)

#write this output to CSV (it is too big for a clipboard)
#execute code to output the "output" to csv.
write.csv(output,"outputSummary.csv", row.names = FALSE)