Homework 11

Batch processing: repeat the lecture materials, but with a different simulated data set (I don’t have a real data set to use yet).

####################################
# FUNCTION: file_builder
# purpose: create a set of random files for regression
# input: file_n = number of files to create
#       : file_folder = name of folder for random files; must end in "/"
#                       because file paths are built by pasting onto it
#                       (the folder is created if it does not exist)
#       : file_size = c(min,max) number of rows in file
#       : file_na = number on average of NA values per column
#                   (Poisson mean; capped at the number of rows)
# output: set of random files, each with a commented metadata header
#         followed by a comma-separated table with columns var_x, var_y
#----------------------------------
file_builder <- function(file_n=10, # create 10 files
                         file_folder="RandomFilesHW11/",
                         file_size=c(10,20), # between 10 and 20 rows
                         file_na=5) { # 5 NAs on average per column
  # make sure the target folder exists; quiet when it is already there
  dir.create(file_folder, showWarnings = FALSE, recursive = TRUE)

  # candidate row counts; indexing into this vector (instead of calling
  # sample() on it) avoids sample()'s special case where a single number n
  # is treated as 1:n, which would ignore file_size when min == max
  row_choices <- file_size[1]:file_size[2]

  for (i in seq_len(file_n)) {
    file_length <- row_choices[sample.int(length(row_choices), size = 1)]
    var_x <- runif(file_length) # create random x
    var_y <- runif(file_length) # create random y
    df <- data.frame(var_x, var_y) # bind into a data frame

    # Poisson draw for the NA count, capped at the number of rows so that
    # sampling rows without replacement can never fail
    bad_vals <- min(rpois(n = 1, lambda = file_na), nrow(df))
    df[sample.int(nrow(df), size = bad_vals), 1] <- NA # random NA in var_x
    df[sample.int(nrow(df), size = bad_vals), 2] <- NA # random NA in var_y

    # create path: RandomFile/ranFile001.csv (for e.g.); the zero padding
    # keeps file 001 sorted before file 010, etc.
    file_label <- paste0(file_folder,
                         "ranFile",
                         formatC(i, width = 3, format = "d", flag = "0"),
                         ".csv")

    # write the metadata header and the data through ONE open connection:
    # no stray console output, and no "appending column names" warning
    # because write.table() is not asked to append
    con <- file(file_label, open = "w")
    tryCatch({
      writeLines(c("# Simulated random data file for batch processing",
                   paste0("# timestamp: ", as.character(Sys.time())),
                   "# HRS",
                   "# ------------------------",
                   ""),
                 con = con)
      write.table(x = df,
                  file = con,
                  sep = ",",
                  row.names = FALSE)
    }, finally = close(con)) # always release the connection
  }
}

####################################
# FUNCTION: reg_stats
# purpose: fits linear model, extracts statistics
# input: 2-column data frame (x in column 1, y in column 2);
#        if NULL, a random 10-row data frame is used
# output: list with slope, p_val, and r2; all three are NA when the slope
#         cannot be estimated (fewer than 2 complete rows, or constant x)
#----------------------------------
reg_stats <- function(d=NULL) {
  if (is.null(d)) {
    x_var <- runif(10)
    y_var <- runif(10)
    d <- data.frame(x_var, y_var)
  }
  x <- d[, 1]
  y <- d[, 2]

  # lm() errors with no usable rows, and summary() drops the slope row when
  # x does not vary, so indexing coefficient row 2 would fail; return NA
  # statistics instead of stopping a whole batch run
  usable <- complete.cases(x, y)
  if (sum(usable) < 2 || length(unique(x[usable])) < 2) {
    return(list(slope = NA_real_, p_val = NA_real_, r2 = NA_real_))
  }

  fit_summary <- summary(lm(y ~ x))
  coefs <- fit_summary$coefficients # row 2 = slope term
  list(slope = coefs[2, 1],  # estimate
       p_val = coefs[2, 4],  # Pr(>|t|)
       r2 = fit_summary$r.squared)
}

### Body of script for batch processing of regression models

# Global variables
file_folder <- "RandomFilesHW11/" # keep the trailing "/": file paths are built by pasting onto it
n_files <- 10
file_out <- "StatsSummaryHW11.csv"

# Create the folder that will hold the random data sets.
# showWarnings = FALSE keeps re-runs quiet when the folder already exists.
dir.create(file_folder, showWarnings = FALSE)

## Warning in dir.create(file_folder): 'RandomFilesHW11' already exists

# pass the folder explicitly so the files land where the rest of the script
# looks for them, even if file_folder is changed above
file_builder(file_n = n_files, file_folder = file_folder)

## ""

## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file

## ""

## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file

## ""

## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file

## ""

## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file

## ""

## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file

## ""

## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file

## ""

## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file

## ""

## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file

## ""

## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file

## ""

## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file

# list only the CSV data files, so any other file that ends up in the
# folder is not read in as regression data
file_names <- list.files(path = file_folder, pattern = "\\.csv$")
head(file_names)

## [1] "ranFile001.csv" "ranFile002.csv" "ranFile003.csv" "ranFile004.csv"
## [5] "ranFile005.csv" "ranFile006.csv"

# Create data frame to hold file summary statistics: one row per file
# actually found in the folder. The statistics start as numeric NA and are
# filled in by the batch loop. Sizing by length(file_names) (not n_files)
# keeps every column the same length even if the file count differs.
ID <- seq_along(file_names)
file_name <- file_names
slope <- rep(NA_real_, length(file_names))
p_val <- rep(NA_real_, length(file_names))
r2 <- rep(NA_real_, length(file_names))

stats_out <- data.frame(ID, file_name, slope, p_val, r2)

# batch process by looping through individual files and reading them in

for (i in seq_along(file_names)) {
  # read in next data file (header lines starting with "#" are skipped by
  # read.table's default comment handling)
  raw_data <- read.table(file = paste0(file_folder, file_names[i]),
                         sep = ",",
                         header = TRUE)
  d_clean <- raw_data[complete.cases(raw_data), ] # get clean cases

  # a regression needs at least two complete rows; leave this file's
  # statistics as NA rather than letting one sparse file stop the batch
  if (nrow(d_clean) < 2) {
    warning("skipping ", file_names[i],
            ": fewer than 2 complete rows", call. = FALSE)
    next
  }

  # un-list the regression stats and copy into the last 3 columns
  stats_out[i, 3:5] <- unlist(reg_stats(d_clean))
}


# set up output file and incorporate time stamp and minimal metadata
# cat() writes the header itself; wrapping it in write.table() only printed
# a stray "" to the console, so it is called directly here
# NOTE(review): these initials are "NJG" while file_builder stamps "HRS" - confirm which is intended
cat("# Summary stats for ",
    "batch processing of regression models", "\n",
    "# timestamp: ", as.character(Sys.time()), "\n",
    "# NJG", "\n",
    "# ------------------------", "\n",
    "\n",
    file = file_out,
    sep = "")

## ""

# now add the data frame below the metadata header
# the file is opened in append mode as a connection, so write.table() adds
# to the existing header without its "appending column names" warning
out_con <- file(file_out, open = "a")
tryCatch(
  write.table(x = stats_out,
              file = out_con,
              row.names = FALSE,
              col.names = TRUE,
              sep = ","),
  finally = close(out_con) # always release the connection
)

## Warning in write.table(x = stats_out, file = file_out, row.names = FALSE, :
## appending column names to file

## Organizing Source Files

# Logging
library(logger)
log_layout(layout_glue_colors)
log_threshold(TRACE)

# temporary file that records the log; log statements are sent to it
# through the tee appender
mylog <- tempfile()
log_appender(appender_tee(mylog))

# using log statements (called without a message here)
log_info()
log_trace()
log_debug()

# one debug entry per iteration
for (i in seq_len(5)) {
  log_debug("running file #", i)
}

# consider using log statements as annotation to code:
# z() is a short wrapper that sends its argument to log_info()
z <- function(x = NULL) {
  log_info(x)
}

# now create a snippet

#---------------------------------------
z("read input")
# 

#---------------------------------------
z("source functions")
# 

# copy the contents of the temporary log into logfile.txt
cat(
  readLines(mylog),
  sep = "\n",
  file = "logfile.txt"
)

# write the entire logfile once to the screen, framed by separator lines
cat(
  "#---------------",
  "logfile.txt: ",
  readLines(mylog),
  "#---------------",
  sep = "\n"
)

## #---------------
## logfile.txt: 
## INFO [2022-04-27 10:18:35] 
## TRACE [2022-04-27 10:18:35] 
## DEBUG [2022-04-27 10:18:35] 
## DEBUG [2022-04-27 10:18:35] running file #1
## DEBUG [2022-04-27 10:18:35] running file #2
## DEBUG [2022-04-27 10:18:35] running file #3
## DEBUG [2022-04-27 10:18:35] running file #4
## DEBUG [2022-04-27 10:18:35] running file #5
## INFO [2022-04-27 10:18:35] read input
## INFO [2022-04-27 10:18:35] source functions
## #---------------

# clean up: delete the temporary log file from disk, then drop the
# variable holding its path from the workspace
unlink(mylog)
rm(list = "mylog")


## Using a progress bar (helpful so we can see how far the code got in case it fails)

# "Old school"
# At each pass, pause for 0.1 sec, then report progress: print the counter
# when it is divisible by 10, a period when it is divisible by 5 (but not
# 10), and nothing otherwise.
for (i in seq_len(10)) {
  Sys.sleep(0.1)
  if (i %% 10 == 0) {
    cat(i)
  } else if (i %% 5 == 0) {
    cat(".")
  }
}

## .10

Homework 11

Hannah Shafer

4/6/2022