## SIG_WNV_Flow_Data_Cleaning.r


## Load functions for parsing the flow cytometry spreadsheets
## The gdata library is necessary for reading Excel spreadsheets; it will be loaded as well.
source('./scripts/flow_data_cleaning_functions.r')

## View help documentation on the functions listed above
describe(read_flow_exp_file)

## Load all expected flow variables (expected column names).
## Only the first column of the "Flow Data" sheet is used.
flow_cn = read.xls(xls="./data/WNV_Data_Dictionary.xlsx", sheet="Flow Data", as.is=TRUE)
flow_cn = flow_cn[,1]

## Directory holding the data (the working directory is not changed;
## all paths below are built with file.path)
## Note: you will have to change the file path
flow_dir = "/Users/mooneymi/Documents/MyDocuments/SystemsImmunogenetics/WNV/Lund_Flow_fixed_Apr_26"

## Get a list of data files to read (in this case all flow spreadsheets begin with the prefix 'Expt').
## The pattern is anchored at both ends so that only names starting with 'Expt' and
## ending in .xls/.xlsx are picked up; Excel lock files ('~$Expt...') and backup
## copies ('Expt1.xls.bak') are skipped.
flow_files = list.files(flow_dir, pattern="^Expt.*\\.xlsx?$")

print(flow_files)

## Iterate through all the files, parse each, and merge all data into a single dataframe

## Fail early with a clear message instead of leaving flow_all undefined
if (length(flow_files) == 0) {
    stop("No flow spreadsheets found in: ", flow_dir)
}

## Parse every file first and collect the results in a preallocated list;
## the list is combined once at the end rather than growing a dataframe
## with rbind() on every iteration.
flow_list = vector("list", length(flow_files))
for (i in seq_along(flow_files)) {
    print(flow_files[i])
    flow_dat = read_flow_exp_file(file.path(flow_dir, flow_files[i]), flow_cn)

    ## Check if there are any unexpected columns; they are appended to the
    ## expected column names so later files are parsed with the extended list
    new_columns = setdiff(colnames(flow_dat), flow_cn)
    if (length(new_columns) > 0) {
        flow_cn = c(flow_cn, new_columns)
    }
    flow_list[[i]] = flow_dat
}

## Fill extra columns with NAs: any file that lacks a column seen in another
## file gets that column added as all-NA (a safety net in case
## read_flow_exp_file does not already add them), and every dataframe is put
## in the same column order so the pieces can be row-bound.
flow_list = lapply(flow_list, function(dat) {
    for (col in setdiff(flow_cn, colnames(dat))) {
        dat[[col]] = rep(NA, nrow(dat))
    }
    dat[, flow_cn, drop=FALSE]
})

## Merge data
flow_all = do.call(rbind, flow_list)

## Report the number of rows and columns in the merged data
dim(flow_all)

## List any expected column names that are absent from the merged data
## (an empty result means nothing is missing)
setdiff(flow_cn, names(flow_all))

## Preview the first 10 rows of the first 11 columns
flow_all[seq_len(10), seq_len(11)]

## Put the columns in the order given by flow_cn and label every row with the lab name
flow_all = flow_all[, flow_cn]
flow_all$Lab = "Lund"

## Normalize ID and Mating: remove all spaces, then replace upper-case 'X' with 'x'
flow_all$ID = gsub("X", "x", gsub(" ", "", flow_all$ID))
flow_all$Mating = gsub("X", "x", gsub(" ", "", flow_all$Mating))

## Store UW_Line and Timepoint as numbers
flow_all$UW_Line = as.numeric(flow_all$UW_Line)
flow_all$Timepoint = as.numeric(flow_all$Timepoint)

## For validation data mock animals do not have IDs: wherever RIX_ID is
## missing, blank out ID as well
no_rix_id = is.na(flow_all$RIX_ID)
flow_all$ID[no_rix_id] = NA

## Preview the first 10 rows of the first 11 columns after formatting
flow_all[seq_len(10), seq_len(11)]

## Check for duplicate IDs within the new data.
## A sample is identified by its ID combined with its tissue.
new_flow_ids = paste(flow_all$ID, flow_all$Tissue, sep='_')

## Rows without an ID are left out of the count: paste() turns a missing ID
## into the string "NA", so all ID-less rows of a tissue would share one key
## (e.g. "NA_brain") and be reported as duplicates of each other.
sum(duplicated(new_flow_ids[!is.na(flow_all$ID)]))

## Load the previously cleaned flow data release
## Note: you will have to change the file path
cleaned_dir = '/Users/mooneymi/Documents/MyDocuments/SystemsImmunogenetics/WNV/Cleaned_Data_Releases/23-Mar-2016'
flow_prev_file = file.path(cleaned_dir, 'Lund_Flow_21-Mar-2016_final.xlsx')

## Empty cells, single spaces, "NA" and Excel division errors are all read as missing
flow_prev = read.xls(flow_prev_file,
                     na.strings=c(""," ", "NA", "#DIV/0!"),
                     header=TRUE, as.is=TRUE)

## Report the number of rows and columns in the previous release
dim(flow_prev)

## Check for duplicate IDs: animals present in both the previous release and
## the new data, matched separately for brain and spleen.
dup_ids1 = intersect(flow_prev$ID[flow_prev$Tissue=='brain'], flow_all$ID[flow_all$Tissue=='brain'])
dup_ids2 = intersect(flow_prev$ID[flow_prev$Tissue=='spleen'], flow_all$ID[flow_all$Tissue=='spleen'])

## Missing IDs must not count as matches. intersect() and %in% treat NA as an
## ordinary value, so leaving NA in these vectors would mark every previous
## row without an ID as a duplicate and silently drop it below.
dup_ids1 = dup_ids1[!is.na(dup_ids1)]
dup_ids2 = dup_ids2[!is.na(dup_ids2)]

## Overwrite old data with new: locate the previous rows that are superseded
idx1 = which(flow_prev$ID %in% dup_ids1 & flow_prev$Tissue=='brain')
idx2 = which(flow_prev$ID %in% dup_ids2 & flow_prev$Tissue=='spleen')
idx_dups = c(idx1, idx2)
print(length(idx_dups))

## seq_len() instead of 1:nrow() so an empty previous release yields no rows
## rather than the indices c(1, 0)
idx_to_keep = setdiff(seq_len(nrow(flow_prev)), idx_dups)
flow_prev = flow_prev[idx_to_keep,]

## Columns listed in flow_cn but absent from the previous release are added
## as all-NA so that selecting flow_cn below cannot fail
for (col in setdiff(flow_cn, colnames(flow_prev))) {
    flow_prev[[col]] = rep(NA, nrow(flow_prev))
}

## Previous (non-superseded) rows first, followed by all new rows
flow_all = rbind(flow_prev[, flow_cn], flow_all[, flow_cn])

## Check the dimensions of the dataframe
dim(flow_all)

## Convert columns 11 through 277 to numeric in one pass
## NOTE(review): the 11:277 range is hard-coded; presumably these are the
## measurement columns -- confirm against the data dictionary
data_cols = 11:277
flow_all[data_cols] = lapply(flow_all[data_cols], as.numeric)

## Derive cell counts and ratios. Starting from a copy of flow_all, each
## helper below is applied in turn to the result of the previous one;
## flow_all itself is left untouched.
flow_full = flow_all
calc_steps = list(
    calc_treg_counts,
    calc_tcell_counts,
    calc_ics_counts,
    calc_ics_percent_ratios,
    calc_ics_count_ratios,
    clean_inf_nan
)
for (calc_step in calc_steps) {
    flow_full = calc_step(flow_full)
}

## Report the number of rows and columns after the derived variables are added
dim(flow_full)

## Write the cleaned data (without derived variables) as a tab-delimited
## text file; missing values are written as empty fields
flow_out_file = file.path(flow_dir, 'Lund_Flow_12-May-2016_final.txt')
write.table(flow_all, file=flow_out_file, sep='\t', quote=TRUE, na='',
            row.names=FALSE, col.names=TRUE)

## Write the full data (with derived counts and ratios) in the same text
## format, and also as an R data file
flow_full_out_file = file.path(flow_dir, 'Lund_Flow_Full_12-May-2016_final.txt')
write.table(flow_full, file=flow_full_out_file, sep='\t', quote=TRUE, na='',
            row.names=FALSE, col.names=TRUE)
flow_full_rda_file = file.path(flow_dir, 'lund_flow_full_12-May-2016_final.rda')
save(flow_full, file=flow_full_rda_file)