-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathSIG_WNV_Flow_Data_Cleaning.r
121 lines (94 loc) · 4.06 KB
/
SIG_WNV_Flow_Data_Cleaning.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
## Load functions for parsing the flow cytometry spreadsheets
## The gdata library is necessary for reading Excel spreadsheets; it will be loaded as well.
source('./scripts/flow_data_cleaning_functions.r')
## View help documentation on the functions listed above
describe(read_flow_exp_file)
## Load all expected flow variables (expected column names).
## Only the first column of the "Flow Data" sheet is kept.
flow_cn = read.xls(xls="./data/WNV_Data_Dictionary.xlsx", sheet="Flow Data", as.is=TRUE)
flow_cn = flow_cn[,1]
## Directory holding the flow data (the working directory is not changed)
## Note: you will have to change the file path
flow_dir = "/Users/mooneymi/Documents/MyDocuments/SystemsImmunogenetics/WNV/Lund_Flow_fixed_Apr_26"
## Get a list of data files to read (in this case all flow spreadsheets begin with the prefix 'Expt').
## The pattern is anchored at both ends so that only names that start with 'Expt' and end in
## .xls or .xlsx are returned; an unanchored pattern would also pick up names that merely
## contain 'Expt', such as the '~$Expt...' lock files Excel creates while a workbook is open.
flow_files = list.files(flow_dir, pattern="^Expt.*\\.xlsx?$")
print(flow_files)
## Iterate through all the files, parse each, and merge all data into a single dataframe
if (length(flow_files) == 0) {
  ## Stop here: with no input files flow_all would never be created
  ## (or a stale flow_all left over from an earlier session would be used below)
  stop("No flow data files found in ", flow_dir)
}
flow_list = vector("list", length(flow_files))
## Unexpected columns discovered so far (these are also appended to flow_cn)
extra_cn = character(0)
for (i in seq_along(flow_files)) {
  file = flow_files[i]
  print(file)
  ## flow_cn is passed in its current state, i.e. including columns found in earlier files
  flow_dat = read_flow_exp_file(file.path(flow_dir, file), flow_cn)
  ## Check if there are any unexpected columns
  new_columns = setdiff(colnames(flow_dat), flow_cn)
  if (length(new_columns) > 0) {
    flow_cn = c(flow_cn, new_columns)
    extra_cn = c(extra_cn, new_columns)
  }
  flow_list[[i]] = flow_dat
}
## Fill the unexpected columns that a file does not have with NAs, and put every
## file's columns in flow_cn order so the pieces can be stacked.
## A file that lacks one of the originally expected columns still fails here
## ("undefined columns selected"), as it did before.
flow_list = lapply(flow_list, function(dat) {
  for (col in setdiff(extra_cn, colnames(dat))) {
    dat[,col] = NA
  }
  dat[,flow_cn, drop=FALSE]
})
## Merge data with a single rbind instead of growing flow_all inside the loop.
## The list is deliberately unnamed: rbind would otherwise prefix the row names.
flow_all = do.call(rbind, flow_list)
## Report the size of the merged dataframe
dim(flow_all)
## List any expected columns that are absent from the merged data
setdiff(flow_cn, colnames(flow_all))
flow_all[1:10,1:11]
## Put the columns in flow_cn order, tag the lab, and normalize formatting
flow_all = flow_all[, flow_cn]
flow_all$Lab = "Lund"
## ID and Mating: remove spaces first, then replace upper-case 'X' with lower-case 'x'
for (cross_col in c("ID", "Mating")) {
  flow_all[[cross_col]] = gsub("X", "x", gsub(" ", "", flow_all[[cross_col]]))
}
## UW_Line and Timepoint are stored as numbers
numeric_id_cols = c("UW_Line", "Timepoint")
flow_all[numeric_id_cols] = lapply(flow_all[numeric_id_cols], as.numeric)
## For validation data mock animals do not have IDs, set to NA
## (rows are selected by a missing RIX_ID)
no_rix_id = is.na(flow_all$RIX_ID)
flow_all$ID[no_rix_id] = NA
flow_all[1:10,1:11]
## Count repeated ID/Tissue combinations
## NOTE(review): paste() turns a missing ID into the text "NA", so rows with NA IDs
## and the same Tissue are counted as duplicates of each other here.
new_flow_ids = paste(flow_all$ID, flow_all$Tissue, sep='_')
sum(duplicated(new_flow_ids))
## Read the previously cleaned data
## Note: you will have to change the file path
cleaned_dir = '/Users/mooneymi/Documents/MyDocuments/SystemsImmunogenetics/WNV/Cleaned_Data_Releases/23-Mar-2016'
## Empty cells, single spaces, "NA" and Excel's "#DIV/0!" are all read as NA
flow_prev = read.xls(file.path(cleaned_dir, 'Lund_Flow_21-Mar-2016_final.xlsx'),
                     header=TRUE, as.is=TRUE, na.strings=c(""," ", "NA", "#DIV/0!"))
dim(flow_prev)
## Check for duplicate IDs: IDs present in both the previous and the new data,
## compared separately for 'brain' and 'spleen' (no other Tissue value is checked)
## NOTE(review): if both ID vectors contain NA, intersect() returns NA and %in% below
## matches it, so previous rows with a missing ID in that tissue are dropped as well.
## Confirm this is intended.
dup_ids1 = intersect(flow_prev$ID[flow_prev$Tissue=='brain'], flow_all$ID[flow_all$Tissue=='brain'])
dup_ids2 = intersect(flow_prev$ID[flow_prev$Tissue=='spleen'], flow_all$ID[flow_all$Tissue=='spleen'])
## Overwrite old data with new: locate the previous rows that the new data replaces
idx1 = which(flow_prev$ID %in% dup_ids1 & flow_prev$Tissue=='brain')
idx2 = which(flow_prev$ID %in% dup_ids2 & flow_prev$Tissue=='spleen')
idx_dups = c(idx1, idx2)
print(length(idx_dups))
## seq_len() rather than 1:nrow() so that an empty flow_prev yields no row indices
## (1:0 would give c(1, 0)); setdiff() also copes with an empty idx_dups
idx_to_keep = setdiff(seq_len(nrow(flow_prev)), idx_dups)
flow_prev = flow_prev[idx_to_keep,]
## Stack the remaining previous rows on top of the new rows, both in flow_cn column order
flow_all = rbind(flow_prev[, flow_cn], flow_all[, flow_cn])
## Check the dimensions of the dataframe
dim(flow_all)
## Coerce the data columns (positions 11 through 277) to numeric in one pass
## NOTE(review): the range is hard-coded; columns beyond position 277 (for example
## unexpected columns appended to flow_cn) are left as they are. Confirm the range.
data_cols = 11:277
flow_all[data_cols] = lapply(flow_all[data_cols], as.numeric)
## Calculate cell counts and ratios
## The steps run in the listed order; each one receives the current dataframe
## and its return value becomes the input of the next step.
calc_steps = list(
  calc_treg_counts,
  calc_tcell_counts,
  calc_ics_counts,
  calc_ics_percent_ratios,
  calc_ics_count_ratios,
  clean_inf_nan
)
## flow_all itself is left untouched; the results accumulate in flow_full
flow_full = flow_all
for (calc_step in calc_steps) {
  flow_full = calc_step(flow_full)
}
dim(flow_full)
## Save the data file (tab-delimited with a header row, quoted strings, no row names,
## and NA written as an empty field). Note that the outputs are written into flow_dir,
## the same directory the input spreadsheets were read from.
write.table(flow_all, file=file.path(flow_dir, 'Lund_Flow_12-May-2016_final.txt'),
            col.names=TRUE, row.names=FALSE, quote=TRUE, sep='\t', na='')
## Save the full data file (same format, including the calculated columns)
write.table(flow_full, file=file.path(flow_dir, 'Lund_Flow_Full_12-May-2016_final.txt'),
            col.names=TRUE, row.names=FALSE, quote=TRUE, sep='\t', na='')
## Also keep flow_full as an R data file so it can be restored with load()
save(flow_full, file=file.path(flow_dir, 'lund_flow_full_12-May-2016_final.rda'))