-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathrawDataProcessing.R
137 lines (112 loc) · 5.06 KB
/
rawDataProcessing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
library(Biobase)
### Loading count data
countData <- read.table("Data/OSMNP_unnormalized_genes_count_10_3_18.count_table", sep = "\t", header = T)
dim(countData)
View(head(countData))
which(duplicated(countData$gene_symbol)==T) # no duplicated rows
rownames(countData) <- countData$gene_symbol
countData <- countData[,-1]
dim(countData) # 52997 414
### Loading meta data
anno <- read.csv("Data/Official_OSMNP_metadata_QC_passes6.19.18.csv",
header = TRUE, stringsAsFactors = FALSE)
dim(anno) # 417 20
View(anno)
# fixing typos, etc
anno$Organ[anno$Organ %in% c("Peritoenal Cavity", "Peritoneal")] <- "Peritoneal Cavity"
anno$Organ[which(anno$Organ == "yolk sac")] <- "Yolk sac"
anno$Organ[which(anno$Organ == "Epithelial Cell")] <- "Epithelium"
anno$Organ[which(anno$Organ == "Lymph Node (Mesenteric)")] <- "Mesenteric LN"
anno$Organ[which(anno$Organ == "Mesenteric Fat")] <- "Mesenteric fat"
rownames(anno) <- anno$ImmGen.Nomenclature
View(head(anno))
### Filtering samples in anno table
anno <- anno[-which(!rownames(anno) %in% colnames(countData)), ]
anno <- anno[-which(rownames(anno) %in% c("MF.FC.1", "MF.FC.3")), ] # contamination with fibroblasts
View(anno)
PC_to_keep <- c("MF.226pIIp.PC.1",
"MF.226pIIp.PC.3",
"MF.226pIIp.PC.4",
"MF.226pIIp480lo.PC.1",
"MF.226pIIp480lo.PC.2", # SPMs from Ki Wook, Randolph Lab
"MF.PC.01", # Jigar Desai from Lionakis lab, ICAM2+F4/80+
"MF.PC.03", # Ivan Dzhagalov from Dzhagalov lab
"MF.PC.07", # Geetika Bajpai from Lavine lab
"MF.PC.09") # Karine Crozat from Dalod lab
anno <- anno[-setdiff(
which(anno$Organ == "Peritoneal Cavity"),
which(anno$ImmGen.Nomenclature %in% PC_to_keep)), ] # dowsampling MF.PC
View(anno)
tissue1 <- dplyr::as_tibble(readr::read_csv(
"Data/Official_OSMNP_master_table6.19.18.csv"))
View(tissue1)
# (var1) Keeping all samples
anno$treatment <- tissue1$Organism.Treatment[match(anno$SampleName, tissue1$Sample.me)]
# tissue1$Cell.Treatment
# tissue1$Genetic.Variation
# (var2) Removing treated samples
anno <- anno[-grep("pIC.alv", anno$ImmGen.Nomenclature), ]
treated <- tissue1$Sample.me[which(tissue1$Organism.Treatment != "none")]
anno <- anno[-which(anno$SampleName %in% treated), ]
# (var3) Removing treated and embryo samples
anno <- anno[-which(anno$Organ %in% c("Yolk sac", "Embryonic body")), ]
anno <- anno[-grep("E[1468]{2}|Neo|pIC.alv", anno$ImmGen.Nomenclature), ]
treated <- tissue1$Sample.me[which(tissue1$Organism.Treatment != "none")]
anno <- anno[-which(anno$SampleName %in% treated), ]
# Test filtering of MG (not used)
# MG_to_keep <- c("MF.45lo.CNS.1",
# "MF.microglia.cerebel.CNS.1",
# "MF.microglia.cerebr.CNS.1",
# "MG.cortex.1",
# "MG.hippo.1",
# "MF.microglia.CNS.1")
# anno <- anno[-setdiff(
# which(anno$Organ == "Brain"),
# which(anno$ImmGen.Nomenclature %in% MG_to_keep)), ]
dim(anno) # 204 20 (for General), 243 20 (for GAM-clustering), # 337 20 (for Metab)
View(anno)
anno$treatment[anno$ImmGen.Nomenclature == "MF.pIC.alv.siglecFp.Lu.2"] <- "pIC"
anno$Cell.Family[anno$Organ == "Brain"] <- "Microglia"
anno$Sorting.Markers <- gsub("\t", "", anno$Sorting.Markers)
anno$Sorting.Markers <- gsub("\\s*", "", anno$Sorting.Markers)
anno$treatment[is.na(anno$treatment)] <- "none"
rUtils::write.tsv(anno, file = "Data/337_anno.tsv")
### Filter and reorder count table
countData <- countData[, rownames(anno)]
dim(countData)
### Filter rarely expressed genes
eObj <- edgeR::DGEList(counts = countData)
keep <- rowSums(edgeR::cpm(eObj) > 1) >= 3
eObj <- eObj[keep,]
dim(eObj)
eObj$samples$lib.size = colSums(eObj$counts) # reset lib sizes
### Normalisation and log
ytmm <- edgeR::calcNormFactors(eObj, method = "TMM")
limma::plotMDS(ytmm, col=anno$Batch, pch=16) # metric - log2 FC-s between samples
# .
# TS <- paste(anno$Cell.Family, anno$Organ, anno$Lab, anno$Batch, sep=".")
# TS <- paste(anno$Cell.Family, anno$Organ, anno$Lab, sep=".")
# TS <- factor(TS, levels=unique(TS))
# design <- model.matrix(~0+TS); design
# colnames(design) <- levels(TS)
# v <- limma::voom(ytmm, design, plot = TRUE)
# .
v <- limma::voom(ytmm, plot = TRUE)
expTable <- v$E
dim(expTable)
es <- ExpressionSet(assayData = expTable,
phenoData = new("AnnotatedDataFrame",
data = anno))
fData(es)$gene <- rownames(expTable)
### Filtering genes
fData(es)$entrez <- AnnotationDbi::mapIds(org.Mm.eg.db::org.Mm.eg.db,
keytype = "SYMBOL",
column = "ENTREZID",
keys=as.character(fData(es)$gene))
unique(fData(es)$entrez[which(duplicated(fData(es)$entrez))]) # NA
length(which(is.na(fData(es)$entrez))) #
save(es, file = "Data/337_es.total.Rda")
es <- es[-which(is.na(fData(es)$entrez)), ]
es.top12k <- es[head(order(apply(exprs(es), 1, mean), decreasing = T), 12000), ]
save(es.top12k, file = "Data/337_es.top12k.Rda")
# save(es.top12k, file = "Data/243_es.top12k.Rda")