forked from heyuan7676/COVID-19
-
Notifications
You must be signed in to change notification settings - Fork 0
/
SVA_followedby_LR.R
194 lines (161 loc) · 7.38 KB
/
SVA_followedby_LR.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
library(sva)
source("utils.R")
test_association <- function(tissue){
# sample covariates
sample_in_the_tissue = samples %>% filter(SMTSD == tissue)
## remove tissues with less than 70 samples
if(nrow(sample_in_the_tissue) < 70){ return ()}
sample_in_the_tissue = as.data.frame(sample_in_the_tissue)
rownames(sample_in_the_tissue) = sample_in_the_tissue$SAMPID
# gene TPM
tissue_name = gsub(" ", "_", gsub('\\)', '', gsub(' \\(', '_', gsub(' - ', '_', tissue))))
gene_tpm_in_the_tissue = read.table(paste0(datadir, 'tissue_tpm/', tissue_name, '_gene_TPM.txt'),
sep = '\t', header = T, stringsAsFactors = F, row.names = 1)
## make sure that sample orders in sample_in_the_tissue is the same as in gene_tpm_in_the_tissue
sample_in_the_tissue = sample_in_the_tissue[gsub("\\.", "-", colnames(gene_tpm_in_the_tissue)), ]
save_df = NULL
save_df = cbind(save_df, sample_in_the_tissue$SAMPID)
save_df = cbind(save_df, sample_in_the_tissue$AGE)
save_df = cbind(save_df, sample_in_the_tissue$Gender)
save_df_cols = c('SAMPID', 'AGE', 'SEX')
### 1. Test association with AGE_GROUP
## fit linear model
if(Test_gene %in% rownames(gene_tpm_in_the_tissue)){
tpm_matrix_gene_names = rownames(gene_tpm_in_the_tissue)
}else if(Test_gene %in% sapply(rownames(gene_tpm_in_the_tissue), function(x) strsplit(x, '\\.')[[1]][1])){
tpm_matrix_gene_names = sapply(rownames(gene_tpm_in_the_tissue), function(x) strsplit(x, '\\.')[[1]][1])
}else{
tpm_matrix_gene_names = c()
}
if(Test_gene %in% tpm_matrix_gene_names){
print(paste0('Perform LR using SVs for ', tissue))
gene_y = as.matrix(as.numeric(gene_tpm_in_the_tissue[which(tpm_matrix_gene_names == Test_gene), ]))
# AGE
cov = read.table(paste0(datadir, 'SVs/', tissue_name, '_SVs_AGE.txt'), sep = '\t', header = T)
cov = as.matrix(cov)
fitsv = lm(gene_y~cov)
coef = summary(fitsv)$coefficients['covAGE_GROUP', 1]
pvalue = summary(fitsv)$coefficients['covAGE_GROUP', 4]
result_tested_gene = c(coef, pvalue, median(as.numeric(gene_y)))
# save
sv_cols = paste0("SV", seq(1, ncol(cov)-1))
control_model = lm(gene_y~as.matrix(cov)[,sv_cols])
save_df = cbind(save_df, control_model$residuals)
save_df_cols = c(save_df_cols, 'SVA_AGE')
# SEX
cov = tryCatch(read.table(paste0(datadir, 'SVs/',tissue_name,'_SVs_SEX.txt'),
sep='\t', header = T, stringsAsFactors = F),
warning = function (w) {print(paste("No data available for tissue type", tis))},
error = function(f) {return("failed")})
if(inherits(cov, "character")){
print(paste0(" ", "Skipping ", tissue_name, " for sex"))
result_tested_gene = rbind(result_tested_gene, c(0, -1, 0))
}else{
cov = as.matrix(cov)
fitsv = lm(gene_y~cov)
coef = summary(fitsv)$coefficients['covSEX', 1]
pvalue = summary(fitsv)$coefficients['covSEX', 4]
result_tested_gene = rbind(result_tested_gene, c(coef, pvalue, median(as.numeric(gene_y))))
# save
sv_cols = paste0("SV", seq(1, ncol(cov)-1))
control_model = lm(gene_y~as.matrix(cov)[,sv_cols])
save_df = cbind(save_df, control_model$residuals)
save_df_cols = c(save_df_cols, 'SVA_SEX')
}
}else{
print(paste(Test_gene, " has no expression in ", tissue_name))
result_tested_gene = rbind(c(0,-1,0), c(0, -1, 0))
}
save_df = as.data.frame(save_df)
colnames(save_df) = save_df_cols
write.table(save_df, paste0(datadir, 'SVA_corrected/', Test_gene_name, '/', tissue_name, '_SV_removed.txt'),
sep = '\t', quote = FALSE, row.names = F)
result_tested_gene = as.data.frame(result_tested_gene)
colnames(result_tested_gene) = c("coefficient", "p-value", "median_TPM")
result_tested_gene$Variable = c("AGE", "SEX")
result_tested_gene$Gene = Test_gene_name
result_tested_gene$Tissue = tissue
return(result_tested_gene)
}
check_Test_gene_SVA <- function(){
result = data.frame()
for(tissue in sort(unique(samples$SMTSD))){
result_tested_gene = tryCatch(test_association(tissue), warning = function (w) {print(paste("No data available for tissue type", tis))},
error = function(f) {return("failed")})
if(!inherits(result_tested_gene, "character")){
result = rbind(result, result_tested_gene)
}else{
print(paste0("Skipping ", tissue))
}
}
## remove null test
result = result[result$"p-value" > 0, ]
## remove tissues with low gene expression (median TPM < 1)
result$median_TPM = 10^(result$median_TPM) - 1
result = result[result$median_TPM > 1, ]
## compute FDR for each gene seperately
result$FDR = p.adjust(result$"p-value", method = 'BH')
result = result[,c("Tissue", "Gene", "Variable", "median_TPM", "coefficient", "p-value", "FDR")]
result = result[order(result$"p-value"), ]
write.table(result, paste0(outdir, 'Association_tests_', Test_gene_name,'_SVA.csv'), sep=',', row.names=F, quote = FALSE)
return(result)
}
## plot
plot_one_row <- function(rowi){
tissue = rowi['Tissue']
tissue_name = gsub(" ", "_", gsub('\\)', '', gsub(' \\(', '_', gsub(' - ', '_', tissue))))
# read in
corrected_df = try(read.table(paste0(datadir, 'SVA_corrected/', Test_gene_name, '/', tissue_name, '_SV_removed.txt'),
sep = '\t', header = T, stringsAsFactors = F))
if(inherits(corrected_df, "try-error")){
return()
}
y = corrected_df[, paste('SVA', rowi['Variable'], sep = '_')]
x = corrected_df[, as.character(rowi['Variable'])]
variable = as.character(rowi['Variable'])
if(variable == 'AGE'){
x = factor(x, levels = c("20-29", "30-39", "40-49", "50-59",
"60-69", "70-79"))
color_p = 'Greens'
}else{
x = factor(x , levels = c(1,2), labels = c("Male", "Female"))
color_p = 'Set1'
}
ggtitle_text = paste0(tissue,
":\n coef = ", round(rowi$coefficient, 3),
':\n median TPM = ', round(rowi$median_TPM, 3))
df_for_plot = data.frame("Covariate"=x, "y"=y)
if(variable == 'AGE'){
xlabs = paste(levels(df_for_plot$Covariate),"yr\n(N=",table(df_for_plot$Covariate),")",sep="")
}else{
xlabs = paste(levels(df_for_plot$Covariate),"\n(N=",table(df_for_plot$Covariate),")",sep="")
}
gg = ggplot(aes(x = Covariate, y = y), data = df_for_plot) +
geom_boxplot(aes(fill = Covariate)) +
theme_bw() +
scale_x_discrete(labels=xlabs) +
xlab("") +
ylab(paste0("Corrected expression of ", Test_gene_name)) +
ggtitle(ggtitle_text) +
scale_fill_brewer(palette = color_p) +
theme(legend.position = 'none')
png(paste0(outdir, 'plots/', Test_gene_name, '/', Test_gene_name, '_',tissue_name,'_',as.character(rowi['Variable']),'_SVA.png'),
res = 130, height = 500, width = 600)
print(gg)
dev.off()
}
args <- commandArgs(TRUE)
Test_gene = args[1]
Test_gene_name = args[2]
dir.create(file.path(datadir, "SVA_corrected/"), showWarnings = FALSE)
dir.create(file.path(datadir, "SVA_corrected/", Test_gene_name, '/'), showWarnings = FALSE)
dir.create(file.path(outdir, "plots/", Test_gene_name, '/'), showWarnings = FALSE)
reg_result = check_Test_gene_SVA()
# plot
reg_result = read.table(paste0(outdir, 'Association_tests_', Test_gene_name,'_SVA.csv'),
sep=',', header = T, stringsAsFactors = F)
sig = reg_result[reg_result$FDR < 0.05, ]
for(i in seq(1, nrow(sig))){
rowI = sig[i, ]
plot_one_row(rowI)
}