forked from heyuan7676/COVID-19
-
Notifications
You must be signed in to change notification settings - Fork 0
/
SVA_compute_SV_parallel.R
72 lines (59 loc) · 2.91 KB
/
SVA_compute_SV_parallel.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
library(sva)
source("utils.R")
dir.create(file.path(datadir, "SVs/"), showWarnings = FALSE)
estimate_SVs <- function(tissue){
# sample covariates
sample_in_the_tissue = samples %>% filter(SMTSD == tissue)
sample_in_the_tissue = as.data.frame(sample_in_the_tissue)
rownames(sample_in_the_tissue) = sample_in_the_tissue$SAMPID
tis = gsub(" ", "_", gsub('\\)', '', gsub(' \\(', '_', gsub(' - ', '_', tissue))))
print(paste0('Tissue: ', tis))
# gene TPM
gene_tpm_in_the_tissue = tryCatch(read.table(paste0(datadir, 'tissue_tpm/', tis, '_gene_TPM.txt'),
sep = '\t', header = T, stringsAsFactors = F, row.names = 1),
warning = function (w) {print(paste("No data available for tissue type", tis))}, error = function(f) {return("failed")})
if(inherits(gene_tpm_in_the_tissue, "character")){
print(paste(" ", "Skipping tissue", tis))
return()
}
gene_tpm_in_the_tissue = gene_tpm_in_the_tissue[apply(gene_tpm_in_the_tissue, 1, function(x) sum(x>0.1)) > 0.2 * ncol(gene_tpm_in_the_tissue),]
gene_tpm_in_the_tissue = log10(gene_tpm_in_the_tissue + 1)
sample_in_the_tissue = sample_in_the_tissue[gsub("\\.", "-", colnames(gene_tpm_in_the_tissue)), ]
## 1). keep AGE_GROUP when estimating SVs
mod = model.matrix(~AGE_GROUP,data=sample_in_the_tissue)
mod0 = model.matrix(~1, data=sample_in_the_tissue)
n.sv = num.sv(as.matrix(gene_tpm_in_the_tissue), mod, method = 'be')
sva1 = sva(as.matrix(gene_tpm_in_the_tissue),mod,mod0,n.sv=n.sv)
print(paste0('Keep AGE, there are ', n.sv, ' SVs removed'))
## 2). combine AGE_GROUP and SVs
cov = data.frame(sva1$sv)
colnames(cov) = paste0('SV', seq(1, ncol(cov)))
cov$AGE_GROUP = sample_in_the_tissue$AGE_GROUP
write.table(cov, paste0(datadir, 'SVs/', tis, '_SVs_AGE.txt'), sep = '\t', row.names = F)
### Estimate SVs preserving SEX
# if only one sex, skip this step
# if the tissue has less than 10 samples in either gender group, skip this step
if(sum(table(sample_in_the_tissue$SEX) < 10) > 0){
return
}else if(length(table(sample_in_the_tissue$SEX)) == 1){
return
}else{
## keep SEX when estimating SVs
mod = model.matrix(~SEX,data=sample_in_the_tissue)
mod0 = model.matrix(~1, data=sample_in_the_tissue)
n.sv = num.sv(as.matrix(gene_tpm_in_the_tissue), mod, method = 'be')
sva1 = sva(as.matrix(gene_tpm_in_the_tissue),mod,mod0,n.sv=n.sv)
print(paste0('Keep SEX, there are ', n.sv, ' SVs removed'))
## combine SEX and SVs
cov = data.frame(sva1$sv)
colnames(cov) = paste0('SV', seq(1, ncol(cov)))
cov$SEX = sample_in_the_tissue$SEX
write.table(cov, paste0(datadir, 'SVs/', tis, '_SVs_SEX.txt'), sep = '\t', row.names = F)
}
}
## Among these, samples were selected based on donor genotype
## availability and a threshold of at least 70 samples per tissue
args <- commandArgs(TRUE)
tissue_idx=as.numeric(args[1])
tissue = sort(unique(samples$SMTSD))[tissue_idx]
estimate_SVs(tissue)