-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_DE.R
179 lines (141 loc) · 5.98 KB
/
run_DE.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
#!/usr/bin/env Rscript
# 1/function to perform t-test and get p-values for normalize expresion dataset
# 2/function for partional genes overlap across samples
# 3/function to identify outliers (diferentialy expresse gene) in datasets without distinguishable biological groups
######### check and install dependencies ##########
libraries <- c("data.table", "optparse")
for(x in libraries){
status <- x %in% rownames(installed.packages())
if(status == TRUE){
library(x, character.only=TRUE)
} else {
install.packages(x)
library(x, character.only=TRUE)
}
}
######## arguments #########
option_list <- list(
make_option(c("-f", "--file"), type="character", default=NULL, help="expression table name",
metavar = "character"),
make_option(c("-o", "--output_file"), type="character", default="dif_expr_genes.txt",
help="output file name [default = %default]", metavar="character"),
make_option(c("-p", "--p_value"), type="numeric", default="0.05", help="p-value [default = %default]",
metavar = "numeric"),
make_option(c("-t", "--tolerance_ratio"), type="numeric", default="1",
help="tolerance ration [default = %default]; values: 0-1", metavar = "numeric")
)
opt_parser <- OptionParser(option_list=option_list);
opt <- parse_args(opt_parser)
out_dir <- dirname(opt$output_file)
if (is.null(opt$file)){
print_help(opt_parser)
stop("At least one argument must be supplied (--file -> expression table name)", call.=FALSE)
}
################ Load functions #####################
# function 1 (getPvalues)
getPvalues<-function(x){
# only values without row names
expr_matrix <- x[,c(2:ncol(x))]
# count p-values (t-test) for expression value for particular gene in particular sample against rest of samples
p_values<-lapply(1:nrow(expr_matrix), function(ex){
pval<-lapply(c(1:ncol(expr_matrix)), function(zet){
j <- c(1:ncol(expr_matrix))
remove <- c(zet)
#print(zet)
k <-as.integer(paste(j [! j %in% remove], sep=""))
#print(k)
t_test <-t.test(expr_matrix[ex,c(k)], mu= expr_matrix[ex,zet])
return(t_test$p.value)
#print(t_test$p.value, header=FALSE)
})
return(pval)
})
col_names<-colnames(expr_matrix)
tmp <- data.frame(matrix(unlist(p_values), nrow=nrow(x), byrow=T))
#asi ne nutny krok
p_valuesDF <- tmp[,c(1:length(col_names))]
names(p_valuesDF) <- col_names
return(p_valuesDF)
}
#function 2 (part_overlap)
part_overlap <- function(x, num_s = y){
list_len <- length(x)
tab_it <- table(unlist(x))
tab_it_perc <- tab_it / list_len
names(tab_it_perc[tab_it_perc >= num_s])
}
# function 3 (recognizeOutLiers)
recognizeOutLiers <- function(x,y,z,p){
expr_matrix <- x[,c(2:ncol(x))]
sampleID<-colnames(x[,-1])
dif_gene_down <- list()
dif_gene_up <- list()
for(i in c(1:ncol(expr_matrix))){
j <- c(1:ncol(expr_matrix))
remove <- c(i)
k <-paste(j [! j %in% remove], sep=" ")
print(sampleID[i])
print(paste("p_value",z, sep=" "))
outliers_list_up <- list()
outliers_list_down <- list()
for(j in k){
#print(i)
#print(j)
#set variables
j<-as.integer(j)
e<- as.character(paste(sampleID[i])) #y
f<- as.character(paste(sampleID[j])) #x
nazev3 <- paste("",e,"_",f,"_outliers_up", sep="")
nazev4 <- paste("",e,"_",f,"_outliers_down", sep="")
#statistics
fit <- lm(x[,c(paste(e))] ~ x[,c(paste(f))], x)
preds1 <- predict(fit, interval = 'confidence', level=0.99)
df <- data.frame(outliers=cbind(preds1[,2] <= x[,(e)] & preds1[,3] >= x[,(e)]), genes_symbol=x$Gene_symbol,
interval_lwr = preds1[,2], interval_upr=preds1[,3], y_value=x[,(e)],
ttest_pvalue=y[,(e)], UP_regulation=cbind(preds1[,3] < x[,(e)]))
df_up <- df[df[,5]> 0 & df[,1] == "FALSE" & df[,6] < z & df[,7] == "TRUE",]
df_down <- df[df[,5]> 0 & df[,1] == "FALSE" & df[,6] < z & df[,7] == "FALSE",]
outliers_list_up[[nazev3]]<-as.vector(df_up$genes_symbol)
outliers_list_down[[nazev4]]<-as.vector(df_down$genes_symbol)
}
# merge genes to get genes from all pair combinnations
dif_gene_up[[paste(sampleID[i])]] <- part_overlap(outliers_list_up, as.numeric(opt$tolerance_ratio))
dif_gene_down[[paste(sampleID[i])]] <- part_overlap(outliers_list_down, as.numeric(opt$tolerance_ratio))
#save outputs (mozna zakomentovat!!!)
lapply(dif_gene_up, function(r){
write.table(r, file=paste0(out_dir, "/", sampleID[i],"_upregulated_genes.txt"), sep="\t", quote = F, row.names = F)
})
lapply(dif_gene_down, function(q){
write.table(q, file=paste0(out_dir, "/", sampleID[i],"_downregulated_genes.txt"), sep="\t", quote = F, row.names = F)
})
}
tab_up <- rbindlist(lapply(dif_gene_up, function(s){
a<-parent.frame()$i[]
data.table(gene=s, sample= sampleID[a], reg = "up" )
}))
tab_down <- rbindlist(lapply(dif_gene_down, function(q){
b<-parent.frame()$i[]
data.table(gene=q, sample= sampleID[b], reg = "down" )
}))
final_table_tmp<-rbind(tab_up, tab_down)
final_table_tmp<-final_table_tmp[!duplicated(final_table_tmp$gene),]
# write final results
final_table<-data.table::dcast.data.table(final_table_tmp, formula=gene~sample, value.var = "reg", fill = "0" )
write.table(final_table, file=p, sep="\t", quote = F, row.names = F)
}
######## Run pipeline ##########
print("Loading expression table..")
eset<-read.table(opt$file, header=TRUE)
dimen<-dim(eset)
print(paste("Number of genes:", dimen[1], sep=" "))
print(paste("Number of samples:", dimen[2]-1, sep=" "))
print(paste("P-value:", opt$p_value, sep=" "))
print(paste("Tolerance ratio:", opt$tolerance_ratio, sep=" "))
#set directory
cd <- getwd()
setwd(cd)
#execute functions
print("Computing p-values (t-test)..")
p_valuesDF<-getPvalues(eset)
print("Outliers recognition in..")
recognizeOutLiers(eset, p_valuesDF, opt$p_value, opt$output_file)