-
Notifications
You must be signed in to change notification settings - Fork 0
/
Verhaak.R
148 lines (117 loc) · 5.11 KB
/
Verhaak.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
############ This file takes in training and testing data for Glioblastoma #################
#################################################################################################
rm(list =ls())
################################################
######## Load the Data from the folder #########
load("/home/bit/ashar/ownCloud/DPMM_RESULTS/ONE_VIEW/Verhark/Final/DataVerhaak.RData")
Y.train.prelim <- relev$Y.train.prelim
Y.test.prelim <- relev$Y.test.prelim
signature.vk <- relev$signature
signature.dpmm <- relev$signature.dpmm
pheno.train <- relev$pheno.train
pheno.test <- relev$pheno.test
Y <- t(Y.train.prelim[as.character(signature.dpmm),])
time <- pheno.train[,3]
censoring <- pheno.train[,2]
c.true <- pheno.train[,4]
levels(c.true) <- c(1,2,3,4)
Y.new <- t(Y.test.prelim[as.character(signature.dpmm),])
time.new <- pheno.test[,3]
censoring.new <- pheno.test[,2]
c.true.new <- pheno.test[,4]
levels(c.true.new) <- c(1,2,3,4)
Y <- scale(Y, center = TRUE, scale = TRUE)
obj <- scale(Y, center = TRUE, scale = TRUE)
Y.new <- scale(Y.new, center = attr(obj,"scaled:center"), scale = (attr(obj,"scaled:scale")))
N <- nrow(Y)
D <- ncol(Y)
N.new <- nrow(Y.new)
smod <- Surv(exp(time), censoring)
smod.new <- Surv(exp(time.new), censoring.new)
k =4
F =k
Y.tot <- rbind(Y,Y.new)
pc <- prcomp(Y.tot)
pc.pred <- predict(pc,newdata = Y.tot)
bt <- c(rep(0,N),rep(1,N.new))
p1 <- ggplot(as.data.frame(pc.pred), aes(x=pc.pred[,1], y= pc.pred[,2], colour= as.factor(bt))) + ggtitle(" Batch Effects \n GBM I Cancer Data Set") + geom_point(shape=19) + labs(y = "PC1", x = "PC2", colour = "Classes")
p1
##################### STATE OF THE ART TECHNIQUES #################################
##################### BASIC METHODS + SOME ADVANCED METHODS ########################
source('Comparisonx.R')
Comparisonx()
source('ComparisionFLX.R')
ComparisionFLX()
source('ComparisionPReMiuM.R')
ComparisionPReMiuM()
setwd("~/Dropbox/Code/DPmixturemodel/SBC")
############################# PARAMETERS for GIBB's SAMPLING ####
iter = 100
iter.burnin = 100
iter.thin = 5
Nps = as.integer(iter/ iter.thin)
######################### Initialize the Parameters ##############################
source('initialize.R')
initialize()
###################### Start with a good configuration ###########################
#source('startSBC.R')
#startSBC()
########### Train the Model #########################################
source('burninDPMM.R')
burninDPMM()
source('gibbsDPMM.R')
gibbsDPMM()
########## Analyze the fit ##########################################
### Good feature selection from heatmap plus cindex plus randindex
source('MCMCanalyze.R')
MCMCanalyze()
###############################
### Some plots and analysis ####
#################################
#### Generating some plots and results ON Training data ###
logrank <- survdiff(smod ~ c.final)
1 - pchisq(unlist(logrank)$chisq,df =3)
surv.fit <- survfit(smod ~ c.final)
p5 <- ggsurv(surv.fit, main = " DPMM \n Kaplan Meier Estimators \n Verhaak Cancer Data Set") + ggplot2::guides(linetype = FALSE) + ggplot2::scale_colour_discrete(name = 'Classes',breaks = c(1,2),labels = c('1', '2'))
############ Generating some Plots ##########################
pc <- prcomp(Y)
pc.pred <- predict(pc,newdata = Y)
p1 <- ggplot(as.data.frame(pc.pred), aes(x=pc.pred[,1], y= pc.pred[,2], colour= as.factor(c.final))) + ggtitle(" SBC Clustering \n VerhaakCancer Data Set") + geom_point(shape=19) + labs(y = "PC1", x = "PC2", colour = "Classes")
# ######## Predict on New Data Set BASED ON JUST THE MOLECULAR DATA #####################################
# source('predictCLASS.R')
# predictCLASS(Y.new)
# ## Check the predicted Rand Index
#
# #### Choose that configuration which has the highest difference in survival curves ####################
# lr <- c(0)
# for (j in 1:Nps){
# lr[j] <- 1 - pchisq(unlist(survdiff(smod.new ~ c.matrix.new[,j]))$chisq,df =2)
# }
# c.sbc.new <- c.matrix.new[,which.min(lr)]
#
# ## If we fit cluster-specific models
# pre.sbc <- c(0)
# for ( q in 1:F){
# ind <- which(c.sbc == q)
# ind.new <- which(c.sbc.new == q)
#
# time.tmp <- time[ind]
#
# Y.tmp <- Y[ind,]
# Y.tmp.new <- Y.new[ind.new,]
#
# reg <- cv.glmnet(x = Y.tmp, y = time.tmp, family = "gaussian")
# pre.sbc[ind.new] <- predict(object = reg, newx = Y.tmp.new, s = "lambda.min")
# }
# predCIndex.sbc.aft <<- as.numeric(survConcordance(smod.new ~ exp(-pre.sbc))[1])
#
#### Use the adhoc-prediction algorithm ####################
source('predictADHOCverhaak.R')
surv.fit <- survfit(smod.new ~ c.sbc.new)
p2 <- ggsurv(surv.fit, main = " DPMM \n Kaplan Meier Estimators \n Verhaak Cancer Test Data Set") + ggplot2::guides(linetype = FALSE) + ggplot2::scale_colour_discrete(name = 'Classes',breaks = c(1,2,3,4),labels = c('1', '2','3','4'))
############ Generating some Plots ##########################
pc <- prcomp(Y.new)
pc.pred <- predict(pc,newdata = Y.new)
p3 <- ggplot(as.data.frame(pc.pred), aes(x=pc.pred[,1], y= pc.pred[,2], colour= as.factor(c.sbc.new))) + ggtitle(" SBC Clustering \n Verhaak Cancer Test Data Set") + geom_point(shape=19) + labs(y = "PC1", x = "PC2", colour = "Classes")
source('predictTIME.R')
predictchineseAFTtime(Y.new)