-
Notifications
You must be signed in to change notification settings - Fork 27
/
Copy path.Rhistory
422 lines (422 loc) · 15.3 KB
/
.Rhistory
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
setwd("~/Desktop/Personal Documents/Jax Lab/R/kidneyDataWork/project1")
sig<-readRDS("sig_noLD.Rdata")
head(sig)
length(which(sig$q_value<0.05))
sig_noLD<-sig
sig_noLD<- na.omit(sig_noLD)
sig_noLD<- sig_noLD[which(duplicated(sig_noLD[,2:3])==FALSE),]
which(duplicated(sig_noLD[,2:3])==FALSE)
sig$class<-0
bin.vector <- function(vectorX, bins = seq(0,1,0.5)){
dist.mat <- apply(matrix(bins, ncol = 1), 1, function(x) x - vectorX)
binned.vector <- apply(dist.mat, 1, function(x) bins[which(abs(x) == min(abs(x)))[1]])
return(binned.vector)
}
#######
get.interaction.error <- function(x, y, trace, error.type = c("sd", "se")){
if(length(grep("e", error.type)) > 0){
error.type = "se"
}else{
error.type = "sd"
}
x.levels <- levels(as.factor(x))
y.levels <- levels(as.factor(y))
mean.mat <- matrix(NA, ncol = length(x.levels), nrow = length(y.levels))
error.mat <- matrix(NA, ncol = length(x.levels), nrow = length(y.levels))
colnames(mean.mat) <- colnames(error.mat) <- x.levels
rownames(mean.mat) <- rownames(error.mat) <- y.levels
for(i in 1:length(x.levels)){
for(j in 1:length(y.levels)){
group <- intersect(which(x == x.levels[i]), which(y == y.levels[j]))
if(length(group) > 0){
mean.mat[j,i] <- mean(trace[group], na.rm = TRUE)
if(error.type == "se"){
error.mat[j,i] <- sd(trace[group], na.rm = TRUE)/sqrt(length(trace[group][!is.na(trace[group])]))
}
if(error.type == "sd"){
error.mat[j,i] <- sd(trace[group], na.rm = TRUE)
}
}
}
}
results <- list(mean.mat, error.mat)
names(results) <- c("means", error.type)
return(results)
}
###################
get.means2 <- function(data.obj, marker1, marker2){
error.type = "se"
markers <- colnames(data.obj$geno)[match(c(marker1, marker2), data.obj$marker.names)]
marker.names <- c(marker1, marker2)
all.pheno.mat <- data.obj$pheno
marker.mat <- data.obj$geno[,markers]
geno.bins <- sort(unique(marker.mat[which(!is.na(marker.mat))]))
if(length(geno.bins) > 3){
geno.bins <- c(0,0.5, 1)
}
marker1.geno <- data.obj$geno[,markers[1]]; if(!is.null(geno.bins)){marker1.geno <- bin.vector(marker1.geno, geno.bins)}
marker2.geno <- data.obj$geno[,markers[2]]; if(!is.null(geno.bins)){marker2.geno <- bin.vector(marker2.geno, geno.bins)}
marker.geno <- cbind(marker1.geno, marker2.geno)
colnames(marker.geno) <- marker.names
means <- array(NA, dim = c(length(geno.bins), length(geno.bins), 1))
errors <- array(NA, dim = c(length(geno.bins), length(geno.bins), 1))
dimnames(means)[[1]] <- dimnames(errors)[[1]] <- dimnames(means)[[2]] <- dimnames(errors)[[2]] <- geno.bins
dimnames(means)[[3]] <- dimnames(errors)[[3]] <- colnames(all.pheno.mat)[1]
pair.means <- get.interaction.error(marker1.geno, marker2.geno, all.pheno.mat[,i], error.type = error.type)
means[rownames(pair.means$means),colnames(pair.means$means),1] <- pair.means$means <- pair.means$means
#results=list(means)
#return(results)}
results <- list(means)
#results1=list(errors)
# names(results1)=c("std.error")
# results1=melt(results1)
# results1=results1[order(results1[,1]),]
# results1=t(as.matrix(results1$value))
# results1.heterozygous=cbind(results1[,1],results1[,2],results1[,4],results1[,5])
#return(results1.herterozygous)
names(results) <- c("genotype.means")
results=melt(results)
results=results[order(results[,1]),]
results=t(as.matrix(results$value))
#results.heterozygous=cbind(results[,1],results[,2],results[,3],results[,4])
results.heterozygous=cbind(results[,9],results[,3],results[,7],results[,1])#for allel 1
#return(results.heterozygous)
#return(results)
return(results.heterozygous)
# return(all.pheno.mat)
}
get.errors2 <- function(data.obj, marker1, marker2){
error.type = "se"
markers <- colnames(data.obj$geno)[match(c(marker1, marker2), data.obj$marker.names)]
marker.names <- c(marker1, marker2)
all.pheno.mat <- data.obj$pheno
marker.mat <- data.obj$geno[,markers]
geno.bins <- sort(unique(marker.mat[which(!is.na(marker.mat))]))
if(length(geno.bins) > 3){
geno.bins <- c(0, 0.5, 1)
}
marker1.geno <- data.obj$geno[,markers[1]]; if(!is.null(geno.bins)){marker1.geno <- bin.vector(marker1.geno, geno.bins)}
marker2.geno <- data.obj$geno[,markers[2]]; if(!is.null(geno.bins)){marker2.geno <- bin.vector(marker2.geno, geno.bins)}
marker.geno <- cbind(marker1.geno, marker2.geno)
colnames(marker.geno) <- marker.names
means <- array(NA, dim = c(length(geno.bins), length(geno.bins), 1))
errors <- array(NA, dim = c(length(geno.bins), length(geno.bins), 1))
dimnames(means)[[1]] <- dimnames(errors)[[1]] <- dimnames(means)[[2]] <- dimnames(errors)[[2]] <- geno.bins
dimnames(means)[[3]] <- dimnames(errors)[[3]] <- colnames(all.pheno.mat)[1]
pair.means <- get.interaction.error(marker1.geno, marker2.geno, all.pheno.mat[,i], error.type = error.type)
errors[rownames(pair.means$means),colnames(pair.means$means),1] <- pair.means$se
#means[rownames(pair.means$means),colnames(pair.means$means),1] <- pair.means$means
#return(results1)}
results=list(errors)
names(results)=c("std.error")
results=melt(results)
results=results[order(results[,1]),]
results=t(as.matrix(results$value))
results.heterozygous=cbind(results[,9],results[,3],results[,7],results[,1])
#results.heterozygous=cbind(results[,1],results[,2],results[,3],results[,4],results[,7],results[,5],results[,8],results[,6],results[,9])
return(results.heterozygous)
# return(all.pheno.mat)
}
means<-as.data.frame(matrix(nrow=132, ncol=4))
for(i in 1:nrow(final)){
means[i,]<- get.means2(cross, final[i,2], final[i,3])
}
means<-as.data.frame(matrix(nrow=132, ncol=4))
for(i in 1:nrow(sig)){
means[i,]<- get.means2(cross, sig[i,2], sig[i,3])
}
cross<-readRDS("cross_effectPlot.Rdata")
install.packages("reshape")
library('reshape')
expr<-readRDS("expres_data.Rdata")
geno= readRDS("geno.Rdata")
expr<-t(expr)
geno<-t(geno)
cross$pheno<-expr
#enter geno data
cross$geno<-geno
means<-as.data.frame(matrix(nrow=132, ncol=4))
for(i in 1:nrow(sig)){
means[i,]<- get.means2(cross, sig[i,2], sig[i,3])
}
colnames(means)<-c("mean1", "mean2", "mean3", "mean4")
errors<-as.data.frame(matrix(nrow=132, ncol=4))
for(i in 1:nrow(final)){
errors[i,]<- get.errors2(cross, final[i,2], final[i,3])
}
#set colnames
colnames(errors)<-c("error1", "error2", "error3", "error4")
errors<-as.data.frame(matrix(nrow=132, ncol=4))
for(i in 1:nrow(sig)){
errors[i,]<- get.errors2(cross, sig[i,2], sig[i,3])
}
colnames(errors)<-c("error1", "error2", "error3", "error4")
eff<- cbind(sig, means)
eff<-cbind(eff, errors)
eff<-na.omit(eff)
final<-readRDS("final_2.Rdata")
final<-readRDS("final_means.Rdata")
final<-readRDS("effect_Data.Rdata")
final<-readRDS("effect_data.Rdata")
head(final)
final$class<-0
#loops to test and classify every remaining significant pair
#apologize if code is confusing, will try to make more clear with comments
for(i in 1:nrow(final)){
#saving values for each mean plus the error bar (p) and minus the error bar (m)
#so that for each mean we have to values to finalnify the possible range
#col. 13 has been subtracted from all to normalize values to the neutral genotype
p1=(final[i,10]+final[i,14])- final[i,13]
m1= (final[i,10]-final[i,14])- final[i,13]
p2 <-(final[i,11]+final[i,15])- final[i,13]
m2=(final[i,11]-final[i,15])- final[i,13]
p3 <-(final[i,12]+final[i,16])- final[i,13]
m3=(final[i,12]-final[i,16])- final[i,13]
#tests to see if masking is occuring
#tests for any overlap between ranges of means
t1= (p2>m1&&p2<p1)
t2= (m2>m1&&m2<p1)
t3= (m1>m2&&m1<p2)
t4= (p1>m2&&p1<p2)
t5= (p3>m1&&p3<p1)
t6= (m3>m1&&m3<p1)
t7= (m1>m3&&m1<p3)
t8= (p1>m3&&p1<p3)
if(t1 || t2 || t3 || t4 || t5 || t6 ||t7||t8){
final$class[i]<- "masking"
}
else if(m1 >(p2+p3) || p1<(m2+m3)){
final$class[i]<-"enhancement"
}
else{
final$class[i]<-"suppression"
}
}
head(final)
unique(final$class)
length(which(final$class=="masking"))
length(which(final$class=="suppression"))
length(which(final$class=="enhancement"))
saveRDS(final, "final_classified.Rdata")
setwd("~/Desktop/Personal Documents/Jax Lab/R/DO_mice")
geno<-readRDS("probs.Rdata")
expr<-readRDS("expr_tmp_20360.Rdata")
covar<- readRDS("covariates.Rdata")
cis_trans<-readRDS("cis_trans_clean.Rdata")
pair=cis_trans[1,]
pair
m1 = geno[, strsplit(pair[[2]], split="-")[[1]][2], strsplit(pair[[2]], split="-")[[1]][1]]
#trans ^
m2 = geno[, strsplit(pair[[3]], split="-")[[1]][2], strsplit(pair[[3]], split="-")[[1]][1]]
#cis ^
fit <- lm(expr[,pair[[1]]] ~ m1 + m2 + m1:m2 +covar, na.action=na.omit)
summ=summary(fit)
coefs <- summ$coefficients
coefs
for(i in 1:nrow(cis_trans)){
if(i%%100==0){cat(i, "\n")}
chr[i]<- strsplit(cis_trans[,3], split="_")[[i]][1]
#marker[i]<- strsplit(strsplit(cis_trans[,3], split="_")[[i]][2], split="-")[[1]][1]
}
chr<-rep('a',5)
marker<-rep('a', 5)
for(i in 1:nrow(cis_trans)){
if(i%%100==0){cat(i, "\n")}
chr[i]<- strsplit(cis_trans[,3], split="_")[[i]][1]
#marker[i]<- strsplit(strsplit(cis_trans[,3], split="_")[[i]][2], split="-")[[1]][1]
}
head(cis_trans)
which(chr!="X")
setwd("~/Desktop/Personal Documents/Jax Lab/R/kidneyDataWork/project1")
setwd("~/Desktop/Personal Documents/Jax Lab/R/DO_mice")
sig<-readRDS("sig.Rdata")
head(sig)
setwd("~/Desktop/Personal Documents/Jax Lab/R/kidneyDataWork/project1")
cis_trans<-readRDS("cis_trans_ready.Rdata")
head(cis_trans)
pairwise<-readRDS("pairwise.Rdata")
pairwise<-readRDS("pairwise_results.Rdata")
head(pairwise)
pairwise[,1]
#take real data matrix and transpose for easier manipulation
pairwise<-t(pairwise)
#convert matrix to data.frame
pairwise<- as.data.frame(pairwise)
#change numbers from char to num
pairwise[,6]<- as.numeric(as.character(pairwise[,6]))
pairwise[,5]<- as.numeric(as.character(pairwise[,5]))
pairwise[,4]<- as.numeric(as.character(pairwise[,4]))
pairwise[,7]<- as.numeric(as.character(pairwise[,7]))
pairwise[,1]<- as.character(pairwise[,1])
pairwise[,2]<- as.character(pairwise[,2])
pairwise[,3]<- as.character(pairwise[,3])
head(pairwise)
perm<-readRDS("permutation.data.Rdata")
head(perm)
perm[,1]
perm[1,]
perm[[1]]
perm[[1]][,1]
perm[[1]][1,]
perm[[1]][1,]
perm[[1]][1]
perm[[1]][,1]
dim(perm)
perm<-readRDS("permutation.Rdata")
perm<-readRDS("permutationa.Rdata")
perm<-readRDS("permutations.Rdata")
head(perm)
perm[[1]][,1]
setwd("~/Desktop/Personal Documents/Jax Lab/R/DO_mice")
cis<-readRDS("allele_cis.Rdata")
trans<-readRDS("allele_trans.Rdata")
head(cis)
?source
?source
library("dplyr")
data <- read.csv("TechSurvey - Survey.csv",header=T);
setwd("~/programming/R/fdac/Miniproject3")
library(tidyr)
library(useful)
# Read in the data
data <- read.csv("TechSurvey - Survey.csv",header=T);
data[1:5,1:5]
# Convert date to unix second
for (i in c("Start", "End"))
data[,i] = as.numeric(as.POSIXct(strptime(data[,i], "%Y-%m-%d %H:%M:%S")))
for (i in 0:12){
vnam = paste(c("PG",i,"Submit"), collapse="")
data[,vnam] = as.numeric(as.POSIXct(strptime(data[,vnam], "%Y-%m-%d %H:%M:%S")))
}
# Calculate differences in time
for (i in 12:0){
pv = paste(c("PG",i-1,"Submit"), collapse="");
if (i==0)
pv="Start";
vnam = paste(c("PG",i,"Submit"), collapse="");
data[,vnam] = data[,vnam] -data[,pv];
}
# Total time to complete
complete_time= mean(data$End - data$Start, na.rm = TRUE)
# Extract response time columns
question_times<- data[,grep(pattern = "Submit", x= colnames(data),value=TRUE)]
# Get average time for each question
time_means<- apply(question_times[,-c(1)], 2, mean, na.rm=TRUE)
# Longest question
longQ<- names(time_means)[which(time_means==max(time_means))]
# Shortest question
shortQ<- names(time_means)[which(time_means==min(time_means))]
# Extract ranked critera columns
ranked_col<- data[,grep(pattern = "PG5", x= colnames(data),value=TRUE)]
ranked_col<- data[,grep(pattern = "Order", x= colnames(ranked_col),value=TRUE)]
# Mean ranking of each criteria
criteria_means<- apply(ranked_col, 2, mean, na.rm=TRUE)
# Highest ranked criteria
highRank<- names(criteria_means)[which(criteria_means==max(criteria_means))]
# Age distribution
age_freq <- data[,81] %>%
count() %>%
.[-c(1),]
# Plot distribution
ggplot(age_freq, aes(x, freq)) +
geom_col(fill="red") +
xlab("Age Group") +
ylab("Number of participants") +
ggtitle("Demographic Distribution by Age")
library(plyr)
# Total time to complete
complete_time= mean(data$End - data$Start, na.rm = TRUE)
# Extract response time columns
question_times<- data[,grep(pattern = "Submit", x= colnames(data),value=TRUE)]
# Get average time for each question
time_means<- apply(question_times[,-c(1)], 2, mean, na.rm=TRUE)
# Longest question
longQ<- names(time_means)[which(time_means==max(time_means))]
# Shortest question
shortQ<- names(time_means)[which(time_means==min(time_means))]
# Extract ranked critera columns
ranked_col<- data[,grep(pattern = "PG5", x= colnames(data),value=TRUE)]
ranked_col<- data[,grep(pattern = "Order", x= colnames(ranked_col),value=TRUE)]
# Mean ranking of each criteria
criteria_means<- apply(ranked_col, 2, mean, na.rm=TRUE)
# Highest ranked criteria
highRank<- names(criteria_means)[which(criteria_means==max(criteria_means))]
# Age distribution
age_freq <- data[,81] %>%
count() %>%
.[-c(1),]
# Plot distribution
ggplot(age_freq, aes(x, freq)) +
geom_col(fill="red") +
xlab("Age Group") +
ylab("Number of participants") +
ggtitle("Demographic Distribution by Age")
data[1:5,1:15]
# Columns to drop
col_drop= c("PG4Dtr0_6", "PG4Psv7_8", "PG4Prm9_10", "PG5_1RRPQ", "PG5_1Order", "PG5_1Time", "PG5_2BNUI",
"PG5_2Order", "PG5_2Time", "PG5_4VGP","PG5_4Order", "PG5_4Time" , "PG5_5PHR",
"PG5_5Order", "PG5_5Time", "PG5_6SSYOP", "PG5_6Order", "PG5_6Time", "PG5_7NDYP",
"PG5_7Order", "PG5_7Time", "PG5_8CP", "PG5_8Order", "PG5_8Time", "PG5_9FRP",
"PG5_9Order", "PG5_9Time", "PG5_10RPA", "PG5_10Order", "PG5_10Time", "PG5_11NSG",
"PG5_11Order", "PG5_11Time", "PG5_12NWG","PG5_12Order", "PG5_12Time",
"PG5_13NFG", "PG5_13Order", "PG5_13Time")
# Drop these columns
model_data<- data[,-c(which(colnames(data) %in% col_drop))]
# Fix incorrect colname
colnames(model_data)[15]<- "PG3Resp"
# Preserve only PG5 columns needed
model_data<- model_data[,c(1:18, 20, 22:43)]
# Rearrange df so response is far left column
model_data<- model_data[,c(19,1:18,20:ncol(model_data))]
# Remove all samples with NA for response
model_data<- model_data[-c(which(is.na(model_data[,1])==TRUE)),]
model_data[1:5,1:5]
model_data[1:5,]
library(tidyr)
library(useful)
library(plyr)
# Read in the data
data <- read.csv("TechSurvey - Survey.csv",header=T);
# Convert date to unix second
for (i in c("Start", "End"))
data[,i] = as.numeric(as.POSIXct(strptime(data[,i], "%Y-%m-%d %H:%M:%S")))
for (i in 0:12){
vnam = paste(c("PG",i,"Submit"), collapse="")
data[,vnam] = as.numeric(as.POSIXct(strptime(data[,vnam], "%Y-%m-%d %H:%M:%S")))
}
# Calculate differences in time
for (i in 12:0){
pv = paste(c("PG",i-1,"Submit"), collapse="");
if (i==0)
pv="Start";
vnam = paste(c("PG",i,"Submit"), collapse="");
data[,vnam] = data[,vnam] -data[,pv];
}
data[1:5,1:15]
complete_time= mean(data$End - data$Start, na.rm = TRUE)
# Extract response time columns
question_times<- data[,grep(pattern = "Submit", x= colnames(data),value=TRUE)]
question_times
# Get average time for each question
time_means<- apply(question_times[,-c(1)], 2, mean, na.rm=TRUE)
# Longest question
longQ<- names(time_means)[which(time_means==max(time_means))]
time_means
# Extract ranked critera columns
ranked_col<- data[,grep(pattern = "PG5", x= colnames(data),value=TRUE)]
ranked_col<- data[,grep(pattern = "Order", x= colnames(ranked_col),value=TRUE)]
# Mean ranking of each criteria
criteria_means<- apply(ranked_col, 2, mean, na.rm=TRUE)
criteria_means
# Age distribution
age_freq <- data[,81] %>%
count() %>%
.[-c(1),]
# Plot distribution
ggplot(age_freq, aes(x, freq)) +
geom_col(fill="red") +
xlab("Age Group") +
ylab("Number of participants") +
ggtitle("Demographic Distribution by Age")
model_data
model_data[1:10,]