# mycroarray.R
# (Web-page navigation text and line-number gutter from the page capture removed.)
# ---- Package setup ----
# Attach every package the script uses. Bioconductor packages that the
# original script installed on every run are now installed only when they
# are missing, so repeated runs do not trigger a network install.
library(Matrix)
library(lattice)
library(fdrtool)
library(rpart)
library(ggplot2)
# BiocManager::install("affy")
if (!requireNamespace("oligo", quietly = TRUE)) {
  BiocManager::install("oligo")
}
library(oligo)
library(limma)
library(Biobase)
library(Biostrings)
library(genefilter)
# BiocManager::install("annotationTools")
library(annotationTools)
# BiocManager::install("pdInfoBuilder")
library(pdInfoBuilder)
# Platform design package passed to read.celfiles() below.
# See https://support.bioconductor.org/p/55779/
if (!requireNamespace("pd.ht.hg.u133a", quietly = TRUE)) {
  BiocManager::install("pd.ht.hg.u133a")
}
library(pd.ht.hg.u133a)
# library(pd.u133aaofav2)
# tutorial: http://homer.ucsd.edu/homer/basicTutorial/affymetrix.html
# ---- Load raw CEL files and normalise with RMA ----
# Working directory for the project; the relative file names used further
# down ("data.txt", "sample.txt", "GPL4685_noParents.an.txt", ...) resolve
# against it. The path previously started with "~/Users/...", which expands
# to a location inside the home directory; every other path in this script
# uses the absolute "/Users/..." form, so that form is used here as well.
setwd("/Users/gandreoletti/Library/CloudStorage/OneDrive-GraphiteBio/CD110/GSE174060/")

# Folder that contains the CEL files.
celpath <- "/Users/gandreoletti/Library/CloudStorage/OneDrive-GraphiteBio/CD110/GSE174060/GSE174060_RAW"

# Alternative import via the affy package (kept for reference):
# data = ReadAffy(celfile.path=celpath)

# Collect only CEL files (optionally gzipped) so that stray files in the
# folder are not handed to read.celfiles(). The result is no longer stored in
# a variable called `list`, which masked base::list.
cel_files <- list.files(
  celpath,
  pattern = "\\.cel(\\.gz)?$",
  ignore.case = TRUE,
  full.names = TRUE
)
if (length(cel_files) == 0L) {
  stop("No CEL files found in: ", celpath, call. = FALSE)
}

# Import the raw probe-level data using the HT_HG-U133A platform design package.
data <- read.celfiles(cel_files, pkgname = "pd.ht.hg.u133a")
ph <- phenoData(data)

# Normalize the data.
eset <- rma(data)

# Save the expression values for use by other programs
# (data will be log2 transformed and normalized).
write.exprs(eset, file = "data.txt")
# ---- Add gene annotation to the normalised expression output ----
# Strategy: put the expression values and the annotation into data frames and
# join them on the probe identifier.
my_frame <- data.frame(exprs(eset))

# Expression matrix as returned by exprs(eset). Previously named
# `exprSet.nologs`, although the values come straight from rma().
expr_matrix <- exprs(eset)

# Annotation could also come from an annotation package: use packageNameSYMBOL,
# where the caps part names the type of data you're after. To list the
# available fields, call the package name with () at the end, e.g.
# mogene20sttranscriptcluster().
# Annot <- data.frame(ACCNUM=sapply(contents(mogene20sttranscriptclusterACCNUM), paste, collapse=", "), SYMBOL=sapply(contents(mogene20sttranscriptclusterSYMBOL), paste, collapse=", "), DESC=sapply(contents(mogene20sttranscriptclusterGENENAME), paste, collapse=", "))

# NOTE(review): heatmap() clusters every row of the matrix; on a full probe
# set this may be slow and memory hungry -- consider subsetting first.
heatmap(expr_matrix, main = "Normalized ME matrix")

# Probe annotation table; it must provide a "ProbeName" column for the join.
Annot <- read.table(file = "GPL4685_noParents.an.txt", header = TRUE)
head(Annot)

# Full outer join of annotation and expression values (like a database table
# join), matching Annot$ProbeName against the row names of my_frame. The
# result is no longer stored in a variable called `all`, which masked
# base::all.
annotated_expr <- merge(
  Annot,
  my_frame,
  by.x = "ProbeName",
  by.y = "row.names",
  all = TRUE
)
head(annotated_expr)

# Write out to a file:
write.table(annotated_expr, file = "data.ann.txt", sep = "\t")
# Sample sheet, keyed by its first column; keep the key as an explicit column
# as well.
samples <- read.table("sample.txt", header = TRUE, row.names = 1)
samples$Cel_file_name <- rownames(samples)
dim(samples)

# Disabled: this call stopped the script. `x` is never defined in this file
# and none of the attached packages provides call.exprs() (it appears to
# belong to the simpleaffy package -- confirm before re-enabling). The result
# `x.mas5` is not used anywhere below.
# x.mas5 <- call.exprs(x,"mas5") # Calculates expression values with MAS 5.0 method which is required for the next step!
########################
# ---- Reload the expression table and align it with the sample sheet ----
# data_counts_cases <- read.table(file = "star_deseq_raw_counts_batch6.txt", header = TRUE, row.names = 1)

# Expression table written earlier by write.exprs() (data.txt).
data_counts <- read.table(
  file = "/Users/gandreoletti/Library/CloudStorage/OneDrive-GraphiteBio/CD110/GSE174060/data.txt",
  header = TRUE
)
# header.true <- function(df) {
# names(df) <- as.character(unlist(df[1,]))
# df[-1,]
# }
# df1 <- header.true(data_counts)

# Drop rows that contain any missing value.
data_counts <- na.omit(data_counts)
# rownames(data_counts) <- make.unique(data_counts$GeneSymbols)
head(data_counts, 2)
# data_counts$GeneSymbols <- NULL
dim(data_counts)

# Sample sheet, keyed by its `sample` column. This replaces the `samples`
# object created earlier in the script.
samples <- read.table(
  "/Users/gandreoletti/Library/CloudStorage/OneDrive-GraphiteBio/CD110/GSE174060/sample.txt",
  header = TRUE
)
rownames(samples) <- samples$sample
dim(samples)
head(samples)
samples$condition <- as.factor(samples$condition)

library(DESeq2)

# One expression column per sample. This used to be printed only; a mismatch
# now stops the script instead of scrolling past unnoticed.
stopifnot(ncol(data_counts) == nrow(samples))

# Sort both tables by sample name so columns and sample rows line up.
data_counts <- data_counts[, order(colnames(data_counts))]
samples <- samples[order(rownames(samples)), ]

# Every expression column must now sit opposite its own sample row; otherwise
# the condition labels would be attached to the wrong samples.
stopifnot(identical(colnames(data_counts), rownames(samples)))
# ---- Fit the DESeq2 model ----
# NOTE(review): data_counts is read from data.txt, i.e. the RMA output
# described above as log2 transformed and normalized. DESeq2 is designed for
# raw sequencing counts, so rounding these values and fitting a count model is
# questionable; limma (already attached) is the usual choice for such data.
# Confirm that this is intended.
dds <- DESeqDataSetFromMatrix(
  countData = round(data_counts),
  colData = samples,
  design = ~ condition
)

# Only needed by the commented-out geometric-mean size-factor alternative.
cts <- counts(dds)
# geoMeans = apply(cts, 1, function(row) if (all(row == 0)) 0 else exp(mean(log(row[row != 0]))))
# dds = estimateSizeFactors(dds, geoMeans=geoMeans)

# Drop rows whose values sum to zero across all samples.
dds <- dds[rowSums(counts(dds)) > 0, ]

dds <- estimateSizeFactors(dds)

# Use the gene-wise dispersion estimates directly as the final dispersions.
dds <- estimateDispersionsGeneEst(dds)
dispersions(dds) <- mcols(dds)$dispGeneEst

# Run the Wald test on the dispersions set above. The previous call,
# DESeq(dds, test = "Wald"), re-estimated the dispersions and thereby threw
# away the gene-wise values assigned in the line before it.
dds <- nbinomWaldTest(dds)

# Summarise the test results; summary(dds) only described the container.
summary(results(dds))

# Variance-stabilised values (not used further in this script).
vsd <- vst(dds, blind = FALSE, fitType = "local")
# ---- Box plots of MPL (CD110) expression per diagnosis ----
# ggboxplot() and stat_compare_means() come from ggpubr, which the script
# never attached.
library(ggpubr)

# Per-sample values for "MPL", grouped by the `Diagnosis` column.
# NOTE(review): this needs a row of `dds` named "MPL" and a `Diagnosis` column
# in the sample sheet; the model above uses `condition` and the expression
# table comes from probe-level output -- verify both.
d <- plotCounts(dds, gene = "MPL", intgroup = "Diagnosis", returnData = TRUE)

# One colour per diagnosis group, shared by both figures.
diagnosis_palette <- c(
  "#00AFBB", "#E7B800", "#D55E00", "#CC79A7", "#F0E442", "#0072B2"
)

# Figure 1: all groups in default order, with a global p-value.
p_all <- ggboxplot(
  d,
  x = "Diagnosis", y = "count",
  color = "Diagnosis", palette = diagnosis_palette,
  add = "jitter", shape = "Diagnosis"
) +
  ggtitle("CD110 expression across samples") +
  theme_bw() +
  stat_compare_means(label.y = 15) # Add global p-value
print(p_all)
# Pass the plot explicitly instead of relying on the last plot drawn.
# NOTE(review): file names mention GSE54646 while the input paths point to
# GSE174060 -- confirm the intended name.
ggsave("CD110_expGSE54646_CEL.pdf", plot = p_all, height = 6, width = 5.5)

# Figure 2: fixed group order, each listed group compared against "Normal".
d$Diagnosis <- factor(
  d$Diagnosis,
  levels = c("Normal", "ET", "MF", "PV", "JAK2_KO", "HEL_Control")
)
p_by_group <- ggboxplot(
  d,
  x = "Diagnosis", y = "count",
  color = "Diagnosis", palette = diagnosis_palette,
  add = "jitter", shape = "Diagnosis"
) +
  ggtitle("CD110 expression across samples") +
  theme_bw() +
  stat_compare_means(
    comparisons = list(
      c("Normal", "ET"),
      c("Normal", "MF"),
      c("Normal", "PV"),
      c("Normal", "JAK2_KO")
    )
  )
print(p_by_group)
ggsave(
  "CD110_expGSE54646_CEL_byGroup.pdf",
  plot = p_by_group, height = 6, width = 7
)