-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscyttools_geneset_scoring.R
132 lines (100 loc) · 4.59 KB
/
scyttools_geneset_scoring.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/usr/bin/env Rscript
require(docopt)
require(methods)
"
Usage:
scyttools_geneset_scoring.R (-h | --help | --version)
scyttools_geneset_scoring.R RDS OUT
Description: This script performs trajectory inference
Options:
--version Show the current version.
Arguments:
RDS Provide single cell experiment object Rdata file
OUT Provide output file
" -> doc
args <- docopt(doc)
source("scyttools_functions.R")
library(data.table)
##########################################################################
############################ R code goes here ############################
##########################################################################
# load single cell data
load(args$RDS)
# calculate mean absolute deviation for each gene across all samples
median_absolute_deviations <- logcounts(sce_glm_pca) %>%
tidy_sparse_matrix() %>%
group_by(row) %>%
summarise(median_abs_dev = mad(value),
median = median(value))
# calculate modified Z-score by multipling the difference of the cpm_tmm and mean for each gene times 0.6745 (just because), and dividing by the mean absolute deviation
modified_z_score <- logcounts(sce_glm_pca) %>%
tidy_sparse_matrix() %>%
left_join(median_absolute_deviations) %>%
mutate(mod_z = (0.6745*(value - median))/median_abs_dev) %>%
select(row, column, mod_z)
fake_cell <- as.character(ncol(sce_glm_pca) + 1)
zmad_sparse <- data.frame(row = names(logcounts(sce_glm_pca)[,1])) %>%
left_join(modified_z_score) %>%
mutate(row = factor(row, levels = names(logcounts(sce_glm_pca)[,1])),
column = if_else(is.na(column), fake_cell, column),
mod_z = if_else(is.na(mod_z), 0, mod_z),
column = factor(column, levels = c(colnames(logcounts(sce_glm_pca)), fake_cell))) %>%
arrange(column) %>%
tidytext::cast_sparse(row = row,
column = column,
value = mod_z)
assay(sce_glm_pca, "zmad") <- zmad_sparse[,colnames(zmad_sparse) != fake_cell]
geneset_table <- msigdbr::msigdbr() %>%
select(gs_name, entrez_gene) %>%
left_join(grch38 %>%
distinct(ensgene, entrez),
by = c("entrez_gene" = "entrez")) %>%
setDT()
dt_zmad <- zmad_sparse %>%
tidy_sparse_matrix() %>%
setDT()
zmad_scores <- lapply(unique(geneset_table$gs_name), function(geneset){
barcode_scores <- dt_zmad[row %in% geneset_table$ensgene[geneset_table$gs_name == geneset], mean(value, na.rm = T), by = column]
return(barcode_scores)
})
names(zmad_scores) <- unique(geneset_table$gs_name)
zmad_scores <- zmad_scores %>%
bind_rows(.id = "gs_name") %>%
spread(gs_name,
V1)
colData(sce_glm_pca) <- colData(sce_glm_pca) %>%
as.data.frame() %>%
rownames_to_column("row_names") %>%
left_join(zmad_scores,
by = c("Barcode" = "column")) %>%
column_to_rownames("row_names") %>%
DataFrame()
# load in signatures
neuro <- read_csv("/data/capaldobj/nf-core/lgcp/rnaseq/analysis-scripts/neuro_reference_vpca_loadings_grch37.csv")
ar_signature_weights <- read_csv("/data/capaldobj/nf-core/lgcp/rnaseq/analysis-scripts/ar_signature_weights_Mendiratta.csv")
barcode_ar_signature_score <- vector(length = ncol(sce_glm_pca))
for(i in 1:ncol(sce_glm_pca)){
scored <- data.frame(logcounts = logcounts(sce_glm_pca)[rownames(logcounts(sce_glm_pca)) %in% ar_signature_weights$`Ensemble ID`,i],
ensgene = names(logcounts(sce_glm_pca)[rownames(logcounts(sce_glm_pca)) %in% ar_signature_weights$`Ensemble ID`,i])) %>%
left_join(ar_signature_weights, by = c("ensgene" = "Ensemble ID")) %>%
transmute(score = logcounts*`BinReg Coef`)
barcode_ar_signature_score[i] <- sum(scored$score)
}
neuro_joined <- neuro %>%
left_join(as.data.frame(rowData(sce_glm_pca)), by = c("Loading" = "Symbol"))
barcode_neuro_score <- vector(length = ncol(sce_glm_pca))
for(i in 1:ncol(sce_glm_pca)){
scored <- data.frame(logcounts = logcounts(sce_glm_pca)[rowData(sce_glm_pca)$Symbol %in% neuro$Loading,i],
ensgene = names(logcounts(sce_glm_pca)[rowData(sce_glm_pca)$Symbol %in% neuro$Loading,i])) %>%
left_join(neuro_joined, by = c("ensgene" = "ID")) %>%
transmute(score = logcounts*PC1.v)
barcode_neuro_score[i] <- sum(scored$score)
}
sce_glm_pca$NE_score <- barcode_neuro_score
sce_glm_pca$AR_score <- barcode_ar_signature_score
## write out new SingleCellExperiment Object
save(sce_glm_pca,
file = args$OUT)
##########################################################################
############################ End code ############################
##########################################################################