-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprepare_data.R
executable file
·450 lines (398 loc) · 25.3 KB
/
prepare_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
# Preparing data for the analysis of proteins relocation upon ZIKV infection (Solene Denolly)
#
# Author: Alexey Stukalov
###############################################################################
project_id <- "sdenolly_canxapms"
msfolder <- "mq20211122"
data_version <- "20211125"
message('Project ID=', project_id, " data version=", data_version)
source("~/R/config.R")
source(file.path(base_scripts_path, 'R/misc/setup_base_paths.R'))
source(file.path(misc_scripts_path, 'setup_project_paths.R'))
source(file.path(misc_scripts_path, 'fasta_utils.R'))
source(file.path(misc_scripts_path, 'ggplot_ext.R'))
require(msimportr)
require(msglm)
require(dplyr)
require(jsonlite)
require(stringr)
require(readr)
require(pheatmap)
data_info <- list(project_id = project_id, data_ver=data_version, fit_ver="20211126",
pepmodstate_mscalib_filename = "mscalib_QEP5_intensity_pepmodstate_cov2_20211108.json",
msfolder = msfolder)
msdata_path <- file.path(data_path, msfolder)
message('Loading MS instrument protein calibration data from ', data_info$pepmodstate_mscalib_filename, '...')
pepmodstate_mscalib <- read_mscalib_json(file.path(data_path, data_info$pepmodstate_mscalib_filename))# %>% convert_logintensityBase(new_base=2)
msruns.df <- read_tsv(file.path(msdata_path, "msruns.txt")) %>%
tidyr::extract("msrun", c('bait', 'treatment', 'batch'),
"^(.+)_(.+)_(.+)$", remove = FALSE, convert=TRUE) %>%
dplyr::mutate(msrun_ix = row_number(),
rawfile = str_remove(rawfile, "\\.raw$"),
bait = factor(bait) %>% relevel("empty"),
treatment = factor(treatment) %>% relevel("Mock")) %>%
dplyr::arrange(bait, treatment, batch) %>%
dplyr::mutate(condition = paste0(bait, '_', treatment),
condition = factor(condition, levels=unique(condition)) %>% relevel("empty_Mock"))
fasta.dfs <- list(
HAtag = read_innate_uniprot_fasta(file.path(data_path, "fasta", "hatag.fasta")),
ZikV = read_innate_uniprot_fasta(file.path(data_path, "fasta", "A0A024B7W1_ZIKVF.fasta")),
human = bind_rows(
read_innate_uniprot_fasta(file.path(data_path, "fasta", "UP000005640_9606.fasta")),
read_innate_uniprot_fasta(file.path(data_path, "fasta", "UP000005640_9606_additional.fasta"))
)
)
fix_viral_protein_acs <- function(acs) {
str_remove_all(str_remove_all(acs, "(?<=^|;)(?:sp|tr)\\|"), "\\|[^|]+(?=;|$)")
}
msdata.wide <- read.MaxQuant.ProteinGroups(file.path(msdata_path, 'combined/txt'), import_data = c(data_info$quant_type, "ident_type"))
# fix protein acs
msdata.wide <- dplyr::mutate(msdata.wide,
across(matches("_acs?$"), fix_viral_protein_acs))
msdata_colgroups <- attr(msdata.wide, "column_groups")
mqevidence <- read.MaxQuant.Evidence(file.path(msdata_path, 'combined', 'txt'),
mschannel_annotate.f = function(mschans_df) {
res <- dplyr::inner_join(dplyr::select(mschans_df, mstag, rawfile),
dplyr::select(msruns.df, rawfile, condition, treatment, bait,
msrun=msrun, msexperiment=msrun, batch),
by="rawfile")
print(res)
attr(res, "column_scopes") <- c(treatment = "msexperiment",
bait = "msexperiment",
condition = "msexperiment",
batch = "msexperiment")
return(res)
})
mqevidence$peptides <- read.MaxQuant.Peptides(file.path(msdata_path, 'combined', 'txt'), file_name='peptides.txt',
import_data='ident_type')
mqevidence$peaks <- NULL # exclude big data frame
for (dfname in c("pepmods", "pepmodstates", "peptides")) {
mqevidence[[dfname]] <- dplyr::mutate(mqevidence[[dfname]], across(matches("_acs?$"), fix_viral_protein_acs))
}
strlist_label <- function(strs) {
str_c(strs[[1]], if_else(n_distinct(strs) > 1, '...', ''))
}
strlist_label2 <- function(strs, delim=fixed(';')) {
sapply(str_split(strs, delim), strlist_label)
}
# all ms data
msdata_full <- list(msexperiments = dplyr::select(mqevidence$msexperiments, -starts_with("msfraction")),
msruns = mqevidence$msruns)
msdata_full <- append_protgroups_info(msdata_full, dplyr::mutate(msdata.wide, organism=NULL),
proteins_info = dplyr::bind_rows(fasta.dfs) %>%
dplyr::mutate(protein_ac_noiso = str_remove(protein_ac, "-\\d+(?:#.+)*$"),
protein_isoform_ix = replace_na(as.integer(str_match(protein_ac, "-(\\d+)$")[, 2]), 1L)),
import_columns = c("organism"))
msdata_full$peptides <- mqevidence$peptides %>%
dplyr::left_join(select(dplyr::bind_rows(fasta.dfs), lead_razor_protein_ac = protein_ac, organism)) %>%
dplyr::mutate(peptide_rank = 1L)
msdata_full$pepmods <- mqevidence$pepmods %>%
dplyr::mutate(pepmod_rank = 1L)
msdata_full$pepmodstates <- mqevidence$pepmodstates
# redefine protein groups (protregroups)
peptides.df <- dplyr::select(msdata_full$peptides, peptide_id, protgroup_ids, protein_acs, lead_razor_protein_ac,
peptide_seq, is_reverse, peptide_rank)
proteins.df <- msdata_full$proteins
save(file = file.path(msdata_path, str_c(project_id, "_", msfolder, '_', data_version, "_peptides.RData")),
peptides.df, proteins.df)
# .. run protregroup.jl
msdata_full$protregroups <- read_tsv(file.path(data_path, msfolder,
str_c(project_id, "_", msfolder, '_', data_version, "_protregroups.txt")),
col_types = list(protregroup_id = "i"))
msdata_full$protein2protregroup <- dplyr::select(msdata_full$protregroups, protregroup_id, protein_ac=majority_protein_acs) %>%
separate_rows(protein_ac, sep=fixed(";"), convert=TRUE) %>%
dplyr::mutate(is_majority = TRUE) %>%
dplyr::group_by(protregroup_id) %>%
dplyr::mutate(protein_ac_rank = row_number()) %>%
dplyr::ungroup()
msdata_full$protregroup2peptide <- bind_rows(
select(msdata_full$protregroups, protregroup_id, peptide_id=spec_peptide_ids) %>%
separate_rows(peptide_id, sep=fixed(";"), convert=TRUE) %>%
mutate(is_specific = TRUE),
select(msdata_full$protregroups, protregroup_id, peptide_id=peptide_ids) %>%
separate_rows(peptide_id, sep=fixed(";"), convert=TRUE) %>%
mutate(is_specific = FALSE)) %>%
dplyr::group_by(protregroup_id, peptide_id) %>%
dplyr::summarise(is_specific = any(is_specific)) %>%
dplyr::ungroup()
msdata_full$protregroup2pepmod <- dplyr::inner_join(msdata_full$protregroup2peptide,
dplyr::select(msdata_full$pepmods, pepmod_id, peptide_id)) %>%
dplyr::select(-peptide_id)
msdata_full$protregroup2pepmodstate <- dplyr::semi_join(msdata_full$protregroup2pepmod,
dplyr::select(msdata_full$pepmods, pepmod_id)) %>%
dplyr::inner_join(dplyr::select(msdata_full$pepmodstates, pepmod_id, pepmodstate_id))
msdata_full$protregroups <- msdata_full$protregroups %>%
dplyr::mutate(gene_label = strlist_label2(gene_names),
protein_label = strlist_label2(protein_names),
protein_description = strlist_label2(protein_descriptions),
is_viral = replace_na(str_detect(organism, "ZIKV"), FALSE),
protac_label = strlist_label2(majority_protein_acs),
protregroup_label = case_when(is_viral ~ protein_label,
!is.na(gene_label) ~ gene_label,
!is.na(protac_label) ~ protac_label,
TRUE ~ str_c('#', protregroup_id))) %>%
dplyr::left_join(dplyr::inner_join(msdata_full$protregroup2pepmod, msdata_full$pepmods) %>%
dplyr::group_by(protregroup_id) %>%
dplyr::summarise(npeptides = n_distinct(peptide_id),
npepmods = n_distinct(pepmod_id),
npeptides_unique = n_distinct(peptide_id[is_specific]),
npepmods_unique = n_distinct(pepmod_id[is_specific])) %>%
dplyr::ungroup() %>%
dplyr::mutate(npeptides_unique_razor = npeptides_unique,
npeptides_razor = 0L,
npepmods_unique_razor = npepmods_unique,
npepmods_razor = 0L))
msdata_full$pepmodstate_intensities <- mqevidence$pepmodstate_intensities %>%
dplyr::mutate(is_idented = str_detect(ident_type, "MSMS"))
msdata_full$protregroup_idents <- dplyr::inner_join(msdata_full$protregroup2pepmod,
msdata_full$pepmodstate_intensities) %>%
dplyr::group_by(msexperiment, protregroup_id) %>%
dplyr::summarise(npepmods_quanted = sum(!is.na(intensity)),
nspecpepmods_quanted = sum(!is.na(intensity) & is_specific),
npepmods_idented = sum(is_idented),
nspecpepmods_idented = sum(is_idented & is_specific)) %>%
dplyr::ungroup() %>%
dplyr::mutate(ident_type = factor(if_else(nspecpepmods_idented > 0, "By MS/MS", "By matching")))
# setup experimental design matrices
conditions.df <- bind_rows(dplyr::select(msdata_full$msexperiments, condition, treatment, bait) %>%
dplyr::distinct() %>%
dplyr::mutate(is_virtual = FALSE)) %>%
dplyr::arrange(is_virtual, condition) %>%
dplyr::mutate(is_negctrl = bait == 'empty' & treatment == 'Mock')
conditionXeffect.mtx <- model.matrix(~ 1 + treatment*bait, conditions.df)
conditionXeffect.mtx <- conditionXeffect.mtx[, setdiff(colnames(conditionXeffect.mtx), '(Intercept)')] # handled separately
dimnames(conditionXeffect.mtx) <- list(condition = conditions.df$condition,
effect = colnames(conditionXeffect.mtx))
all_conditions <- as.character(conditions.df$condition)
plot_path <- file.path(analysis_path, 'plots', str_c(msfolder, "_", data_info$fit_ver))
if (!dir.exists(plot_path)) dir.create(plot_path)
pheatmap(conditionXeffect.mtx, cluster_rows=FALSE, cluster_cols=FALSE, na_col = "gray", border_color = "white",
filename = file.path(plot_path,
paste0(project_id, "_conditionXeffect_", data_info$msfolder, "_", data_info$fit_ver, ".pdf")),
width = 6, height = 5)
effects.df <- tibble(effect=colnames(conditionXeffect.mtx)) %>%
dplyr::mutate(treatment = effect_factor(effect, "treatment", levels(conditions.df$treatment), NA_character_),
bait = effect_factor(effect, "bait", levels(conditions.df$bait), NA_character_),
is_positive = FALSE) %>%
dplyr::mutate(effect_type = case_when(
!is.na(bait) & !is.na(treatment) ~ "bait_treatment",
!is.na(bait) ~ "bait",
!is.na(treatment) ~ "treatment")) %>%
dplyr::mutate(prior_tau = case_when(effect_type == "bait" ~ 1.0,
effect_type == "treatment" ~ 1.0,
effect_type == "bait_treatment" ~ 0.5,
TRUE ~ 1.0),
prior_mean = 0.0,
prior_df1 = case_when(effect_type == "bait" ~ 2.0,
effect_type == "treatment" ~ 2.0,
effect_type == "bait_treatment" ~ 4.0,
TRUE ~ 1.0),
prior_df2 = prior_df1)
effects.df$effect_label <- sapply(1:nrow(effects.df), function(i) {
bait <- as.character(effects.df$bait[[i]])
trt <- as.character(effects.df$treatment[[i]])
chunks <- if (!is.na(bait)) { bait } else { character(0) }
if (!is.na(trt)) chunks <- append(chunks, trt)
paste0(chunks, collapse="+")
})
all_metaconditions <- c(all_conditions)
conditionXmetacondition.mtx <- msglm::constant_matrix(FALSE, list(condition = all_conditions,
metacondition = all_metaconditions))
for (cname in conditions.df$condition) {
conditionXmetacondition.mtx[cname, cname] <- TRUE
}
conditionXmetacondition.df <- as.data.frame.table(conditionXmetacondition.mtx, responseName = "part_of",
stringsAsFactors = FALSE) %>% as_tibble() %>%
dplyr::filter(part_of) %>% dplyr::select(-part_of)
pheatmap(ifelse(conditionXmetacondition.mtx, 1.0, 0.0), cluster_rows=FALSE, cluster_cols=FALSE,
filename = file.path(plot_path,
paste0(project_id, "_metaconditions_", data_info$msfolder, "_", data_info$fit_ver, ".pdf")),
width = 6, height = 5)
canx_vs_empty <- paste0("CANX_vs_empty@", levels(conditions.df$treatment))
zikv_vs_mock <- paste0("ZikV_vs_Mock@", levels(conditions.df$bait))
all_contrasts <- c(canx_vs_empty, zikv_vs_mock)
metaconditionXcontrast.mtx <- msglm::constant_matrix(0, list(metacondition = all_metaconditions,
contrast = all_contrasts))
for (bait in levels(conditions.df$bait)) {
metaconditionXcontrast.mtx[paste0(bait, '_ZikV'), paste0('ZikV_vs_Mock@', bait)] <- 1.0
metaconditionXcontrast.mtx[paste0(bait, '_Mock'), paste0('ZikV_vs_Mock@', bait)] <- -1.0
}
for (trt in levels(conditions.df$treatment)) {
metaconditionXcontrast.mtx[paste0('CANX_', trt), paste0('CANX_vs_empty@', trt)] <- 1.0
metaconditionXcontrast.mtx[paste0('empty_', trt), paste0('CANX_vs_empty@', trt)] <- -1.0
}
pheatmap(metaconditionXcontrast.mtx, cluster_rows=FALSE, cluster_cols=FALSE,
filename = file.path(plot_path,
paste0(project_id, "_contrasts_", data_info$msfolder, "_", data_info$fit_ver, ".pdf")),
width = 6, height = 5)
metaconditionXcontrast.df <- as.data.frame.table(metaconditionXcontrast.mtx, responseName="weight",
stringsAsFactors = FALSE) %>% as_tibble() %>%
dplyr::filter(weight != 0) %>%
dplyr::mutate(contrast_type = if_else(contrast %in% canx_vs_empty, "filter", "comparison"),
condition_role = if_else(contrast_type == 'filter',
if_else(weight > 0, 'signal', 'background'),
'signal'))
contrasts.df <- dplyr::select(metaconditionXcontrast.df, contrast, contrast_type) %>%
dplyr::distinct()
msrunXbatchEffect.mtx <- model.matrix(~ 1 + batch, msdata_full$msruns %>% dplyr::mutate(batch = factor(batch)))
msrunXbatchEffect.mtx <- msrunXbatchEffect.mtx[, -1, drop=FALSE]
dimnames(msrunXbatchEffect.mtx) <- list(msrun = msdata_full$msruns$msrun,
batch_effect = colnames(msrunXbatchEffect.mtx))
pheatmap(msrunXbatchEffect.mtx, cluster_rows=FALSE, cluster_cols=FALSE,
file=file.path(analysis_path, "plots", str_c(msfolder, "_", data_info$fit_ver),
paste0(project_id, "_", msfolder, "_", data_info$fit_ver, "_batch_effects.pdf")),
width=4, height=10)
batch_effects.df <- tibble(batch_effect = colnames(msrunXbatchEffect.mtx),
is_positive = FALSE,
prior_mean = 0.0)
msglm_def <- msglm_model(conditionXeffect.mtx, conditions.df, effects.df, verbose=TRUE) %>%
msglm::set_contrasts(metaconditionXcontrast.mtx, conditionXmetacondition.mtx, contrasts.df) %>%
msglm::set_batch_effects(msrunXbatchEffect.mtx,
batch_effects.df, applies_to="object")
msdata <- import_msglm_data(msdata_full, msglm_def,
object="protregroup", quantobject="pepmodstate",
verbose = TRUE)
msdata4norm.df <- msdata_full$pepmodstate_intensities %>%
dplyr::semi_join(dplyr::select(msdata_full$pepmodstates, pepmodstate_id)) %>%
dplyr::semi_join(dplyr::semi_join(msdata_full$pepmods,
dplyr::filter(msdata_full$protregroups %>% dplyr::mutate(is_viral=FALSE),
!is_reverse & !is_contaminant & !is_viral) %>%
dplyr::inner_join(msdata_full$protregroup2pepmod) %>% dplyr::select(pepmod_id)) %>%
dplyr::filter(!is_reverse & !is_contaminant) %>%
dplyr::select(pepmod_id))
require(cmdstanr)
options(mc.cores=8L)
Sys.setenv(MKL_NUM_THREADS=1L)
# normalize experiments:
msruns_hnorm <- multilevel_normalize_experiments(msdata$pepmodstate_mscalib,
dplyr::select(msdata$msruns, msrun, treatment, condition, bait),
msdata4norm.df,
quant_col = "intensity", obj_col = "pepmodstate_id", mschan_col = "msrun",
#mcmc.iter = 100L,
verbose=TRUE,
stan_method = "mcmc",
quant_ratio.max = 3.0,
norm_levels = list(msrun = list(cond_col="msrun", nmschan_ratio.min=1.0, max_objs=1000L),
condition = list(cond_col="condition", max_objs=1000L),
bait = list(cond_col = "bait", nmschan_ratio.min=1.0)))
# skip bait shift because it's too big
msdata$msrun_shifts <- msruns_hnorm$mschannel_shifts
# estimate the effect scales per each msrun
pepmodstate_intensities4eff_scale.df <- msdata_full$pepmodstate_intensities %>%
# fitler for CANX interactors that are known ER-localized proteins
dplyr::semi_join(dplyr::filter(msdata_full$proteins, str_detect(gene_name, "^(CANX|EMC(1|2|3|4|10)|ERLIN(1|2)|GOLGA4|ERP29|PGRMC1|PDIA6|ESYT(1|2)|POR|TXNDC(5|12)|MLEC|MESD|SRPR[AB]|UGGT1|NOMO1|MOGS|SPTLC1|ASPH)$")) %>%
dplyr::inner_join(msdata_full$protein2protregroup, by="protein_ac") %>%
dplyr::inner_join(dplyr::filter(msdata_full$protregroup2pepmodstate, is_specific), by="protregroup_id") %>%
dplyr::select(pepmodstate_id) %>% dplyr::distinct()) %>%
dplyr::inner_join(msdata$msrun_shifts) %>%
dplyr::inner_join(msdata$msruns) %>%
dplyr::mutate(intensity_norm = intensity*2^(-total_msrun_shift),
intensity_norm_log2 = log2(intensity_norm)) %>%
dplyr::group_by(pepmodstate_id, condition, batch) %>%
dplyr::mutate(intensity_cond = median(intensity_norm, na.rm=TRUE)) %>%
dplyr::group_by(pepmodstate_id, batch) %>%
dplyr::mutate(intensity_empty_mock = intensity_cond[[which.max(condition == "empty_Mock")]],
intensity_empty_zikv = intensity_cond[[which.max(condition == "empty_ZikV")]],
intensity_CANX_mock = intensity_cond[[which.max(condition == "CANX_Mock")]],
intensity_CANX_zikv = intensity_cond[[which.max(condition == "CANX_ZikV")]]) %>%
dplyr::ungroup() %>%
dplyr::filter(bait == "CANX") %>%
dplyr::mutate(is_infected = treatment != "Mock",
intensity_cond_log2 = log2(intensity_cond),
intensity_cond_vs_empty_log2 = intensity_cond_log2 - log2(if_else(is_infected, intensity_empty_zikv, intensity_empty_mock)),
effect = if_else(intensity_cond_vs_empty_log2 == 0, NA_real_, intensity_cond_vs_empty_log2)) %>%
#dplyr::group_by(is_infected, batch) %>%
#dplyr::mutate(effect_bin = as.integer(cut(effect, 50))) %>%
#dplyr::group_by(is_infected, batch, effect_bin) %>%
#dplyr::slice_sample(n = 200) %>%
dplyr::ungroup()
pepmodstate_intensities_pairs4eff_scale.df <- dplyr::left_join(pepmodstate_intensities4eff_scale.df,
pepmodstate_intensities4eff_scale.df,
by=c("bait", "batch", "pepmodstate_id")) %>%
dplyr::filter(msrun.x != msrun.y & (effect.x > 1.0) & (effect.y > 1.0))
ggplot(dplyr::filter(pepmodstate_intensities_pairs4eff_scale.df, is_infected.y & msrun.x != msrun.y)) +
geom_abline(slope = 1, intercept = 0, color = "darkred", linetype = "dotted") +
geom_point(aes(x = effect.x, y = effect.y, color = factor(batch)))
require(cmdstanr)
options(mc.cores=8)
effect_scales.stanmodel <- cmdstanr::cmdstan_model(file.path(project_scripts_path, "effect_scales_normalize.stan"))
fit_effect_scales <- function(data_df, sigma_t_offset=1E-3, scale_sigma_df=2, data_sigma_df=16) {
data_df <- mutate(data_df,
msrun.x = factor(msrun.x),
msrun.y = factor(msrun.y, levels=levels(msrun.x)),
msrun_ix.x = as.integer(msrun.x),
msrun_ix.y = as.integer(msrun.y))
effect_scales.data <- list(Nmsruns = n_distinct(data_df$msrun.x),
NeffectPairs = nrow(data_df),
outlierProb = 1E-3,
effect_x = data_df$effect.x,
effect_y = data_df$effect.y,
msrun_x = data_df$msrun_ix.x,
msrun_y = data_df$msrun_ix.y,
sigma_t_offset = sigma_t_offset,
scale_sigma_df = scale_sigma_df,
data_sigma_df = data_sigma_df)
effect_scales.stanfit <- effect_scales.stanmodel$sample(
data = effect_scales.data, chains = 8,
iter_warmup=1000, iter_sampling=1000)
effect_scales_stats.df <- effect_scales.stanfit$summary(
variables = c("data_sigma", "scale_sigma", "scale_log2", "scale_mult"),
msglm:::posterior_summary_metrics) %>%
dplyr::rename(varspec = variable) %>%
tidyr::extract(varspec, c("var", "index_msrun"), "^(\\w+)(?:\\[(\\d+)\\])?$",
remove=FALSE, convert=TRUE) %>%
dplyr::mutate(msrun = levels(data_df$msrun.x)[index_msrun])
}
effect_scales_fit.df <- fit_effect_scales(pepmodstate_intensities_pairs4eff_scale.df)
#saveRDS(effect_scales_fit.df, file.path(scratch_path, str_c(project_id, "_", msfolder, "_", data_info$fit_ver, "_effect_scales.rds")))
effect_scales.df <- dplyr::transmute(dplyr::filter(effect_scales_fit.df, var=="scale_log2"),
msrun,
msrun_scale_log2=median, msrun_scale_log2_sd=sd,
ess_bulk, rhat) %>%
dplyr::left_join(dplyr::select(msdata_full$msruns, msrun, bait, condition, treatment, batch)) %>%
dplyr::group_by(bait, batch) %>%
dplyr::mutate(msrun_scale_log2 = msrun_scale_log2 - max(msrun_scale_log2),
msrun_scale=2^msrun_scale_log2,
msrun_scale_log2_median = median(msrun_scale_log2),
msrun_scale_avg = mean(msrun_scale)) %>%
dplyr::ungroup()
msrunXeffect.df <- inner_join(as.data.frame.table(conditionXeffect.mtx, responseName="w"), msdata$msruns) %>%
dplyr::filter(w != 0) %>%
dplyr::left_join(dplyr::select(effect_scales.df, msrun, msrun_scale)) %>%
dplyr::left_join(dplyr::select(effects.df, effect, eff_bait = bait)) %>%
dplyr::mutate(effect_scale = if_else(is.na(eff_bait), 1.0, msrun_scale),
w = w * effect_scale)
msexpXeffect.mtx <- msglm::frame2matrix(msrunXeffect.df, row_col="msexperiment", col_col="effect",
cols = colnames(conditionXeffect.mtx),
rows = dplyr::arrange(msdata$msexperiments, bait, treatment, batch) %>% .$msexperiment)
pheatmap(msexpXeffect.mtx, cluster_rows=FALSE, cluster_cols=FALSE, display_numbers = TRUE, number_format = "%.3f",
filename = file.path(analysis_path, "plots", paste0(msfolder, "_", data_info$fit_ver),
paste0(project_id, "_msexperimentXeffect_", msfolder, "_", data_info$fit_ver, ".pdf")),
width = 8, height = 12)
# redefine model with scaled effects
msglm_def <- msglm_model(conditionXeffect.mtx, conditions.df, effects.df,
msprobeXeffect = msexpXeffect.mtx, verbose=TRUE) %>%
msglm::set_contrasts(metaconditionXcontrast.mtx, conditionXmetacondition.mtx, contrasts.df) %>%
msglm::set_batch_effects(msrunXbatchEffect.mtx,
batch_effects.df, applies_to="object")
rmsglmdata_filepath <- file.path(scratch_path, paste0(project_id, '_msglm_data_', data_info$fit_ver, '.RData'))
message('Saving data for MSGLM to ', rmsglmdata_filepath, '...')
save(data_info, msdata, msglm_def, msruns_hnorm, effect_scales.df,
file = rmsglmdata_filepath)
rfulldata_filepath <- file.path(scratch_path, str_c(project_id, '_msdata_full_', data_info$data_ver, '.RData'))
message('Saving full MS data to ', rfulldata_filepath, '...')
save(data_info, msdata_full, fasta.dfs,
file = rfulldata_filepath)
message('Done.')
tmp.env <- new.env(parent=baseenv())
load("/pool/analysis/astukalov/aherrmann_cov2omap/scratch/aherrmann_cov2omap_msglm_data_20211003.RData", envir = tmp.env)
ggplot(inner_join(msdata$msrun_shifts, tmp.env$msdata$msrun_shifts, by=c("conditionXmsfraction", "compartmentXmsfraction", "msrun")),
aes(x=total_msrun_shift.x, y=total_msrun_shift.y, label=msrun, color=compartmentXmsfraction)) +
geom_point() +
geom_text(vjust=-1.1, size=3) +
geom_abline(intercept = 0, slope = 1, color="firebrick", linetype="dotted") +
theme_bw()
ggplot(total_msrun_shifts.df %>%
dplyr::inner_join(total_msrun_shifts_old.df %>%
dplyr::select(mschannel_ix, total_mschannel_shift_old = total_mschannel_shift))) +
geom_point()