xrydbh 2022-02-04
#` ---
#` title: "Boxplots
#` author: "Halfdan Rydbeck"
#` date: "`r Sys.Date()`"
#` output: github_document: default #html_preview: false
#` ---
#`
library(readxl)
library(tidyverse)
library(rmarkdown)
# Set path
# If script is running separately move up one step with setwd(".."). Only needed once (for first script run separately).
protID="UniprotID"
p_val_cut_off=0.05
fold_change_cutoff=2
de_q_sign_res_p005_FC_2 <- read_xlsx("./out_r/007_DEqMS_sign_res_p005_FC_2_ms.xlsx")
top_ms <- de_q_sign_res_p005_FC_2$gene
ttest_sign_res_p005_FC_2 <- read_xlsx("./out_r/007_t_test_sign_res_p005_FC_2_pea.xlsx")
top_pea <- ttest_sign_res_p005_FC_2$UniprotID
# Load normalized data
ms_pea_data <- load("./RData/003_ab_list.RData")
names(ab_list)
## [1] "dat_abundances" "dat_long" "dat_meta" "dat_abund_test"
names(ab_list[["dat_long"]])
## [1] "ms_abs_long_eq_med_norm" "pea_abs_long_eq_med_norm"
## [3] "pea_long" "ms_long"
#ms_abs_long_eq_med_norm
# Choose long data sets and Select and split columns (for ms eq_med_norm [on top of Prot_median_cent_l2] is selected)
ms_for_bp.0.0 <- ab_list[["dat_long"]][["ms_abs_long_eq_med_norm"]] %>%
dplyr::select(UniprotID,Gene.symbol,Stem.cell.cont, Sample,Prot_median_cent_l2_norm_ab_eq_med_norm=value) %>%
dplyr::filter(!!as.symbol(protID)%in%top_ms) %>%
mutate_at(vars(c("Prot_median_cent_l2_norm_ab_eq_med_norm")),as.numeric) # %>%
# separate(
# SC_Donor,
# c("Stem.cell.cont","Sample"),
# sep = "_")
ms_key <- ms_for_bp.0.0 %>%
dplyr::select(UniprotID,Gene.symbol) %>%
distinct()
pea_for_bp.0.0 <- ab_list[["dat_long"]][["pea_abs_long_eq_med_norm"]] %>%
dplyr::select(UniprotID,Gene.symbol,Stem.cell.cont,Sample,Prot_median_cent_rel_npx_eq_med_norm=value) %>%
dplyr::filter(!!as.symbol(protID)%in%top_pea) %>%
mutate_at(vars(c("Prot_median_cent_rel_npx_eq_med_norm")),as.numeric) #%>%
# separate(
# SC_Donor,
# c("Stem.cell.cont","Sample"),
# sep = "_")
###################
# MS
###################
# There are genes where one Uniprot codes to multiple Gene.symbols; look for them
Uniprot_with_multiple_matches_in_GS_ms <- inner_join(ms_for_bp.0.0,ms_key,by=c("UniprotID"="UniprotID")) %>%
#relocate(gene.symb,.after=UniprotID) %>%
dplyr::rename(Gene.symbol.ms=Gene.symbol.x) %>%
group_by(UniprotID) %>%
dplyr::select(UniprotID,Gene.symbol.ms) %>%
distinct() %>%
dplyr::filter(n()>1) %>%
arrange(UniprotID)
Uniprot_with_multiple_matches_in_GS_ms <- ms_for_bp.0.0 %>%
group_by(UniprotID) %>%
dplyr::select(UniprotID,Gene.symbol) %>%
distinct() %>%
dplyr::filter(n()>1)
# Remove duplicatd gene.IDs
ms_for_bp <- ms_for_bp.0.0 %>%
dplyr::filter(Gene.symbol!="SNHG28") %>% # Removing multiple matches
#dplyr::filter(Gene.symbol!="VSIG8") %>% #Dont remove this one RPIA (a Gene symbol synonym to corresponding to Uniprot P49247) is missing in this data set
#dplyr::filter(Gene.symbol!="LOC101060545") %>% # or this one since since C1orf204 (a third Gene.symbol synonym ot Uniprot ID P0DPA2)
unite(UnitedIDs,Gene.symbol:UniprotID,sep="_", remove = F)
# Check again; should have 0 rows
ms_for_bp %>%
group_by(UniprotID) %>%
dplyr::select(UniprotID,Gene.symbol) %>%
distinct() %>%
dplyr::filter(n()>1) %>%
arrange(UniprotID)
## # A tibble: 0 × 2
## # Groups: UniprotID [0]
## # … with 2 variables: UniprotID <chr>, Gene.symbol <chr>
# Find proteins with outliers
outliers_ms <- ms_for_bp %>%
group_by(UnitedIDs) %>%
dplyr::filter(Prot_median_cent_l2_norm_ab_eq_med_norm>3.1 | Prot_median_cent_l2_norm_ab_eq_med_norm < -3.1) %>%
pluck("UnitedIDs") %>%
unique()
n_prots=ms_for_bp %>%
dplyr::select(UnitedIDs,Gene.symbol) %>%
distinct() %>%
arrange(UnitedIDs) %>%
nrow()
n_prots
## [1] 44
ms_for_bp_regular <- ms_for_bp %>%
dplyr::filter(!(UnitedIDs%in%outliers_ms))
ms_for_bp_outliers<- ms_for_bp %>%
dplyr::filter(UnitedIDs%in%outliers_ms)
n_prots_regular=ms_for_bp_regular %>%
dplyr::select(UnitedIDs,Gene.symbol) %>%
distinct() %>%
arrange(UnitedIDs) %>%
nrow()
n_prots_regular
## [1] 32
n_prots_outliers=ms_for_bp_outliers %>%
dplyr::select(UnitedIDs,Gene.symbol) %>%
distinct() %>%
arrange(UnitedIDs) %>%
nrow()
n_prots_outliers
## [1] 12
ms_for_bp_regular_plot <- ms_for_bp_regular %>% ggplot(aes(x=Stem.cell.cont,
y=Prot_median_cent_l2_norm_ab_eq_med_norm,
fill=Stem.cell.cont)) +
facet_wrap("UnitedIDs",ncol=6) + #vars(Gene.symbol)
#theme() +
geom_boxplot(outlier.size = 0) +
geom_point(position=position_jitterdodge(),alpha=0.5, size=1) +
#theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
theme_classic() +
theme(legend.position = "none",strip.text.x = element_text(size = 6)) + # , colour = "orange"
ggtitle(paste("Mass spectrometry\n",n_prots_regular," with narrow distribution out of ",n_prots," proteins with p < ",p_val_cut_off,"; FC > ",fold_change_cutoff,sep="")) +
labs(y= "Median centred log2 normalized abundance", x = "Top table protein")
ms_for_bp_regular_plot
## Warning: Removed 3 rows containing non-finite values (stat_boxplot).
## Warning: Removed 3 rows containing missing values (geom_point).
# Narrow distributinons
pdf(file = paste("./out_r/008_boxplot_p005_FC_2_sign_DEqMS_narrow_dist_eq_med_norm.pdf",sep=""))
ms_for_bp_regular_plot
## Warning: Removed 3 rows containing non-finite values (stat_boxplot).
## Removed 3 rows containing missing values (geom_point).
dev.off()
## quartz_off_screen
## 2
# Wider distributions
pdf(file = paste("./out_r/008_boxplot_p005_FC_2_sign_DEqMS_wider_dist_eq_med_norm.pdf",sep=""))
ms_for_bp_outliers %>% ggplot(aes(x=Stem.cell.cont,
y=Prot_median_cent_l2_norm_ab_eq_med_norm,
fill=Stem.cell.cont)) +
facet_wrap("UnitedIDs",ncol=6) + #vars(Gene.symbol)
#theme() +
geom_boxplot(outlier.size = 0) +
geom_point(position=position_jitterdodge(),alpha=0.5, size=1) +
#theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
theme_classic() +
theme(legend.position = "none",strip.text.x = element_text(size = 6)) + # , colour = "orange"
ggtitle(paste("Mass spectrometry\n",n_prots_outliers," with wider distribution out of ",n_prots," proteins with p < ",p_val_cut_off,"; FC > ",fold_change_cutoff,sep="")) +
labs(y= "Median centred log2 normalized abundance", x = "Top table protein")
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).
## Warning: Removed 4 rows containing missing values (geom_point).
dev.off()
## quartz_off_screen
## 2
###################
# PEA
###################
Uniprot_with_multiple_matches_in_GS_pea <- pea_for_bp.0.0 %>%
group_by(UniprotID) %>%
dplyr::select(UniprotID,Gene.symbol) %>%
distinct() %>%
filter(n()>1) %>%
arrange(UniprotID)
pea_for_bp <- pea_for_bp.0.0 %>%
unite(UnitedIDs,Gene.symbol:UniprotID,sep="_", remove = F)
n_prots_pea=pea_for_bp %>%
dplyr::select(UnitedIDs,Gene.symbol) %>%
distinct() %>%
arrange(UnitedIDs) %>%
nrow()
# Make boxplot
pea_bp <- pea_for_bp %>% ggplot(aes(x=Stem.cell.cont,
y=Prot_median_cent_rel_npx_eq_med_norm,
fill=Stem.cell.cont
)
) +
facet_wrap("UnitedIDs",ncol=4) + #vars(Gene.symbol)
geom_boxplot(outlier.size = 0) +
geom_point(position=position_jitterdodge(),alpha=0.5, size=1) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
theme_classic() +
theme(legend.position = "none") +
ggtitle("Olink") +
ggtitle(paste("Olink\n",n_prots_pea," proteins with p < ",p_val_cut_off,"; FC > ",fold_change_cutoff,sep="")) +
labs(y= "Median centred relative NPX (to sample sum)", x = "Top table protein")
# Boxplot for report
pea_bp
# Boxplot to file
pdf(file = paste("./out_r/008_boxplot_p005_FC_2_sign_ttest_pea_eq_med_norm.pdf",sep=""))
pea_bp
dev.off()
## quartz_off_screen
## 2
# ##########################
# Boxplot of shared significant
# ##########################
# Get significant genes
signMS_unip <- read_xlsx("./out_r/007_DEqMS_res.xlsx") %>%
dplyr::filter(sca.P.Value<p_val_cut_off) %>%
dplyr::select(UniprotID=gene)
sigPEA_unip <- read_xlsx("./out_r/007_t_test_res_pea.xlsx") %>%
dplyr::filter(p.value<p_val_cut_off) %>%
dplyr::select( UniprotID)
# get significant in both platforms
sign_in_both_plfs <- signMS_unip %>% inner_join(sigPEA_unip) %>%
pluck("UniprotID")
## Joining, by = "UniprotID"
ms <- ab_list[["dat_long"]][["ms_abs_long_eq_med_norm"]] %>%
dplyr::select(UniprotID,Gene.symbol,Stem.cell.cont, Sample,Prot_median_cent_l2_norm_ab_eq_med_norm=value) %>%
mutate_at(vars(c("Prot_median_cent_l2_norm_ab_eq_med_norm")),as.numeric)%>%
filter(UniprotID%in%sign_in_both_plfs) %>%
mutate(platform="ms") %>%
mutate(eq_med_norm_ab=Prot_median_cent_l2_norm_ab_eq_med_norm,.keep = "unused")
ms_for_bp.0.0
## # A tibble: 704 × 5
## UniprotID Gene.symbol Stem.cell.cont Sample Prot_median_cent_l2_norm_ab_eq_m…
## <chr> <chr> <chr> <chr> <dbl>
## 1 P04264 KRT1 High 1 -0.691
## 2 P04264 KRT1 High 2 1.03
## 3 P04264 KRT1 High 3 -0.0449
## 4 P04264 KRT1 High 4 0.336
## 5 P04264 KRT1 High 5 -1.56
## 6 P04264 KRT1 High 6 -1.71
## 7 P04264 KRT1 High 7 -2.24
## 8 P04264 KRT1 High 8 -1.70
## 9 P04264 KRT1 Low 1 0.420
## 10 P04264 KRT1 Low 2 2.50
## # … with 694 more rows
pea <- ab_list[["dat_long"]][["pea_abs_long_eq_med_norm"]] %>%
dplyr::select(UniprotID,Gene.symbol,Stem.cell.cont,Sample,Prot_median_cent_rel_npx_eq_med_norm=value) %>%
mutate_at(vars(c("Prot_median_cent_rel_npx_eq_med_norm")),as.numeric) %>%
filter(UniprotID%in%sign_in_both_plfs) %>%
mutate(platform="pea")%>%
mutate(eq_med_norm_ab=Prot_median_cent_rel_npx_eq_med_norm,.keep = "unused")
shared.long <- ms %>% bind_rows(pea)%>%
unite(UnitedIDs,Gene.symbol:UniprotID,sep="_", remove = F)
# png(paste("/Users/xrydbh/OneDrive/OneDrive\ -\ Göteborgs\ Universitet/Diff_protExp_in_highVsLow_Cd34_cordPlasma/plots_r/both/shared_prots/",abund_trans,"_boxplot_",used_pars_string,".png",sep=""), 3000,2000,res=300)
# print(ggplot(shared.long, aes(x = Gene.symbol,y= eq_med_norm_ab, fill=as.factor(Gene.symbol))) +
# geom_boxplot(alpha = .3) +
# theme(axis.text.x=element_text(angle=60, hjust=1,)) +
# theme(legend.position = "none") +
# ggtitle("my_title"))
#
# dev.off()
for (prot in sign_in_both_plfs){
ms_pea_for_bp_one_gene <- shared.long %>% filter(UniprotID==prot)
unified_prot_name <- ms_pea_for_bp_one_gene[1,] %>% pluck("UnitedIDs")
pdf(file = paste("./out_r/008_boxplot_shared_ms_pea_UnitedIDs_",prot,".pdf",sep=""),
width=3,
height=3)
print(
ms_pea_for_bp_one_gene %>% ggplot(aes(x=platform,
y=eq_med_norm_ab,
fill=Stem.cell.cont,alpha=platform)) +
facet_wrap("Stem.cell.cont") + #vars(Gene.symbol),ncol=4
geom_boxplot(outlier.size = 0) +
geom_point(position=position_jitterdodge(),alpha=0.5, size=1) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
theme_classic() +
theme(legend.position = "none") +
ggtitle(paste("Significant for both platforms \n", unified_prot_name ,sep="")) +
labs(y= "Eq_med_norm_ab", x = "Top table protein")
)
dev.off()
}
## Warning: Using alpha for a discrete variable is not advised.
## Warning: Using alpha for a discrete variable is not advised.
## Using alpha for a discrete variable is not advised.
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).
## Warning: Removed 2 rows containing missing values (geom_point).
# shared.0.0 <- sel_transform_alts.l[["ms"]] %>%
# inner_join(sel_transform_alts.l[["pea"]],by=protID) %>%
# na.omit()
# ##########################
# # end Boxplot of shared significant
# ##########################