Add manifest validation workbook and misc updates

JessterB · Feb 29, 2024 · 7e02999 · 7e02999
1 parent 6bd737a
commit 7e02999
Show file tree

Hide file tree

Showing 18 changed files with 1,073 additions and 130 deletions.
diff --git a/src/adhoc/misc.Rmd b/src/adhoc/misc.Rmd
@@ -1,12 +1,25 @@
 ---
-title: "misc investigations"
-output: html_notebook
+title: "misc investigations etc."
 ---
-
+*****
+Setup
 ```{r eval=FALSE, echo=FALSE}
 source("~/repos/rstudio/agora-data-validation/src/utils.R")
 prepare()
 ```
+*****
+
+## cross file proteomics
+
+```{r}
+proteomics_new[proteomics_new$ensembl_gene_id == "ENSG00000178209",]
+proteomics_tmt_new[proteomics_tmt_new$ensembl_gene_id == "ENSG00000178209",]
+
+lfq <- proteomics_new$hgnc_symbol
+tmt <- proteomics_tmt_new$hgnc_symbol
+
+setdiff(lfq, tmt)
+```
 
 ## MSBB metadata (adportal thread)
 never mind, they shouldn't be joining MSSM to MSBB metadata...

diff --git a/src/adhoc/noms_2023.Rmd b/src/adhoc/noms_2023.Rmd
@@ -0,0 +1,174 @@
+---
+title: "misc investigations etc."
+---
+*****
+Setup
+*****
+```{r eval=FALSE, echo=FALSE}
+source("~/repos/rstudio/agora-data-validation/src/utils.R")
+prepare()
+```
+
+
+*****
+MSSM/ZHANG
+*****
+additional notes on https://sagebionetworks.jira.com/browse/AG-1252
+
+prior to loading csv into this workbook:
+1. populate internal-only columns (Source, input_data, initial_nomination)
+2. adjust user-supplied values as required (study, Team)
+3. export each sheet as csv
+4. remove 'empty' rows from csvs as required (new only for this one)
+
+In this workbook:
+1. remove genes in 'updates' from 'new' 
+2. merge Predicted_Therapeutic_Direction_* columns -> Predicted_Therapeutic_Direction
+3. populate 'internal' columns (source, initial_nomination)
+4. export modified csv that can be cleanly merged with targets source file
+
+```{r}
+zhang_new <- read_csv("/Users/jbritton/Desktop/New nominations/from drive/Zhang/Zhang_new_noms.csv")
+
+zhang_updates <- read_csv("/Users/jbritton/Desktop/New nominations/from drive/Zhang/Zhang_updated_noms.csv")
+
+zhang_new
+zhang_updates
+```
+Sanity check uniques
+new - no dups
+updates - 3 dups?? -> TODO why????!?!?!?!?!
+```{r}
+# are there dups?
+length(unique(zhang_new$ensembl_gene_id))
+nrow(zhang_new)
+
+length(unique(zhang_updates$ensembl_gene_id))
+nrow(zhang_updates)
+
+# what are the dups?
+n_occur <- data.frame(table(zhang_updates$ensembl_gene_id))
+dup_updates <- zhang_updates[zhang_updates$ensembl_gene_id %in% n_occur$Var1[n_occur$Freq > 1],]
+
+# are the dup rows identical? -> Nope
+# TRUE when `x` holds exactly one distinct value, i.e. all elements agree
+highlander <- function(x) {
+  distinct_values <- unique(x)
+  length(distinct_values) == 1
+}
+
+dup_updates |> 
+  group_by(ensembl_gene_id) |> 
+  summarise(
+    across(everything(), highlander)
+  )
+
+# look at the dups
+zhang_updates[zhang_updates$ensembl_gene_id == "ENSG00000118733",]
+zhang_updates[zhang_updates$ensembl_gene_id == "ENSG00000124785",]
+zhang_updates[zhang_updates$ensembl_gene_id == "ENSG00000147065",]
+```
+
+remove targets on the updates list from the new list, because they are not in fact new
+```{r}
+# flag the 'new' rows whose gene also appears on the updates list
+already_nominated <- zhang_new$ensembl_gene_id %in% zhang_updates$ensembl_gene_id
+
+# split 'new' into the rows to keep and the rows that are really updates
+keep <- zhang_new[!already_nominated,]
+dropped <- zhang_new[already_nominated,]
+
+# print the dropped symbols, then the row count of each table
+cat(dropped$hgnc_symbol)
+cat(
+  "\nnew: ", nrow(zhang_new),
+  "\nupdates: ", nrow(zhang_updates),
+  "\ndropped: ", nrow(dropped),
+  "\nkeeps: ", nrow(keep)
+)
+keep
+dropped
+```
+
+
+Merge Predicted_Therapeutic_Direction_* fields in 'keep'
+```{r}
+# Merge the two Predicted_Therapeutic_Direction_* columns of `keep` into a
+# single Predicted_Therapeutic_Direction column.
+ptd_category_col <- "Predicted_Therapeutic_Direction category"
+ptd_details_col <- "Predicted_Therapeutic_Direction details"
+ptd_category <- as.character(keep[[ptd_category_col]])
+ptd_details <- as.character(keep[[ptd_details_col]])
+
+# paste() turns a missing value into the literal text "NA", so only paste when
+# both parts are present; otherwise use whichever part exists (NA if neither)
+merged_ptd <- ifelse(
+  is.na(ptd_category) | is.na(ptd_details),
+  ifelse(is.na(ptd_category), ptd_details, ptd_category),
+  paste(ptd_category, ptd_details)
+)
+
+# build the result from a copy: `keep` is left unmodified, so re-running this
+# chunk cannot append the details to an already-merged column
+drop_cols <- c(ptd_details_col)
+merged_ptd_keeps <- keep[, !(names(keep) %in% drop_cols)]
+names(merged_ptd_keeps)[names(merged_ptd_keeps) == ptd_category_col] <- "Predicted_Therapeutic_Direction"
+merged_ptd_keeps$Predicted_Therapeutic_Direction <- merged_ptd
+merged_ptd_keeps
+```
+
+Write new nominations csv file
+```{r}
+write.csv(merged_ptd_keeps, "/Users/jbritton/Desktop/New nominations/from drive/Zhang/Zhang_new_noms_deduped.csv", row.names=FALSE)
+
+```
+
+
+*****
+Emory
+*****
+additional notes on https://sagebionetworks.jira.com/browse/AG-1254
+
+prior to loading csv into this workbook:
+1. populate internal-only columns (Source, input_data, initial_nomination)
+2. adjust user-supplied values as required (Study, Team)
+3. export each sheet as csv
+4. remove 'empty' rows from csvs as required
+
+In this workbook:
+1. merge Predicted_Therapeutic_Direction_* columns -> Predicted_Therapeutic_Direction
+2. export modified csv that can be cleanly merged with targets source file
+
+```{r}
+emory_new <- read_csv("/Users/jbritton/Desktop/New nominations/from drive/Emory/12.6.23_Emory Agora Target Submission filled+11Nov23_updates.csv")
+
+emory_updates <- read_csv("/Users/jbritton/Desktop/New nominations/from drive/Emory/Emory Agora target submissions_for_empty_updates.xlsm - Nominated Target Updates.csv")
+
+emory_new
+emory_updates
+```
+
+Sanity check uniques
+new - 
+updates - 
+```{r}
+# are there dups?
+length(unique(emory_new$ensembl_gene_id))
+nrow(emory_new)
+
+length(unique(emory_updates$ensembl_gene_id))
+nrow(emory_updates)
+
+# what are the dups?
+n_occur <- data.frame(table(emory_updates$ensembl_gene_id))
+dup_updates <- emory_updates[emory_updates$ensembl_gene_id %in% n_occur$Var1[n_occur$Freq > 1],]
+
+# are the dup rows identical? -> Nope
+# TRUE when `x` holds exactly one distinct value, i.e. all elements agree
+highlander <- function(x) {
+  distinct_values <- unique(x)
+  length(distinct_values) == 1
+}
+
+dup_updates |> 
+  group_by(ensembl_gene_id) |> 
+  summarise(
+    across(everything(), highlander)
+  )
+
+
+
+```
+
+
+
+Merge Predicted_Therapeutic_Direction_* fields
+```{r}
+# Merge the two Predicted_Therapeutic_Direction_* columns of `emory_new` into
+# a single Predicted_Therapeutic_Direction column.
+ptd_category_col <- "Predicted_Therapeutic_Direction category"
+ptd_details_col <- "Predicted_Therapeutic_Direction details"
+ptd_category <- as.character(emory_new[[ptd_category_col]])
+ptd_details <- as.character(emory_new[[ptd_details_col]])
+
+# paste() turns a missing value into the literal text "NA", so only paste when
+# both parts are present; otherwise use whichever part exists (NA if neither)
+merged_ptd <- ifelse(
+  is.na(ptd_category) | is.na(ptd_details),
+  ifelse(is.na(ptd_category), ptd_details, ptd_category),
+  paste(ptd_category, ptd_details)
+)
+
+# build the result from a copy: `emory_new` is left unmodified, so re-running
+# this chunk cannot append the details to an already-merged column
+drop_cols <- c(ptd_details_col)
+merged_ptd_keeps <- emory_new[, !(names(emory_new) %in% drop_cols)]
+names(merged_ptd_keeps)[names(merged_ptd_keeps) == ptd_category_col] <- "Predicted_Therapeutic_Direction"
+merged_ptd_keeps$Predicted_Therapeutic_Direction <- merged_ptd
+merged_ptd_keeps
+```
+
+Write new nominations csv file
+```{r}
+write.csv(merged_ptd_keeps, "/Users/jbritton/Desktop/New nominations/from drive/Emory/Emory_ptd_merged.csv", row.names=FALSE)
+```
diff --git a/src/other/agora_stats.Rmd b/src/other/agora_stats.Rmd
@@ -0,0 +1,165 @@
+---
+title: "agora_stats.json"
+---
+
+```{r eval=FALSE, echo=FALSE}
+source("~/repos/rstudio/agora-data-validation/src/utils.R")
+prepare()
+```
+
+***
+Nomination stats
+***
+
+## GENE_INFO
+
+
+## Nomination stats
+
+# EXPECTED based on source files
+```{r}
+# new_prod = ALD current RC source, old_prod = ALD last released source
+# use pinned source files used to generate old_prod and new_prod json
+target_list_source_old_prod <- download_file("syn12540368.41", "target_list_source_old_prod", type="csv") # 3.3.0
+target_list_source_new_prod <- download_file("syn12540368.47", "target_list_source_new_prod", type="csv") # dec data release
+```
+
+# ACTUALS based on gene_info_old & gene_info_new
+```{r}
+# Synapse ids for the old/new gene_info files; not referenced again in the visible code
+old_synId <-  "syn12548902" 
+new_synId <- "syn17015359.61" 
+
+# keep only the gene id and its nominations from each gene_info table
+# NOTE(review): arrange(TRUE) orders by a constant, so the rows presumably stay
+# in their existing order - confirm whether a sort by ensembl_gene_id was intended
+sorted_old <- subset(gene_info_old %>% arrange(TRUE), select = c(ensembl_gene_id, target_nominations))
+sorted_new <- subset(gene_info_new %>% arrange(TRUE), select = c(ensembl_gene_id, target_nominations))
+ 
+# keep the rows whose target_nominations is not NA after replace_null()
+# NOTE(review): assumes replace_null() (defined outside this file) converts
+# NULL entries to NA - TODO confirm
+old <- sorted_old[!is.na(replace_null(sorted_old)$target_nominations),]
+new <- sorted_new[!is.na(replace_null(sorted_new)$target_nominations),]
+```
+
+# EXPECTED VS ACTUAL
+```{r}
+# Print expected counts (from the target list source files) next to actual
+# counts (from the gene_info json), first for the old release, then the new.
+# NOTE(review): this chunk reads target_list_source_old / target_list_source_new,
+# while the pinned files downloaded in the EXPECTED chunk are named
+# target_list_source_old_prod / target_list_source_new_prod - confirm which
+# pair is intended here
+
+# OLD
+# total # expected nominations (old source)
+expected_noms_old <- length(target_list_source_old$hgnc_symbol)
+cat("\ntotal # expected nominations (old source): ", expected_noms_old)
+
+# total # expected nominated targets (old source)
+expected_targets_old <- length(unique(target_list_source_old$ensembl_gene_id))
+cat("\ntotal # expected nominated targets (old source): ", expected_targets_old)
+
+# total actual nominations (old json)
+flat_noms_old <- tidyr::unnest(old, target_nominations)
+old_noms <- nrow(flat_noms_old)
+cat("\ntotal actual nominations (old json): ", old_noms)
+
+# total actual nominated targets (old json)
+old_targets <- length(unique(old$ensembl_gene_id))
+cat("\ntotal actual nominated targets (old json): ", old_targets)
+
+# total actual # genes (old json)
+old_genes <- length(unique(gene_info_old$ensembl_gene_id))
+cat("\ntotal actual # genes (old json): ", old_genes, "\ntotal and unique equal: ", length(gene_info_old$ensembl_gene_id) == length(unique(gene_info_old$ensembl_gene_id)))
+
+
+# NEW
+# total # expected nominations (new source)
+expected_noms_new <- length(target_list_source_new$hgnc_symbol)
+cat("\ntotal # expected nominations (new source): ", expected_noms_new)
+
+# total # expected nominated targets (new source)
+expected_targets_new <- length(unique(target_list_source_new$ensembl_gene_id))
+cat("\ntotal # expected nominated targets (new source): ", expected_targets_new)
+
+# total actual nominations (new json)
+# names_sep prefixes the unnested columns with "target_nominations."
+flat_noms_new <- tidyr::unnest(new, target_nominations,names_sep=".")
+new_noms <- nrow(flat_noms_new)
+cat("\ntotal actual nominations (new json): ", new_noms)
+
+# total actual nominated targets (new json)
+new_targets <- length(unique(new$ensembl_gene_id))
+cat("\ntotal actual nominated targets (new json): ", new_targets)
+
+# total actual # genes (new json)
+new_genes <- length(unique(gene_info_new$ensembl_gene_id))
+cat("\ntotal actual # genes (new json): ", new_genes, "\ntotal and unique equal: ", length(gene_info_new$ensembl_gene_id) == length(unique(gene_info_new$ensembl_gene_id)))
+```
+
+```{r}
+flat_noms_new
+
+# Count nominations per source. sum(..., na.rm = TRUE) ignores rows whose
+# source is NA; subsetting rows with the bare `==` result would keep them as
+# all-NA rows and inflate every count.
+nom_source <- flat_noms_new$target_nominations.source
+amp <- sum(nom_source == "AMP-AD", na.rm = TRUE)
+treat <- sum(nom_source == "TREAT-AD", na.rm = TRUE)
+cc <- sum(nom_source == "Community", na.rm = TRUE)
+resil <- sum(nom_source == "Resilience-AD", na.rm = TRUE)
+
+# nominations per gene; count() returns an ungrouped table
+multiple_nom_targets <- flat_noms_new %>% count(ensembl_gene_id)
+# genes that have more than one nomination
+n_multiple_nom_targets <- multiple_nom_targets[multiple_nom_targets$n > 1,]
+
+
+cat("\nAMP-AD: ", amp)
+cat("\nTREAT-AD: ", treat)
+cat("\nCC: ", cc)
+cat("\nResilience-AD: ", resil)
+cat("\nMulti-nom: ", nrow(n_multiple_nom_targets))
+```
+
+```{r}
+unique(target_list_source_new$Source)
+unique(target_list_source_new$Team)
+```
+
+## Poster stats
+```{r}
+# Split the nominations in gene_info_new by source, then print the size and
+# contents of each subset.
+nominated_targets_new <- get_subobject(gene_info_new, "target_nominations")
+unique(nominated_targets_new$source)
+
+# which() drops rows whose source is NA; indexing with the bare `==` result
+# would return them as all-NA rows in every subset
+nom_source <- nominated_targets_new$source
+amp_nt <- nominated_targets_new[which(nom_source == 'AMP-AD'),]
+treat_nt <- nominated_targets_new[which(nom_source == 'TREAT-AD'),]
+com_nt <- nominated_targets_new[which(nom_source == 'Community'),]
+res_nt <- nominated_targets_new[which(nom_source == 'Resilience-AD'),]
+
+# lead each label with "\n" (as the other chunks do) so the four counts are
+# printed on separate lines instead of running together
+cat("\nnum amp-ad noms: ", nrow(amp_nt))
+cat("\nnum treat-ad noms: ", nrow(treat_nt))
+cat("\nnum cc noms: ", nrow(com_nt))
+cat("\nnum resilience-ad noms: ", nrow(res_nt))
+
+print(amp_nt)
+print(treat_nt)
+print(com_nt)
+print(res_nt)
+```
+
+
+## ADKP jan newsletter
+```{r}
+# nominations first made in 2023; which() drops rows whose initial_nomination
+# is NA, which the bare `==` index would return as all-NA rows and count
+is_2023 <- which(target_list_source_new_prod$initial_nomination == '2023')
+noms_23 <- target_list_source_new_prod[is_2023,]
+cat("\ntotal '23 noms: ", nrow(noms_23))
+cat("\ntotal noms: ", nrow(target_list_source_new_prod))
+# NOTE(review): targets are counted by hgnc_symbol here but by ensembl_gene_id
+# in the nomination stats above - confirm which identifier is intended
+cat("\ntargets: ", length(unique(target_list_source_new_prod$hgnc_symbol)))
+
+unique(noms_23$Source)
+
+```
+
+## NUM GENES w/EVIDENCE
+
+```{r}
+
+```
+
+find all ENSGs that we have results for (RNA, RNA DE, prot (any), metabl)
+```{r}
+# rna ME
+keeps <- c("ensembl_gene_id", "median_expression")
+infos <- subset(gene_info_new, select = keeps)
+me <- infos[infos$median_expression != 'NULL',]
+rna <- unique(me$ensembl_gene_id)
+
+rna_de <- unique(rna_seq_differential_expression_new$ensembl_gene_id)
+metab <- unique(metabolomics_new$ensembl_gene_id)
+prot <- unique(proteomics_new$ensembl_gene_id)
+prot_tmt <- unique(proteomics_tmt_new$ensembl_gene_id)
+prot_srm <- unique(proteomics_srm_new$ensembl_gene_id)
+
+length(purrr::reduce(list(rna, rna_de, metab, prot, prot_tmt, prot_srm),union))
+```
diff --git a/src/rules/rules_gene_info_target_nominations.R b/src/rules/rules_gene_info_target_nominations.R
@@ -43,7 +43,7 @@ rules_gene_info_target_nominations <-  validator(
   # study
   is.character(study),
   !is.na(study),
-  field_length(study, min=2, max=200),
+  field_length(study, min=2, max=500),
 
   # target_choice_justification
   is.character(target_choice_justification),

diff --git a/src/rules/rules_overall_scores.R b/src/rules/rules_overall_scores.R
@@ -24,6 +24,6 @@ rules_overall_scores <- validator(
   # overall
   is.numeric(target_risk_score),
   !is.na(target_risk_score),
-  in_range(target_risk_score, min=0, max=9)
+  in_range(target_risk_score, min=0, max=5)
 
 )
diff --git a/src/rules/rules_proteomics_distribution_data.R b/src/rules/rules_proteomics_distribution_data.R
@@ -3,7 +3,7 @@ rules_proteomics_distribution_data <- validator(
   # type
   is.character(type),
   field_length(type, min=3, max=3), 
-  type%in% c('LFQ', 'TMT'),
+  type%in% c('LFQ', 'TMT', 'SRM'),
 
   # tissue
   is.character(tissue),