-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add manifest validation workbook and misc updates
- Loading branch information
Showing
18 changed files
with
1,073 additions
and
130 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,174 @@ | ||
--- | ||
title: "misc investigations etc." | ||
--- | ||
***** | ||
Setup | ||
***** | ||
```{r eval=FALSE, echo=FALSE} | ||
source("~/repos/rstudio/agora-data-validation/src/utils.R") | ||
prepare() | ||
``` | ||
|
||
|
||
***** | ||
MSSM/ZHANG | ||
***** | ||
additional notes on https://sagebionetworks.jira.com/browse/AG-1252 | ||
|
||
prior to loading csv into this workbook: | ||
1. populate internal-only columns (Source, input_data, initial_nomination_) | ||
2. adjust user-supplied values as required (study, Team) | ||
3. export each sheet as csv | ||
4. remove 'empty' rows from csvs as required (new only for this one) | ||
|
||
In this workbook: | ||
1. remove genes in 'updates' from 'new' | ||
2. merge Predicted_Theraputic_Direction_* columns -> Predicted_Theraputic_Direction | ||
3. populate 'internal' columns (source, intital_nomination) | ||
4. export modified csv that can be cleanly merged with targets source file | ||
|
||
```{r} | ||
zhang_new <- read_csv("/Users/jbritton/Desktop/New nominations/from drive/Zhang/Zhang_new_noms.csv") | ||
zhang_updates <- read_csv("/Users/jbritton/Desktop/New nominations/from drive/Zhang/Zhang_updated_noms.csv") | ||
zhang_new | ||
zhang_updates | ||
``` | ||
Sanity check uniques | ||
new - no dups | ||
updates - 3 dups?? -> TODO why????!?!?!?!?! | ||
```{r} | ||
# are there dups? | ||
length(unique(zhang_new$ensembl_gene_id)) | ||
nrow(zhang_new) | ||
length(unique(zhang_updates$ensembl_gene_id)) | ||
nrow(zhang_updates) | ||
# what are the dups? | ||
n_occur <- data.frame(table(zhang_updates$ensembl_gene_id)) | ||
dup_updates <- zhang_updates[zhang_updates$ensembl_gene_id %in% n_occur$Var1[n_occur$Freq > 1],] | ||
# are the dup rows identical? -> Nope | ||
highlander <- function(x) { | ||
x |> | ||
unique() |> | ||
length() |> | ||
{\(x) x == 1}() | ||
} | ||
dup_updates |> | ||
group_by(ensembl_gene_id) |> | ||
summarise( | ||
across(everything(), highlander) | ||
) | ||
# look at the dups | ||
zhang_updates[zhang_updates$ensembl_gene_id == "ENSG00000118733",] | ||
zhang_updates[zhang_updates$ensembl_gene_id == "ENSG00000124785",] | ||
zhang_updates[zhang_updates$ensembl_gene_id == "ENSG00000147065",] | ||
``` | ||
|
||
remove targets on the updates list from the new list, because they are not in fact new | ||
```{r} | ||
keep <- zhang_new[!(zhang_new$ensembl_gene_id %in% zhang_updates$ensembl_gene_id),] | ||
dropped <- zhang_new[zhang_new$ensembl_gene_id %in% zhang_updates$ensembl_gene_id,] | ||
cat(dropped$hgnc_symbol) | ||
cat("\nnew: ", nrow(zhang_new), "\nupdates: ", nrow(zhang_updates), "\ndropped: ", nrow(dropped), "\nkeeps: ", nrow(keep)) | ||
keep | ||
dropped | ||
``` | ||
|
||
|
||
Merge Predicted_Therapeutic_Direction_* fields in 'keep' | ||
```{r} | ||
merged_ptd <- paste(keep$`Predicted_Therapeutic_Direction category`, keep$`Predicted_Therapeutic_Direction details`) | ||
merged_ptd_keeps <- keep$`Predicted_Therapeutic_Direction category` <- merged_ptd | ||
drop_cols <- c("Predicted_Therapeutic_Direction details") | ||
merged_ptd_keeps <- keep[, !(names(keep) %in% drop_cols)] | ||
names(merged_ptd_keeps)[names(merged_ptd_keeps) == "Predicted_Therapeutic_Direction category"] <- "Predicted_Therapeutic_Direction" | ||
merged_ptd_keeps | ||
``` | ||
|
||
Write new nominations csv file | ||
```{r} | ||
write.csv(merged_ptd_keeps, "/Users/jbritton/Desktop/New nominations/from drive/Zhang/Zhang_new_noms_deduped.csv", row.names=FALSE) | ||
``` | ||
|
||
|
||
***** | ||
Emory | ||
***** | ||
additional notes on https://sagebionetworks.jira.com/browse/AG-1254 | ||
|
||
prior to loading csv into this workbook: | ||
1. populate internal-only columns (Source, input_data, initial_nomination) | ||
2. adjust user-supplied values as required (Study, Team) | ||
3. export each sheet as csv | ||
4. remove 'empty' rows from csvs as required | ||
|
||
In this workbook: | ||
1. merge Predicted_Theraputic_Direction_* columns -> Predicted_Theraputic_Direction | ||
2. export modified csv that can be cleanly merged with targets source file | ||
|
||
```{r} | ||
emory_new <- read_csv("/Users/jbritton/Desktop/New nominations/from drive/Emory/12.6.23_Emory Agora Target Submission filled+11Nov23_updates.csv") | ||
emory_updates <- read_csv("/Users/jbritton/Desktop/New nominations/from drive/Emory/Emory Agora target submissions_for_empty_updates.xlsm - Nominated Target Updates.csv") | ||
emory_new | ||
emory_updates | ||
``` | ||
|
||
Sanity check uniques | ||
new - | ||
updates - | ||
```{r} | ||
# are there dups? | ||
length(unique(emory_new$ensembl_gene_id)) | ||
nrow(emory_new) | ||
length(unique(emory_updates$ensembl_gene_id)) | ||
nrow(emory_updates) | ||
# what are the dups? | ||
n_occur <- data.frame(table(emory_updates$ensembl_gene_id)) | ||
dup_updates <- emory_updates[emory_updates$ensembl_gene_id %in% n_occur$Var1[n_occur$Freq > 1],] | ||
# are the dup rows identical? -> Nope | ||
highlander <- function(x) { | ||
x |> | ||
unique() |> | ||
length() |> | ||
{\(x) x == 1}() | ||
} | ||
dup_updates |> | ||
group_by(ensembl_gene_id) |> | ||
summarise( | ||
across(everything(), highlander) | ||
) | ||
``` | ||
|
||
|
||
|
||
Merge Predicted_Therapeutic_Direction_* fields | ||
```{r} | ||
merged_ptd <- paste(emory_new$`Predicted_Therapeutic_Direction category`, emory_new$`Predicted_Therapeutic_Direction details`) | ||
merged_ptd_keeps <- emory_new$`Predicted_Therapeutic_Direction category` <- merged_ptd | ||
drop_cols <- c("Predicted_Therapeutic_Direction details") | ||
merged_ptd_keeps <- emory_new[, !(names(emory_new) %in% drop_cols)] | ||
names(merged_ptd_keeps)[names(merged_ptd_keeps) == "Predicted_Therapeutic_Direction category"] <- "Predicted_Therapeutic_Direction" | ||
merged_ptd_keeps | ||
``` | ||
|
||
Write new nominations csv file | ||
```{r} | ||
write.csv(merged_ptd_keeps, "/Users/jbritton/Desktop/New nominations/from drive/Emory/Emory_ptd_merged.csv", row.names=FALSE) | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,165 @@ | ||
--- | ||
title: "agora_stats.json" | ||
--- | ||
|
||
```{r eval=FALSE, echo=FALSE} | ||
source("~/repos/rstudio/agora-data-validation/src/utils.R") | ||
prepare() | ||
``` | ||
|
||
*** | ||
Nomination stats | ||
*** | ||
|
||
## GENE_INFO | ||
|
||
|
||
## Nomination stats | ||
|
||
# EXPECTED based on source files | ||
```{r} | ||
# new_prod = ALD current RC source, old_prod = ALD last released source | ||
# use pinned source files used to generate old_prod and new_prod json | ||
target_list_source_old_prod <- download_file("syn12540368.41", "target_list_source_old_prod", type="csv") # 3.3.0 | ||
target_list_source_new_prod <- download_file("syn12540368.47", "target_list_source_new_prod", type="csv") # dec data release | ||
``` | ||
|
||
# ACTUALS based on gene_info_old & gene_info_new | ||
```{r} | ||
old_synId <- "syn12548902" | ||
new_synId <- "syn17015359.61" | ||
sorted_old <- subset(gene_info_old %>% arrange(TRUE), select = c(ensembl_gene_id, target_nominations)) | ||
sorted_new <- subset(gene_info_new %>% arrange(TRUE), select = c(ensembl_gene_id, target_nominations)) | ||
old <- sorted_old[!is.na(replace_null(sorted_old)$target_nominations),] | ||
new <- sorted_new[!is.na(replace_null(sorted_new)$target_nominations),] | ||
``` | ||
|
||
# EXPECTED VS ACTUAL | ||
```{r} | ||
#OLD | ||
# total # expected nominations (old source) | ||
expected_noms_old <- length(target_list_source_old$hgnc_symbol) | ||
cat("\ntotal # expected nominations (old source): ", expected_noms_old) | ||
# total # expected nominated targets (old source) | ||
expected_targets_old <- length(unique(target_list_source_old$ensembl_gene_id)) | ||
cat("\ntotal # expected nominated targets (old source): ", expected_targets_old) | ||
#total actual nominations (old json) | ||
flat_noms_old <- tidyr::unnest(old, target_nominations) | ||
old_noms <- nrow(flat_noms_old) | ||
cat("\ntotal actual nominations (old json): ", old_noms) | ||
# total actual nominated targets (old json) | ||
old_targets <- length(unique(old$ensembl_gene_id)) | ||
cat("\ntotal actual nominated targets (old json): ", old_targets) | ||
# total actual # genes (old json) | ||
old_genes <- length(unique(gene_info_old$ensembl_gene_id)) | ||
cat("\ntotal actual # genes (old json): ", old_genes, "\ntotal and unique equal: ", length(gene_info_old$ensembl_gene_id) == length(unique(gene_info_old$ensembl_gene_id))) | ||
# total # expected nominations (new source) | ||
expected_noms_new <- length(target_list_source_new$hgnc_symbol) | ||
cat("\ntotal # expected nominations (new source): ", expected_noms_new) | ||
# total # expected nominated targets (new source) | ||
expected_targets_new <- length(unique(target_list_source_new$ensembl_gene_id)) | ||
cat("\ntotal # expected nominated targets (new source): ", expected_targets_new) | ||
# NEW | ||
#total actual nominations (old json) | ||
flat_noms_new <- tidyr::unnest(new, target_nominations,names_sep=".") | ||
new_noms <- nrow(flat_noms_new) | ||
cat("\ntotal actual nominations (new json): ", new_noms) | ||
# total actual nominated targets (old json) | ||
new_targets <- length(unique(new$ensembl_gene_id)) | ||
cat("\ntotal actual nominated targets (new json): ", new_targets) | ||
# total actual # genes (new json) | ||
new_genes <- length(unique(gene_info_new$ensembl_gene_id)) | ||
cat("\ntotal actual # genes (new json): ", new_genes, "\ntotal and unique equal: ", length(gene_info_new$ensembl_gene_id) == length(unique(gene_info_new$ensembl_gene_id))) | ||
``` | ||
|
||
```{r} | ||
flat_noms_new | ||
amp <- nrow(flat_noms_new[flat_noms_new$target_nominations.source == "AMP-AD",]) | ||
treat <- nrow(flat_noms_new[flat_noms_new$target_nominations.source == "TREAT-AD",]) | ||
cc <- nrow(flat_noms_new[flat_noms_new$target_nominations.source == "Community",]) | ||
resil <- nrow(flat_noms_new[flat_noms_new$target_nominations.source == "Resilience-AD",]) | ||
multiple_nom_targets <- flat_noms_new %>% group_by(ensembl_gene_id) %>% count() | ||
n_multiple_nom_targets <- multiple_nom_targets[multiple_nom_targets$n > 1,] | ||
cat("\nAMP-AD: ", amp) | ||
cat("\nTREAT-AD: ", treat) | ||
cat("\nCC: ", cc) | ||
cat("\nResilience-AD: ", resil) | ||
cat("\nMulti-nom: ", nrow(n_multiple_nom_targets)) | ||
``` | ||
|
||
```{r} | ||
unique(target_list_source_new$Source) | ||
unique(target_list_source_new$Team) | ||
``` | ||
|
||
## Poster stats | ||
```{r} | ||
nominated_targets_new <- get_subobject(gene_info_new, "target_nominations") | ||
unique(nominated_targets_new$source) | ||
amp_nt <- nominated_targets_new[nominated_targets_new$source == 'AMP-AD',] | ||
treat_nt <- nominated_targets_new[nominated_targets_new$source == 'TREAT-AD',] | ||
com_nt <- nominated_targets_new[nominated_targets_new$source == 'Community',] | ||
res_nt <- nominated_targets_new[nominated_targets_new$source == 'Resilience-AD',] | ||
cat("num amp-ad noms: ", nrow(amp_nt)) | ||
cat("num treat-ad noms: ", nrow(treat_nt)) | ||
cat("num cc noms: ", nrow(com_nt)) | ||
cat("num resilience-ad noms: ", nrow(res_nt)) | ||
print(amp_nt) | ||
print(treat_nt) | ||
print(com_nt) | ||
print(res_nt) | ||
``` | ||
|
||
|
||
## ADKP jan newsletter | ||
```{r} | ||
noms_23 <- target_list_source_new_prod[target_list_source_new_prod$initial_nomination == '2023',] | ||
cat("\ntotal '23 noms: ", nrow(noms_23)) | ||
cat("\ntotal noms: ", nrow(target_list_source_new_prod)) | ||
cat("\ntargets: ", length(unique(target_list_source_new_prod$hgnc_symbol))) | ||
unique(noms_23$Source) | ||
``` | ||
|
||
## NUM GENES w/EVIDENCE | ||
|
||
```{r} | ||
``` | ||
|
||
find all ENSGs that we have results for (RNA, RNA DE, prot (any), metabl) | ||
```{r} | ||
# rna ME | ||
keeps <- c("ensembl_gene_id", "median_expression") | ||
infos <- subset(gene_info_new, select = keeps) | ||
me <- infos[infos$median_expression != 'NULL',] | ||
rna <- unique(me$ensembl_gene_id) | ||
rna_de <- unique(rna_seq_differential_expression_new$ensembl_gene_id) | ||
metab <- unique(metabolomics_new$ensembl_gene_id) | ||
prot <- unique(proteomics_new$ensembl_gene_id) | ||
prot_tmt <- unique(proteomics_tmt_new$ensembl_gene_id) | ||
prot_srm <- unique(proteomics_srm_new$ensembl_gene_id) | ||
length(purrr::reduce(list(rna, rna_de, metab, prot, prot_tmt, prot_srm),union)) | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.