diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d81fea0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ + +# Jupyter Notebook +.ipynb_checkpoints diff --git a/GSFA/GSFA_time_complexity.R b/GSFA/GSFA_time_complexity.R new file mode 100644 index 0000000..c2104f4 --- /dev/null +++ b/GSFA/GSFA_time_complexity.R @@ -0,0 +1,43 @@ +library(data.table) +library(tidyverse) +library(Matrix) +library(GSFA) +library(ggplot2) + +if (!requireNamespace("reticulate", quietly = TRUE)) install.packages("reticulate") +library(reticulate) +use_python("/usr/bin/python3") +py_discover_config() +np <- import("numpy") + +npz <- np$load("inhouse_GSFA_inputs.npz", allow_pickle=TRUE) +Y <- npz$get("array1") +G <- npz$get("array2") + +print(dim(Y)) +print(dim(G)) +print("loaded data") + +dev_res <- deviance_residual_transform(Y) +top_gene_index <- select_top_devres_genes(dev_res, num_top_genes = 6000) +dev_res_filtered <- dev_res[, top_gene_index] + +write.csv(top_gene_index, "inhouse_top_genes.csv") +rm(npz) +rm(Y) +print(dim(dev_res_filtered)) +print("processed data") + +set.seed(14314) +time_start = Sys.time() +num_cells = 5000 +fit <- fit_gsfa_multivar(Y = dev_res_filtered[1:num_cells,], G = G[1:num_cells,], + K = 20, + prior_type = "mixture_normal", + init.method = "svd", + niter = 3000, used_niter = 1000, + verbose = TRUE, return_samples = TRUE) +print(Sys.time()-time_start) +rm(G) +rm(dev_res_filtered) +saveRDS(fit, file = "fitted_inhouse_time_complexity.rds") diff --git a/GSFA/README.md b/GSFA/README.md new file mode 100644 index 0000000..2c130cf --- /dev/null +++ b/GSFA/README.md @@ -0,0 +1,8 @@ +# GSFA (Guided Sparse Factor Analysis) + +``` +Yifan Zhou, Kaixuan Luo, Lifan Liang, Mengjie Chen and Xin He. A new Bayesian factor analysis method improves detection of genes and biological processes affected by perturbations in single-cell CRISPR screening. Nature Methods. (2023). doi: 10.1038/s41592-023-02017-4. 
PMID: 37770710 +``` + +Training scripts for GSFA to use as a benchmark. +Link to paper: [paper](https://www.nature.com/articles/s41592-023-02017-4) diff --git a/GSFA/inhouse_GSFA.R b/GSFA/inhouse_GSFA.R new file mode 100644 index 0000000..835982d --- /dev/null +++ b/GSFA/inhouse_GSFA.R @@ -0,0 +1,46 @@ +library(data.table) +library(tidyverse) +library(Matrix) +library(GSFA) +library(ggplot2) + +if (!requireNamespace("reticulate", quietly = TRUE)) install.packages("reticulate") +library(reticulate) +use_python("/usr/bin/python3") +py_discover_config() +np <- import("numpy") + +# read in inputs generated by inhouse_gsfa_preprocessing.ipynb +npz <- np$load("inhouse_GSFA_inputs.npz", allow_pickle=TRUE) +Y <- npz$get("array1") +G <- npz$get("array2") + +print(dim(Y)) +print(dim(G)) +print("loaded data") + +# GSFA-specific preprocessing of gene expression +dev_res <- deviance_residual_transform(Y) +top_gene_index <- select_top_devres_genes(dev_res, num_top_genes = 6000) +dev_res_filtered <- dev_res[, top_gene_index] +# save for downstream analysis +np$savez("inhouse_GSFA_preprocessed.npz", array1 = dev_res_filtered) +write.csv(top_gene_index, "inhouse_top_genes.csv") +rm(npz) +rm(Y) +print(dim(dev_res_filtered)) +print("processed data") + +# train and save GSFA model +set.seed(14314) +time_start = Sys.time() +fit <- fit_gsfa_multivar(Y = dev_res_filtered, G = G, + K = 20, + prior_type = "mixture_normal", + init.method = "svd", + niter = 3000, used_niter = 1000, + verbose = TRUE, return_samples = TRUE) +print(Sys.time()-time_start) +rm(G) +rm(dev_res_filtered) +saveRDS(fit, file = "fitted_inhouse.rds") diff --git a/GSFA/inhouse_gsfa_preprocessing.ipynb b/GSFA/inhouse_gsfa_preprocessing.ipynb new file mode 100644 index 0000000..eb47346 --- /dev/null +++ b/GSFA/inhouse_gsfa_preprocessing.ipynb @@ -0,0 +1,249 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "id": "0c3f7ba9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already 
loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "dfbdfee5", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "sys.path.append(\"..\")\n", + "from src.Spectra.Spectra_Pert import vectorize_perts\n", + "from utils import (\n", + " filter_noisy_genes,\n", + " generate_k_fold,\n", + " inhouse_preprocess,\n", + " read_aws_h5ad,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "426b0898-25f4-492b-b77c-d9f1fc17010f", + "metadata": {}, + "source": [ + "### Get train/test splits consistent with other models" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1dd10044-66e2-41c3-a724-282c4792f6d8", + "metadata": {}, + "outputs": [], + "source": [ + "# use anndata generated by ../data_processing/inhouse_prior_graph_preprocessing.ipynb\n", + "unfilterd_adata = read_aws_h5ad(\"path to preprocessed h5ad here\")\n", + "adata = filter_noisy_genes(unfilterd_adata)\n", + "adata = inhouse_preprocess(adata)\n", + "adata.layers[\"logcounts\"] = adata.X.copy()\n", + "adata.X = adata.X.todense()\n", + "gene_network = adata.uns[\"sparse_gene_network\"].todense()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d7e163dc-2a94-41de-b7f5-f717ac47fb4a", + "metadata": {}, + "outputs": [], + "source": [ + "# powered perturbations\n", + "adata.obs[\"condition\"] = adata.obs[\"condition\"].astype(str)\n", + "adata.obs[\"Treatment\"] = adata.obs[\"Treatment\"].astype(str)\n", + "adata.obs[\"pert_treat\"] = adata.obs[\"condition\"] + \"+\" + adata.obs[\"Treatment\"]\n", + "obs_df = pd.DataFrame(adata.obs[\"pert_treat\"])\n", + "category_counts = obs_df[\"pert_treat\"].value_counts()\n", + "filtered_categories = category_counts[category_counts >= 50].index\n", + "adata = 
adata[adata.obs[\"pert_treat\"].isin(filtered_categories)]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "2e20577d-23ec-4336-a3f5-8fcae53252a8", + "metadata": {}, + "outputs": [], + "source": [ + "train_idx, val_idx, test_idx = generate_k_fold(\n", + " adata, adata.X, adata.obs[\"condition\"], fold_idx=0\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "431abcfa-4929-4c0c-8651-c1f9c35f37f2", + "metadata": {}, + "source": [ + "### Process GSFA-specific input" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "96e401f3", + "metadata": {}, + "outputs": [], + "source": [ + "# use inhouse dataset from s3://pert-spectra\n", + "adata = read_aws_h5ad(\n", + " \"s3://pert-spectra/rnaseq565.filtered.actionet.guide_corrected.h5ad\"\n", + ")\n", + "adata = inhouse_preprocess(adata)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "76e4b5e7", + "metadata": {}, + "outputs": [], + "source": [ + "# filter adata to perturbations with at least 50 samples for each treatment\n", + "adata.obs[\"condition\"] = adata.obs[\"condition\"].astype(str)\n", + "adata.obs[\"Treatment\"] = adata.obs[\"Treatment\"].astype(str)\n", + "adata.obs[\"pert_treat\"] = adata.obs[\"condition\"] + \"+\" + adata.obs[\"Treatment\"]\n", + "obs_df = pd.DataFrame(adata.obs[\"pert_treat\"])\n", + "category_counts = obs_df[\"pert_treat\"].value_counts()\n", + "filtered_categories = category_counts[category_counts >= 50].index\n", + "adata = adata[adata.obs[\"pert_treat\"].isin(filtered_categories)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1fdfbf4b", + "metadata": {}, + "outputs": [], + "source": [ + "# create binary perturbation matrix\n", + "D, pert_labels = vectorize_perts(adata, \"condition\", [\"ctrl\", \"nan\"])\n", + "pert_idx = np.array(\n", + " [\n", + " adata.var_names.get_loc(i.split(\"_\")[1])\n", + " if i.split(\"_\")[1] in adata.var_names\n", + " else -1\n", + " for i in pert_labels\n", + " ]\n", + 
")\n", + "# add ctrl one-hot-encoding\n", + "ctrl_vector = np.array([1.0 if i == \"ctrl\" else 0.0 for i in adata.obs[\"condition\"]])\n", + "D = np.concatenate([D, ctrl_vector.reshape(len(ctrl_vector), 1)], axis=1).astype(\n", + " np.float32\n", + ")\n", + "pert_idx = np.append(pert_idx, [-1])\n", + "pert_labels = pert_labels + [\"ctrl\"]\n", + "print(D.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "4ac6b398", + "metadata": {}, + "outputs": [], + "source": [ + "# subset to kfold and TNFA+ treatment\n", + "D_train = D[train_idx]\n", + "adata_train = adata[train_idx]\n", + "D_train = D_train[adata_train.obs[\"Treatment\"] == \"TNFA+\"]\n", + "adata_train = adata_train[adata_train.obs[\"Treatment\"] == \"TNFA+\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "0eb35473", + "metadata": {}, + "outputs": [], + "source": [ + "# subset further for GSFA to run without OOM issues\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "Y, _, G, _ = train_test_split(\n", + " adata_train.layers[\"counts\"],\n", + " D_train,\n", + " test_size=0.2,\n", + " random_state=42,\n", + " stratify=D_train,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "5d496dfd-bbdf-4e90-aba2-a224674d2876", + "metadata": {}, + "outputs": [], + "source": [ + "# save inputs for GSFA\n", + "np.savez(\"inhouse_GSFA_inputs.npz\", array1=Y.todense(), array2=G)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "c8d4ae72-33c8-4fe1-997b-47be95d0084b", + "metadata": {}, + "outputs": [], + "source": [ + "# save additional perturbation labels for downstream analysis\n", + "np.savez(\"rna565_G_labels.npz\", pert_labels)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c76eeff-6cbb-4d30-a2ae-a336fd6e1794", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pertspectra", + "language": "python", + "name": 
"pertspectra" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/GSFA/load_inhouse_GSFA.R b/GSFA/load_inhouse_GSFA.R new file mode 100644 index 0000000..95e5fe1 --- /dev/null +++ b/GSFA/load_inhouse_GSFA.R @@ -0,0 +1,13 @@ +fitted_rna565_GSFA <- readRDS("~/GSFA/fitted_rna565_GSFA.Rds") + +Z <- fitted_rna565_GSFA$posterior_means$Z_pm +beta <- fitted_rna565_GSFA$posterior_means$beta_pm +W <- fitted_rna565_GSFA$posterior_means$W_pm +F <- fitted_rna565_GSFA$posterior_means$F_pm +lsfr <- fitted_rna565_GSFA$lfsr + +write.csv(Z,"~/GSFA/rna565_gsfa_outputs/Z.csv") +write.csv(beta,"~/GSFA/rna565_gsfa_outputs/beta.csv") +write.csv(W,"~/GSFA/rna565_gsfa_outputs/W.csv") +write.csv(F,"~/GSFA/rna565_gsfa_outputs/F.csv") +write.csv(lsfr,"~/GSFA/rna565_gsfa_outputs/lsfr.csv") diff --git a/GSFA/load_norman_GSFA.R b/GSFA/load_norman_GSFA.R new file mode 100644 index 0000000..e06c2b6 --- /dev/null +++ b/GSFA/load_norman_GSFA.R @@ -0,0 +1,13 @@ +fitted_norman_GSFA <- readRDS("~/GSFA/fitted_norman_GSFA.rds") + +Z <- fitted_norman_GSFA$posterior_means$Z_pm +beta <- fitted_norman_GSFA$posterior_means$beta_pm +W <- fitted_norman_GSFA$posterior_means$W_pm +F <- fitted_norman_GSFA$posterior_means$F_pm +lsfr <- fitted_norman_GSFA$lfsr + +write.csv(Z,"~/GSFA/norman_gsfa_outputs/Z.csv") +write.csv(beta,"~/GSFA/norman_gsfa_outputs/beta.csv") +write.csv(W,"~/GSFA/norman_gsfa_outputs/W.csv") +write.csv(F,"~/GSFA/norman_gsfa_outputs/F.csv") +write.csv(lsfr,"~/GSFA/norman_gsfa_outputs/lsfr.csv") diff --git a/GSFA/norman_GSFA.R b/GSFA/norman_GSFA.R new file mode 100644 index 0000000..4b68e5a --- /dev/null +++ b/GSFA/norman_GSFA.R @@ -0,0 +1,46 @@ +library(data.table) +library(tidyverse) +library(Matrix) +library(GSFA) +library(ggplot2) 
+ +if (!requireNamespace("reticulate", quietly = TRUE)) install.packages("reticulate") +library(reticulate) +use_python("/usr/bin/python3") +py_discover_config() +np <- import("numpy") + +# read in inputs generated by norman_gsfa_preprocessing.ipynb +npz <- np$load("norman_GSFA_inputs.npz", allow_pickle=TRUE) +Y <- npz$get("array1") +G <- npz$get("array2") + +print(dim(Y)) +print(dim(G)) +print("loaded data") + +# GSFA-specific preprocessing of gene expression +dev_res <- deviance_residual_transform(Y) +top_gene_index <- select_top_devres_genes(dev_res, num_top_genes = 4000) +dev_res_filtered <- dev_res[, top_gene_index] +# save for downstream analysis +np$savez("norman_GSFA_preprocessed.npz", array1 = dev_res_filtered) +write.csv(top_gene_index, "norman_top_genes.csv") +rm(npz) +rm(Y) +print(dim(dev_res_filtered)) +print("processed data") + +# train and save GSFA model +set.seed(14314) +time_start = Sys.time() +fit <- fit_gsfa_multivar(Y = dev_res_filtered, G = G, + K = 20, + prior_type = "mixture_normal", + init.method = "svd", + niter = 3000, used_niter = 1000, + verbose = TRUE, return_samples = TRUE) +print(Sys.time()-time_start) +rm(G) +rm(dev_res_filtered) +saveRDS(fit, file = "fitted_norman.rds") diff --git a/GSFA/norman_gsfa_preprocessing.ipynb b/GSFA/norman_gsfa_preprocessing.ipynb new file mode 100644 index 0000000..9db834a --- /dev/null +++ b/GSFA/norman_gsfa_preprocessing.ipynb @@ -0,0 +1,242 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 10, + "id": "0c3f7ba9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. 
To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "dfbdfee5", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "sys.path.append(\"..\")\n", + "from src.Spectra.Spectra_Pert import vectorize_perts\n", + "from utils import (\n", + " filter_noisy_genes,\n", + " generate_k_fold,\n", + " read_aws_h5ad,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "15d54d79-721c-4e13-869d-b398460d7fb7", + "metadata": {}, + "source": [ + "### Get train/test split consistent with other models" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1868a84f-9b4a-4853-8658-e597e027af94", + "metadata": {}, + "outputs": [], + "source": [ + "# use anndata generated by ../data_processing/norman_prior_graph_preprocessing.ipynb\n", + "unfiltered_adata = read_aws_h5ad(\"path to preprocessed h5ad here\")\n", + "adata = filter_noisy_genes(unfiltered_adata)\n", + "adata.layers[\"logcounts\"] = adata.X.copy()\n", + "adata.X = adata.X.todense()\n", + "gene_network = adata.uns[\"sparse_gene_network\"].todense()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b62c8814-501e-4f9d-a5f9-f7ac30f16912", + "metadata": {}, + "outputs": [], + "source": [ + "# subset to powered perturbations\n", + "obs_df = pd.DataFrame(adata.obs[\"perturbation_name\"])\n", + "category_counts = obs_df[\"perturbation_name\"].value_counts()\n", + "filtered_categories = category_counts[category_counts >= 50].index\n", + "adata = adata[adata.obs[\"perturbation_name\"].isin(filtered_categories)]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "0ffce6f2-b975-40f6-ab97-0f676c5bb0d9", + "metadata": {}, + "outputs": [], + "source": [ + "# retrieve same data splits for consistency\n", + "train_idx, val_idx, test_idx = generate_k_fold(\n", + " 
adata,\n", + " adata.X,\n", + " adata.obs[\"perturbation_name\"],\n", + " fold_idx=0,\n", + " perturbation_key=\"perturbation_name\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "347d91bf-eb3f-470b-8fff-956b1dedd41a", + "metadata": {}, + "source": [ + "### Process GSFA-specific inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "96e401f3", + "metadata": {}, + "outputs": [], + "source": [ + "# use Norman dataset from https://github.com/theislab/sc-pert\n", + "adata = read_aws_h5ad(\"path to h5ad here\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "76e4b5e7", + "metadata": {}, + "outputs": [], + "source": [ + "# subset to powered perturbations\n", + "obs_df = pd.DataFrame(adata.obs[\"perturbation_name\"])\n", + "category_counts = obs_df[\"perturbation_name\"].value_counts()\n", + "filtered_categories = category_counts[category_counts >= 50].index\n", + "adata = adata[adata.obs[\"perturbation_name\"].isin(filtered_categories)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1fdfbf4b", + "metadata": {}, + "outputs": [], + "source": [ + "# create binary perturbation matrix\n", + "D, pert_labels = vectorize_perts(adata, \"perturbation_name\", [\"control\", \"nan\"])\n", + "pert_idx = np.array(\n", + " [\n", + " adata.var_names.get_loc(i.split(\"_\")[1])\n", + " if i.split(\"_\")[1] in adata.var_names\n", + " else -1\n", + " for i in pert_labels\n", + " ]\n", + ")\n", + "# add ctrl one-hot-encoding\n", + "ctrl_vector = np.array(\n", + " [1.0 if i == \"control\" else 0.0 for i in adata.obs[\"perturbation_name\"]]\n", + ")\n", + "D = np.concatenate([D, ctrl_vector.reshape(len(ctrl_vector), 1)], axis=1).astype(\n", + " np.float32\n", + ")\n", + "pert_idx = np.append(pert_idx, [-1])\n", + "pert_labels = pert_labels + [\"ctrl\"]\n", + "print(D.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "4ac6b398", + "metadata": {}, + "outputs": [], + "source": [ + "# 
subset to kfold\n", + "adata_train = adata[train_idx]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "0eb35473", + "metadata": {}, + "outputs": [], + "source": [ + "# subset further for GSFA to run without OOM issues\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "Y, _, G, _ = train_test_split(\n", + " adata_train.layers[\"counts\"],\n", + " D[train_idx],\n", + " test_size=0.70,\n", + " random_state=42,\n", + " stratify=D[train_idx],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "5d496dfd-bbdf-4e90-aba2-a224674d2876", + "metadata": {}, + "outputs": [], + "source": [ + "# save inputs for GSFA\n", + "np.savez(\"norman_GSFA_inputs.npz\", array1=Y.todense(), array2=G)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "c8d4ae72-33c8-4fe1-997b-47be95d0084b", + "metadata": {}, + "outputs": [], + "source": [ + "# save perturbation labels for downstream analysis\n", + "np.savez(\"norman_G_labels.npz\", pert_labels)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "17c84b85-395a-4b9c-adef-3b505a5b8fb0", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pertspectra", + "language": "python", + "name": "pertspectra" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..bcd8787 --- /dev/null +++ b/LICENSE @@ -0,0 +1 @@ +Copyright (C) 2024 Insitro, Inc. 
This software and any derivative works are licensed under the terms of the Creative Commons Attribution-NonCommercial 4.0 International Public License (CC-BY-NC 4.0), accessible at https://creativecommons.org/licenses/by-nc/4.0/legalcode diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..dd89917 --- /dev/null +++ b/Makefile @@ -0,0 +1,64 @@ +ENV_NAME=pertspectra +CONDA_BASE=$(shell conda run -n base conda info --base) +ENV_DIR=$(CONDA_BASE)/envs/$(ENV_NAME) + + +## Installation +.PHONY: check-conda check-conda-lock +check-conda: +ifeq (,$(shell which conda)) + $(error "This project uses conda for environment management. Please install conda before continuing.") +endif + +check-conda-lock: +ifeq (,$(shell which conda-lock)) + $(error "conda-lock command not found, install with `pip install conda-lock`") +endif + +.PHONY: conda-lock +conda-lock: check-conda + pip install conda-lock && \ + conda-lock lock -f environment.yaml -p osx-64 -p linux-64 + +.PHONY: install-env +install-env: check-conda check-conda-lock + conda-lock install -p $(ENV_DIR) conda-lock.yml && \ + conda run -p $(ENV_DIR) python -m pip install -r requirements.txt + +.PHONY: install-pre-commit +install-pre-commit: + conda run -p $(ENV_DIR) pre-commit install + +.PHONY: jupyter-kernel +jupyter-kernel: + conda run -p $(ENV_DIR) $(ENV_DIR)/bin/pip install ipykernel + conda run -p $(ENV_DIR) $(ENV_DIR)/bin/ipython kernel install --user --name=$(ENV_NAME) + +install: install-env install-pre-commit jupyter-kernel + +## Linting + +.PHONY: format +format: + ruff check --fix + +.PHONY: type-check +type-check: + pre-commit run mypy --all-files + +.PHONY: lint +lint: + pre-commit run --all-files + + +## Testing +COVERAGE_REPORT_FILE=coverage.xml +PYTEST_REPORT_FILE=report.xml + +.PHONY: pytest +pytest: + pytest -v --junitxml $(PYTEST_REPORT_FILE) $(ENV_NAME) + +.PHONY: pytest-cov +pytest-cov: + pytest --cov=$(ENV_NAME) --cov-report=xml:$(COVERAGE_REPORT_FILE) $(ENV_NAME) diff --git a/PertSpectra_load_checkpoints/pertspectra_inhouse.ipynb 
b/PertSpectra_load_checkpoints/pertspectra_inhouse.ipynb new file mode 100644 index 0000000..a57962c --- /dev/null +++ b/PertSpectra_load_checkpoints/pertspectra_inhouse.ipynb @@ -0,0 +1,432 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "65f8246a", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "bb2ff34c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Seed set\n" + ] + } + ], + "source": [ + "import sys\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "import torch\n", + "\n", + "sys.path.append(\"..\")\n", + "from utils import (\n", + " filter_noisy_genes,\n", + " generate_k_fold,\n", + " load_model,\n", + " read_aws_h5ad,\n", + " set_seed,\n", + " write_adata_to_s3,\n", + ")\n", + "\n", + "set_seed(0)" + ] + }, + { + "cell_type": "markdown", + "id": "d6389804-10c8-4626-8982-bf994a98c18b", + "metadata": {}, + "source": [ + "## Load Model from Checkpoint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5280863b", + "metadata": {}, + "outputs": [], + "source": [ + "# use anndata generated by ../data_processing/inhouse_prior_graph_preprocessing.ipynb\n", + "unfilterd_adata = read_aws_h5ad(\"path to preprocessed h5ad\")\n", + "adata = filter_noisy_genes(unfilterd_adata)\n", + "adata.layers[\"logcounts\"] = adata.X.copy()\n", + "adata.X = adata.X.todense()\n", + "device = torch.device(\"cuda:0\")\n", + "gene_network = adata.uns[\"sparse_gene_network\"].todense()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "2972c1e4", + "metadata": {}, + "outputs": [], + "source": [ + "# subset to powered perturbations\n", + "adata.obs[\"condition\"] = adata.obs[\"condition\"].astype(str)\n", + "adata.obs[\"Treatment\"] = 
adata.obs[\"Treatment\"].astype(str)\n", + "adata.obs[\"pert_treat\"] = adata.obs[\"condition\"] + \"+\" + adata.obs[\"Treatment\"]\n", + "obs_df = pd.DataFrame(adata.obs[\"pert_treat\"])\n", + "category_counts = obs_df[\"pert_treat\"].value_counts()\n", + "filtered_categories = category_counts[category_counts >= 50].index\n", + "adata = adata[adata.obs[\"pert_treat\"].isin(filtered_categories)]\n", + "labels = adata.obs[\"Treatment\"].values" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "0a65f48c", + "metadata": {}, + "outputs": [], + "source": [ + "# load model from checkpoint\n", + "s3_dir = \"s3://pert-spectra/PertSpectra_checkpoints/\"\n", + "experiment_name = \"pertspectra_inhouse/\"\n", + "model_name = \"kfold_4\"\n", + "wrapper, adata = load_model(\n", + " adata=adata,\n", + " s3_dir=s3_dir,\n", + " experiment_name=experiment_name,\n", + " model_name=model_name,\n", + " use_cell_types=True,\n", + " cell_type_key=\"Treatment\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "89206201", + "metadata": {}, + "outputs": [], + "source": [ + "# save trained PertSpectra parameters\n", + "adata.uns[\"SPECTRA_factors\"] = wrapper.factors\n", + "adata.uns[\"SPECTRA_L\"] = wrapper.internal_model.L\n", + "adata.uns[\"SPECTRA_pert_scores\"] = wrapper.cell_scores" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "4d3a17dc", + "metadata": {}, + "outputs": [], + "source": [ + "# reconstruct binarized perturbation matrix\n", + "pert_idx = list(wrapper.internal_model.pert_idx)\n", + "pert_labels = [x.split(\"_\")[-1] for x in wrapper.internal_model.pert_labels]\n", + "adata.uns[\"Spectra_pert_labels\"] = pert_labels\n", + "D = []\n", + "for i in adata.obs[\"condition\"]:\n", + " d = [0.0 for _ in range(len(pert_idx))]\n", + " # add intercept\n", + " d[-1] = 1\n", + " if i == \"nan\":\n", + " D.append(d)\n", + " continue\n", + " # control\n", + " if i == \"ctrl\":\n", + " d[-2] = 1.0\n", + " 
D.append(d)\n", + " continue\n", + " guides = i.split(\"+\")\n", + "\n", + " # pert or intergenic\n", + " if guides[0] in adata.var_names:\n", + " adata_idx = adata.var_names.get_loc(guides[0])\n", + " one_hot_idx = pert_idx.index(adata_idx)\n", + " d[one_hot_idx] = 1.0\n", + " else:\n", + " one_hot_idx = pert_labels.index(guides[0])\n", + " d[one_hot_idx] = 1.0\n", + "\n", + " if len(guides) > 1:\n", + " if guides[1] in adata.var_names:\n", + " adata_idx = adata.var_names.get_loc(guides[1])\n", + " one_hot_idx = pert_idx.index(adata_idx)\n", + " d[one_hot_idx] = 1.0\n", + " else:\n", + " one_hot_idx = pert_labels.index(guides[1])\n", + " d[one_hot_idx] = 1.0\n", + " D.append(d)\n", + "D = np.stack(D).astype(np.float32)" + ] + }, + { + "cell_type": "markdown", + "id": "1a483a3b-44d7-455b-957e-b2b89ff527ff", + "metadata": {}, + "source": [ + "## Reconstructed Gene Expression\n", + "- Visualize reconstructed gene expression for a single perturbation\n", + "- Save reconstructed gene expression for all heldout cells" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "4184472e", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "SPECTRA(\n", + " (theta): ParameterDict(\n", + " (global): Parameter containing: [torch.cuda.FloatTensor of size 4997x20 (cuda:0)]\n", + " (TNFA+): Parameter containing: [torch.cuda.FloatTensor of size 4997x5 (cuda:0)]\n", + " (TNFA-): Parameter containing: [torch.cuda.FloatTensor of size 4997x5 (cuda:0)]\n", + " )\n", + " (alpha): ParameterDict(\n", + " (TNFA+): Parameter containing: [torch.cuda.FloatTensor of size 24x25 (cuda:0)]\n", + " (TNFA-): Parameter containing: [torch.cuda.FloatTensor of size 24x25 (cuda:0)]\n", + " )\n", + " (eta): ParameterDict(\n", + " (global): Parameter containing: [torch.cuda.FloatTensor of size 20x20 (cuda:0)]\n", + " (TNFA+): Parameter containing: [torch.cuda.FloatTensor of size 5x5 (cuda:0)]\n", + " (TNFA-): Parameter containing: 
[torch.cuda.FloatTensor of size 5x5 (cuda:0)]\n", + " )\n", + " (gene_scaling): ParameterDict(\n", + " (global): Parameter containing: [torch.cuda.FloatTensor of size 4997 (cuda:0)]\n", + " (TNFA+): Parameter containing: [torch.cuda.FloatTensor of size 4997 (cuda:0)]\n", + " (TNFA-): Parameter containing: [torch.cuda.FloatTensor of size 4997 (cuda:0)]\n", + " )\n", + " (kappa): ParameterDict(\n", + " (global): Parameter containing: [torch.cuda.FloatTensor of size (cuda:0)]\n", + " (TNFA+): Parameter containing: [torch.cuda.FloatTensor of size (cuda:0)]\n", + " (TNFA-): Parameter containing: [torch.cuda.FloatTensor of size (cuda:0)]\n", + " )\n", + ")" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wrapper.internal_model.to(torch.device(\"cuda:0\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6a88483", + "metadata": {}, + "outputs": [], + "source": [ + "# retrieve heldout cells\n", + "train_idx, val_idx, test_idx = generate_k_fold(\n", + " adata, adata.X, adata.obs[\"condition\"], fold_idx=4\n", + ")\n", + "loss_weights = np.ones(adata.shape[0])\n", + "holdout_adata = adata[test_idx]\n", + "train_adata = adata[train_idx]" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "1328f936", + "metadata": {}, + "outputs": [], + "source": [ + "# visualize reconstructed vs observed expression for cells of a given perturbation\n", + "holdout_perts = \"RIPK1\"\n", + "hold_idx = [\n", + " i for i, x in enumerate(holdout_adata.obs[\"condition\"]) if x == holdout_perts\n", + "]\n", + "X_holdout = torch.from_numpy(holdout_adata.X[hold_idx])\n", + "D_holdout = torch.from_numpy(D[test_idx][hold_idx])\n", + "labels_holdout = holdout_adata.obs[\"Treatment\"].values[hold_idx]\n", + "loss_weights_holdout = torch.from_numpy(loss_weights[hold_idx])" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "9657f513", + "metadata": {}, + "outputs": [], + "source": [ + "with torch.no_grad():\n", + " loss, recon 
= wrapper.internal_model.loss(\n", + " X=X_holdout,\n", + " D=D_holdout,\n", + " labels=labels_holdout,\n", + " loss_weights=loss_weights_holdout,\n", + " forward=True,\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "d91358d1", + "metadata": {}, + "outputs": [], + "source": [ + "mean_reconstruction = torch.mean(recon[\"TNFA+\"], dim=0).detach().cpu().numpy()\n", + "observed = torch.from_numpy(\n", + " holdout_adata[\n", + " (holdout_adata.obs[\"Treatment\"] == \"TNFA+\")\n", + " & (holdout_adata.obs[\"condition\"] == holdout_perts)\n", + " ].X\n", + ")\n", + "mean_observed = torch.mean(observed, dim=0).detach().cpu().numpy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08ccadde-8c6c-494f-bfeb-feddfd668d1b", + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.stats import spearmanr\n", + "\n", + "spearmans = spearmanr(mean_reconstruction, mean_observed)" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "bd66f54b-a52a-48e7-8529-f3d2a31a15b1", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAqsAAAInCAYAAAC/aRBSAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8fJSN1AAAACXBIWXMAAA9hAAAPYQGoP6dpAABzmElEQVR4nO3deVxU5eIG8GeYYVVZZJNFZJFNNsEFVHDBLE1JjbpZqXVbNE3tV9cy7XYrS229lZiW2qIttqEWZpqGGxpqrohgLCKy7yAyLDOc3x/cmRg2BxicYXi+nw8f5cyZc96Zd+ach/e873tEgiAIICIiIiLSQQbaLgARERERUXsYVomIiIhIZzGsEhEREZHOYlglIiIiIp3FsEpEREREOothlYiIiIh0FsMqEREREekshlUiIiIi0lkMq0RERESksxhWtez8+fNYtmwZwsPD4e/vj3HjxmHZsmU4d+5cq3VjYmLg7e2NsrIyLZRUuyIjI/Hiiy9qtQw5OTnw9vZW/vj4+CA0NBRPPvlkm/XVE1588UVERkaqLPP29kZMTEyntlNYWIiYmBikpKRosngAgJ07d8Lb2xs5OTkdrqf4PPv4+OD69eutHq+pqUFISAi8vb21Xve3EhkZqfLZaP4zb948bRfvtmjrs6lrKioq8Oyzz2LMmDHw9vbG4sWL21133rx5KvUYEBCAu+++Gxs3bkR9fb3Kuopjw6effqpcdvLkSZXn+/r6YuzYsVi2bBkyMjI6fC4AyOVyrFy5Et7e3ti0aZNy+fvvv4+FCxciIiKi098NxXdT8TNs2DCMHz8eK1euRGFhodrbUcfXX3+NnTt3anSbzUVGRmLhwoU9tv3mOnOMLSkpwbvvvouoqCgEBwcjICAAd955J9544w1kZWV1et+Kz9HJkyeVyxTHzr5Eou0C9GVffvkl1q5di8DAQCxfvhxOTk7Iy8vDN998g4ceeggvvfQS5s6dq+1iUgvz5s3DjBkzIJfLkZ6ejg0bNmD+/Pn47rvvMGzYsNtenu+++w6DBg3q1HOKioqwYcMGODk5wdfXt4dKph4zMzPExsbi//7v/1SW79u3DzKZDIaGhtopWCeFhIRgxYoVrZb3799fC6W5/RYvXoz58+druxgd2rhxIw4cOIC1a9fCxcUFFhYWHa4/ePBgvPvuuwCAsrIy/PDDD/jwww+Rn5+P119/Xa19PvfccwgNDUVDQwMuXbqEjz76CImJiYiLi4O9vX2bz6mvr8e//vUvHDx4EK+88goeeugh5WPbtm2Dt7c3IiMjERsbq+YrV7Vu3Tq4u7ujtrYWf/75Jz755BOcOnUKcXFxMDMz69I2W9qxYwesrKxw7733amR7vcHFixexcOFCCIKAuXPnYvjw4TA0NMTVq1fx888/4/7778fp06e1XcxeiWFVS86cOYO1a9diwoQJ2LBhAySSv6ti+vTpWLJkCdasWQNfX1+MGDFCiyVtW21tLUxMTLRdDK1wcHDA8OHDAQAjRoyAi4sLHn30UXzzzTd444032nxObW0tjI2NIRKJNF4eRVl6q7vvvhu7d+/GsmXLYGDw98WeH3/8EVOmTEF8fLwWS6c+c3NznagLQRBQV1d327+fLi4ut3V/XZGWlgYXFxfcc889aq1vYmKiUqfjx4/H9OnTsWvXLvz73/+GsbHxLbcxZMgQ5TZGjRqFAQMG4KWXXsLOnTuxaNGiVuvX1NTg6aefxunTp/Huu+9i+vTpKo+fPXtW+T35+eef1XodLXl6eiIgIAAAEBYWBrlcjo0bN+LgwYNqvzftkUqlMDU17dY2tLn9rqqursbixYthbGyMb7/9VqUBITQ0FHPmzMG+ffu0WMLejd0AtGTz5s0QiUR49dVXVYIqAEgkErzyyisQiUTYsmVLq+cWFBRgyZIlCAkJwYgRI7B8+fJWXQP++OMPzJs3D6GhoQgMDMTEiROxdOlSSKVS5Tr19fXYuHEjpk6dCn9/f4SFhWHlypWttqW
43PLbb79h1qxZCAgIwIYNGzBr1iyVv/gV5HI5IiIisGTJkk7vq6GhAW+//TbGjRuHoKAgPPjgg7h48eIt38+GhgaMGTMGzz//fKvHqqqqEBgYiHXr1gEAGhsbsXHjRtx1110IDAzEyJEjERUVhW3btt1yP21RnIjy8vIA/H2pLSEhAStXrkRYWBiCgoKUlw737t2LBx54AMOHD0dwcDAef/xxXL58udV2d+7cibvuugv+/v6YNm0adu/e3eb+27pEVVhYiJdffhkTJkyAv78/wsPDsWzZMpSUlODkyZO47777AEB5mbHlNpKSkvDUU09h9OjRCAgIwKxZs7B3795W+z5//jzmzJmDgIAAhIeH47333oNMJuvU+xcdHY38/HwcP35cuezq1as4c+YMoqOj23xOdXU13nrrLURGRsLf3x8RERFYs2YNampqVNb7+uuv8fDDD2PMmDEYPnw4oqKisGXLFjQ0NKisp2gtv3jxIh566CEEBQVh8uTJ2Lx5MxobGzv1etpTV1eHWbNmYcqUKbhx44ZyeXFxMcaNG4d58+ZBLpcDaLqkHhwcjLS0NDzyyCMYPnw4wsLCsHr1apXvMNBU/6tXr8aOHTswbdo0BAQEYNeuXQCArKws/Otf/8KYMWOUn6Ovv/5a5fnqfB/KyspUPk9hYWGYM2cOTpw4oVynrW4AdXV1eO+991Tq6bXXXkNVVZXKeopjzNGjRzF79mwEBgZi6tSp+PHHH9V6bysqKvDqq68iIiIC/v7+mDx5Mt5//33ld05xqf3EiRPIyMhQfuabX1pVh0QigY+PDxoaGlq9BnW1PF40V1lZiX/+8584e/YsPvroo1ZBFYDKH3Sa0rJMgiDg66+/xsyZMxEYGIhRo0Zh2bJlrbrrKL43p0+fxpw5cxAUFIRVq1YhMjISaWlpOHXqlPK9Vnw22usm1Nbl7va239yBAwcQFRWFgIAATJ48Gdu3b2/1+tQ9XlRXV+Pf//43QkNDlcfmq1evqvUefv/99yguLsbzzz/f7pWuqVOnqvyu7nFWHeqc83sztqxqgVwux8mTJ+Hv79/uh9rBwQF+fn5ITEyEXC6HWCxWPrZkyRJMnToVc+bMQXp6Oj788ENkZGTg+++/h6GhIXJycrBw4UKMHDkSa9asgbm5OQoLC3Hs2DE0NDTA1NQUjY2NWLx4Mc6cOYPHH38cISEhyM3NRUxMDC5evIjY2FiVlpnk5GRkZGRg0aJFcHZ2hqmpKezs7LBmzRpkZWXB1dVVuW5CQgKKioqUl386s6+XX34Zu3fvxmOPPYZx48YhLS0NS5Yswc2bNzt8Tw0NDXHPPffg22+/xSuvvKJy6XXPnj2oq6tTlmfr1q3YsGEDFi1ahJEjR0ImkyEzM1MlQHTGtWvXAABWVlYqy1etWoWJEyfi7bffhlQqhUQiwccff4wPPvgA9957LxYtWoSGhgZ8+umnePjhh/HDDz9g6NChAJoO6CtXrsTkyZPx4osv4saNG9iwYQPq6+tvebIqLCxEdHQ0ZDIZnnrqKXh7e6O8vBwJCQmorKyEn58f1q1bh5UrV2LRokWYOHEiACg/i4mJiXjiiScQFBSEV199FQMGDMDevXvx7LPPora2Vvk+pqen49FHH4WTkxPefPNNmJiY4JtvvsGePXs69f4NGTIEI0eORGxsLCIiIgAAsbGxcHJywpgxY1qtL5VKMXfuXBQUFChfX1paGtavX4+//voLX3zxhbIFOzs7GzNmzICzszMMDQ2RmpqKjz/+GJmZmco/XhQUJ5p//vOfWLJkCQ4cOID33nsPdnZ2mDVr1i1fhyAIbQZ1sVgMkUgEY2NjZd2vWrUKMTExaGxsxPLlyyEIAt577z2V73lDQwMWLFiABx54AAsWLMC5c+ewadMm5OXl4eOPP1bZx8GDB/Hnn3/i6aefho2NDaytrZGeno45c+bAwcEBK1asgK2tLRISEvDGG2+gvLxc+cekOt+H559/HpcvX8azzz4
LV1dXVFVV4fLly6ioqOjw/Vi8eDESExOxYMECjBw5EleuXEFMTAzOnz+P7777DkZGRsr1U1NT8dZbb+HJJ5+EjY0NfvjhB7z00ksYMmQIRo0a1e5+6urqMH/+fFy/fh1Lly6Ft7c3/vzzT2zevBkpKSnYvHkz7Ozs8N133+G1117DjRs3lJf2Fd+3zsjJyYG5uTkGDhzY6ecCfx8vWj6/uLgYc+fORX5+Pj799FOMHDmyS9vXRJn+85//YNeuXZg3bx6WL1+OyspKfPTRR5gzZw5++ukn2NjYqJT7+eefxxNPPIFnn30WBgYGePLJJ7Fs2TIMGDAAr7zyCgCo1HVntLV9hZSUFKxduxZLliyBjY0N4uLisGbNGjQ0NODxxx8HoP7xQvF5PXfuHJ5++mkEBATg7NmzePLJJ9Uq5/HjxyEWizFp0iS11lf3OKsOdc75vZ5At11xcbHg5eUlPPvssx2u93//93+Cl5eXUFJSIgiCIKxfv17w8vIS1q5dq7Lezz//LHh5eQk//fSTIAiCsG/fPsHLy0tISUlpd9t79uwRvLy8hP3796ssv3jxouDl5SV8/fXXymWTJk0SfH19hczMTJV1y8rKBD8/P+G///2vyvJnnnlGGDt2rNDQ0NCpfaWnp3f4+lasWNHu6xEEQUhNTRW8vLyE7777TmX5fffdJ8yePVv5+8KFC4WZM2d2uK22XL9+XfDy8hI2b94sNDQ0CHV1dcKlS5eE6OhowcvLSzh8+LAgCIIQGxsreHl5CS+88ILK8/Py8oRhw4YJr7/+usry6upqYdy4ccIzzzwjCIIgyOVyITw8XJg9e7bQ2NioXC8nJ0fw8/MTJk2apPJ8Ly8vYf369crfV65cKfj5+Qnp6entvhbFex8bG9vqsalTpwqzZs1S1p/CwoULhXHjxglyuVwQhKbPZ2BgoFBcXKxcRyaTCVOnThW8vLyE69evt7t/Qfj781xaWirExsYK/v7+Qnl5uSCTyYRx48YJMTExgiAIwvDhw1Xq/pNPPhF8fHyEixcvqmxP8blX1ENLcrlcaGhoEHbt2iX4+voKFRUVysfmzp0reHl5CRcuXFB5zt133y089thjHb4OQWj6jnh5ebX589FHH6ms+8svvwheXl7CF198IXz44YeCj4+PkJCQoLLOihUrBC8vL2Hbtm0qyzdt2iR4eXkJf/75p3KZl5eXMGLECJXXIwiC8Nhjjwnjx48Xbty4obJ89erVQkBAgHJ9db4Pw4cPF9asWdPhOitWrFD5bB49elTw8vIStmzZ0ubrb/49nTRpkhAQECDk5uYql9XW1gqjR48WXn755Q73u2PHDsHLy0vYu3evyvLNmzcLXl5eKu/t3LlzhenTp3e4vZbrNjQ0CA0NDUJRUZHw4YcfCl5eXsKOHTtU1lUcG7Zu3apclpiYKHh5eQm//PKL0NDQIEilUuH06dPClClTBF9fX+XxWfFcxU/Lz0JHWn43bkVxbDp//rzQ0NAgVFdXC4cOHRLCwsKE4OBgobi4WDh37pzg5eUlfPbZZyrPzc/PFwIDA4W3335b5T3y8vISTpw40Wpf06dPF+bOndtuGVoeHxTvV2JiolrbnzRpkuDt7d3qPPfPf/5TCAkJEWpqagRBUP94ceTIkQ6/c82PsW2ZOnWqMG7cuA7Xabm+OsfZtt4XxbGz5Wvp6Jzf27EbgA4TBAEAWvVzjIqKUvl92rRpkEgkyssnvr6+MDQ0xMsvv4xdu3a1OdL60KFDMDc3x6RJkyCTyZQ/vr6+sLW1xalTp1TW9/b2hpubm8oyKysrREZGYteuXcpLpZWVlfj9998xc+ZMZfcGdfelKH97r+9WvL294efnpzICNSMjAxcvXlS5nBwQEIDU1FS8+uqrOHbsGKqrq2+57ebeffdd+Pn5ISAgAPfeey/y8/OxevVqTJgwQWW9O++8U+X3hIQEyGQyzJw5U+V9MDY2xqhRo5Tvw9WrV1FUVIQZM2ao1L2TkxO
Cg4NvWb6jR48iNDQUHh4enXpdQFMLS2ZmprIOmpdz/PjxKC4uVl4WO3nyJMaMGaPSyiIWi3H33Xd3er9Tp06FoaEh4uLicOTIEZSUlGD27Nltrnvo0CF4enrC19dXpXzh4eEQiUQqn93Lly/jqaeeQmhoKHx9feHn54cVK1ZALpe3Gplra2uLwMBAlWXe3t5tXq5ty4gRI/Djjz+2+lF0uVC4++678eCDD+Kdd97Bpk2bsHDhQowbN67Nbbb8LsyYMQMAWl2+DgsLUxksVFdXh8TEREyZMgUmJiat6rGurg7nz58HoN73ITAwELt27cLGjRtx/vz5Vt0o2pKYmAgArVqIpk2bBjMzM/zxxx8qy319feHo6Kj83djYGK6urrd8/xMTE2FmZtbqEqtivy330xlpaWnw8/ODn58fwsPD8dFHH2HhwoWYM2eO2tt49tln4efnh6CgIDz88MOQy+VYv349fHx8VNYLDw+HkZER3nzzzR6f8eUf//gH/Pz8EBISgoULF8LGxgZbtmyBjY0NDh06BJFIhHvuuUflc2NjYwMfH59W5wYLC4s2r4BoSkfb9/T0bPU+zpgxA9XV1UhOTgag/vGivfOP4junSZ05zqpDnXN+b8duAFpgZWUFU1PTW07tk5ubC1NT01YjVm1tbVV+l0gksLS0VF6Sc3FxwRdffIGtW7di9erVqKmpweDBgzFv3jw88sgjAIDS0lJUVVXB39+/zX2Xl5d3uE+F6Oho7N+/H8ePH0dERAT27NmD+vp6lROUuvtSlL+916eO6OhorF69GhkZGfDw8EBsbCyMjIxUDjgLFy6EmZkZfv75Z3z77bcQi8UYOXIkli9frhx00JH58+fjnnvugYGBAczNzeHs7NzmwKmWr6OkpAQAWoUXBcXlLcX70TwEKtjY2CA3N7fD8pWXl7c7yvhWFGV866238NZbb7W7faCpvtorY2eZmZnh7rvvRmxsLBwdHTF27Fg4OTm1uW5paSmuXbsGPz+/DsuXl5eHhx9+GG5ubli1ahWcnJxgbGyMixcvYvXq1aitrVV5XlufMSMjI9TV1an1GgYMGKDW5wdo+pzu2LEDhoaG7Y6gl0gkrbqWKD5TLS+/t/ysVVRUQCaT4csvv8SXX37Z5vYV75M634f3338fmzZtwo8//ogPP/wQZmZmmDJlCp5//vl2jw0VFRWQSCStLneLRCLY2Ni0eg1dff8Vn8OW30Fra2tIJJIOuyrciouLC/773/9CEATk5eVh06ZN+OSTT+Dt7d1mf9K2LF++HGFhYRCLxbCysoKDg0Ob640dOxbz58/HkiVLMH/+fGzbtg3W1tZdLntH3nrrLXh4eEAikcDa2hp2dnbKx0pLSyEIAsaOHdvmcwcPHqzye3v1rykdbb+j44+i3tU9Xig+r+19527F0dERf/zxB2pqam45o0JnjrPqUOec39sxrGqBWCxGaGgojh07hoKCgjb7rRYUFCA5ORnjx49X6ccGNPXhaR5GZDIZKioqVA72I0eOxMiRIyGXy3Hp0iXlNFk2NjaYPn06rKysYGlpia1bt7ZZxn79+qn83t4o9vDwcNjZ2WHnzp2IiIjAzp07ERQUpNIXTN19Kcrf3utTx4wZM/Dmm29i165dePbZZ/Hzzz/jjjvuUAn8EokE//znP/HPf/4TVVVVOHHiBN5//3088cQTOHz48C379wwaNEitUNLyPVMcBNevX6/SgtSSYj3FAa25tpa19fyuzpmo2PfChQsxZcqUNtdRtLBbWlp2uYxtiY6Oxg8//IArV64o+xS2V0ZjY2OsXbu23ceBpn6cNTU1iImJUQm+qampXSqfptTU1OCFF16Aq6srSktL8dJLL6nMo6kgk8lQXl6ucvIsLi4G0DrYtfysmZubQywWY+bMmW0OggQAZ2dnAOp9HwYOHIiXXnoJL730EvLy8hAfH4/33nsPpaWlreYHVbC0tIRMJkNZWZlKYBUEASUlJWoH+1uxtLTEhQs
XIAiCyvtQWloKmUzWKnx0hrGxsbKcgYGBCA0NxYwZM7B27VpMnDix1XGyLYMHD1b7tU6YMAEbN27E008/rQysXfnj71Y8PDzaLZOVlRVEIhG+/vrrNvuZtlzW2RlOFDMotJyrtr1w1tH2Ozr+KL4j6h4vFJ/X9r5ztxIeHo6EhAQcOnToln/IdOY4q65bnfN7O3YD0JIFCxZAEAS8+uqryhHACnK5HK+++ioEQcCCBQtaPTcuLk7l919//RUymQyjR49uta5YLEZQUJCyk7vi0sjEiRNRUVGBxsZGBAQEtPpxd3dX63UoToiKAR6XLl1qNYJb3X2FhoZ2+PrUYWFhgTvuuAO7d+/GoUOHUFxc3O6IcqDppD516lQ89NBDqKiouGWrZXeEh4dDIpEgOzu7zfdBcfJwc3ODra0t9uzZo+wKAjS1tKtz84Hx48fj5MmTyMzMbHcdxQmnZeuiu7s7XF1dkZqa2m4ZFYPXQkND8ccff6icMORyeZdHswYHByM6OhpTpkxp9wAONH2erl+/DktLyzbLpwhhipNc85OrIAj4/vvvu1Q+TXnllVeQn5+PDRs2YM2aNYiPj8cXX3zR5rotvwuKwWttfdebMzU1RWhoKC5fvqyc0L7lT1shTp3vg6OjI+bOnYuxY8e2OYuFguLSbcvplfbv34+amhqNXToeM2YMampqcPDgQZXlitkzNHmJ2srKCv/6179QUlKCr776SmPbbS4iIgIbN27E9evXMX/+fLXDkqZMnDgRgiCgsLCwzc+NupPRGxkZtTq+AFD+4XjlyhWV5V2Zoi4tLa3VH5979uxBv379lC2p6h4v2jv/qDtg9L777oOtrS3eeeeddhsLfvvtNwCdO852Vnvn/N6OLataMmLECKxatQpr167FQw89hIcffhiOjo7KmwJcuHABq1atQkhISKvnHjhwAGKxWDla/sMPP4SPjw+mTZsGoGky5sTEREycOBEODg6oq6tTTh6tuLQzffp0xMXFYcGCBZg3bx4CAwNhaGiIgoICnDx5EpMnT+4wMDQXHR2NLVu24F//+hdMTExa9VlUd18eHh645557sG3bNkgkEowdOxZpaWn49NNPO/XFjY6Oxt69e/H6669j0KBBrS5nPfXUU/D09IS/vz8GDhyI3NxcbNu2DU5OThgyZIja++ksZ2dnLFu2DB988AGuX7+O8ePHw9zcHCUlJUhKSoKpqalyrtFnnnkG//73v/H000/jH//4B6qqqrBhwwa1WlmeeeYZHD16FHPnzsXChQvh5eWFGzdu4NixY3j00Ufh4eEBFxcXmJiYIC4uDh4eHjAzM4OdnR3s7e3x2muv4cknn8Tjjz+O2bNnw97eHpWVlcjIyEBycjLWr18PAFi0aBHi4+PxyCOP4Omnn4aJiQm+/vrrbk2V0l7rR3OPPPIIfvvtN8ydOxePPvoovL290djYiPz8fCQkJOCxxx5DUFAQxo4dC0NDQzz33HN44oknUF9fjx07dnR5yqFbqaqqUvYDbc7IyEh5s4gffvgBP//8M9atWwdPT094enpi7ty5ePfddxESEqLSZ9bQ0BCff/45ampqEBAQoJwNYPz48WqNFH/ppZeUx5YHH3wQTk5OuHnzJrKzsxEfH6+c4udW34cbN25g/vz5mDFjBtzd3dGvXz8kJSXh2LFjHR4jxo0bh/DwcLz77ruorq5GSEgIrly5gvXr12PYsGGYOXNmJ9/hts2aNQtff/01VqxYgdzcXHh5eeHMmTP45JNPMGHChHYvZ3dnf59//jk+++wzPPzwwz1y04fw8HBs2rRJeaOFbdu2KS/Vnzp1StmnVS6XIzc3Vzl/5+jRo7s8S4HCiBEj8MADD2DVqlW4dOkSRo0aBVNTUxQXF+PMmTPw8vJqt7W+OS8vL/zyyy/Yu3cvnJ2dYWxsrPzDyc3NDW+//TbkcjnMzc1x8OBBnDlzptNltbOzw6JFi7BkyRLY2tri559/xvHjx7F8+XLlFTJ
1jxfh4eEYNWoU3nnnHUilUvj7++Ps2bP46aef1CrLgAEDsHHjRixcuBCzZs3Cww8/jODgYBgaGuLatWv4+eefkZqaqhzLoO5xVh3qnPN7O4ZVLZo3bx4CAgLw2Wef4a233kJFRQUsLCwwYsQIfPPNN+0OpomJiUFMTAx27NgBkUiEyMhIrFq1StmC5Ovri+PHjyMmJgbFxcUwMzODl5cXNm3ahPDwcABNf31t2rQJ27dvx08//YTNmzdDLBZj0KBBGDVqFLy8vNR+HW5ubggODsa5c+cQFRWFAQMGqDzemX2tWbMGNjY22LVrF7788kv4+voiJiYGzz33nNrlGTt2LBwcHJCfn4+nnnqq1VRPoaGh2L9/P3744QdUV1fD1tYWY8eOxeLFi3v8bkkLFy6Eh4cHtm/fjl9++QX19fWwtbWFv78/HnzwQeV6999/P4CmaYWWLFkCJycnLFy4EKdPn241wKEle3t7/Pjjj1i/fj22bNmCiooKWFlZYcSIEcpLY6ampli7di02bNiAxx9/HA0NDViyZAmWLl2KsLAw/PDDD/j444+xdu1aVFVVwdLSEh4eHso/iICmk9Hnn3+Ot956CytWrICFhQXuuece3HXXXXj55Zc1/+b9j5mZGb7++mts3rwZ3333HXJycmBiYgIHBweVvq4eHh6IiYnBBx98gKVLl8LS0hIzZszAo48+qvZ0NJ1x9uxZPPDAA62W29vb4+jRo7hy5QreeOMNzJ49W6VP94oVK3Du3Dn83//9H3bv3g1zc3MATWH1448/xhtvvIFNmzbBxMQE999/P1544QW1yjN06FDs3LkTGzduxAcffICysjIMGDAAQ4YMURkMeKvvgyAICAwMxE8//YTc3FzIZDI4ODjgySefxBNPPNHu/kUiETZu3IiYmBjs3LkTH3/8MSwtLTFz5kw899xzXZ7KqCVjY2Ns374d77//PrZu3arss/3YY4+pzPWsKQYGBli+fDkWLFiAL774okf2ATSF/Y8//hhPPfWUMrDa29sjJiZG5Rhw6tQp5e/bt29XthB2x+rVqxEUFITvvvsOO3bsQGNjI+zs7Fr9QdWRpUuXori4GP/+979x8+ZNODk5IT4+HmKxGB9//DFef/11vPLKKzAyMsL06dPxn//8p80riR3x9fXFvffei5iYGGRlZcHOzg4rV67Eo48+qlxH3eOFgYEBNm3ahHXr1mHr1q1oaGhASEgINm/erHLc60hgYCDi4uLwxRdfYN++fdi6dSvkcjkcHBwQFhamclxU9zir7vtwq3N+bycSml9nJCIirXvxxRexf/9+tbp9EBHpO/ZZJSIiIiKdxbBKRERERDqL3QCIiIiISGexZZWIiIiIdBbDKhERERHpLIZVIiIiItJZejfP6pUrV1BTUwOJRO9eGhEREZHeaGhogEgkandeeQW9a1mtr6/H7R4zJgiCVvZLmse61B+sS/3ButQPrEf9oam6FARBrW3oZfOjoaGh8j7rt0NNTQ1SUlIwdOhQmJmZ3bb9kuaxLvUH61J/sC71A+tRf2iqLpOSktRaT+9aVomIiIhIfzCsEhEREZHOYlglIiIiIp3FsEpEREREOothlYiIiIh0FsMqEREREekshlUiIiIi0lkMq0RERESksxhWiYiIiEhnMawSERERkc5iWCUiIiIincWwSkREREQ6i2GViIiIiHQWwyoRERER6SyGVSIiIiLSWQyrRERERKSzGFaJiIiISGcxrBIRERERZPJGbRehTQyrRERERH1cXEImnlhzAHEJmdouSisMq0RERER9mEzeiNj4NJRW1iI2Pk3nWlgZVomIiIj6MInYANGRnrC2MEF0pCckYt2KhxJtF4CIiIiItCsq3B3TxrjqXFAF2LJKRERERIBOBlWAYZWIiIiIdBjDKhERERHpLIZVIiIiItJZDKtEREREpLMYVomIiIhIZzGsEhEREZHOYlglIiIiIp3FsEpEREREOothlYiIiIh0FsM
qEREREekshlUiIiIi0lkMq0RERNQnyOSN2i4CdQHDKhEREem9uIRMPLHmAOISMrVdFOokhlUiIiLSazJ5I2Lj01BaWYvY+DS2sPYyOhVWb968ifHjx8Pb2xtJSUnaLg4RERHpAYnYANGRnrC2MEF0pCckYp2KP3QLEm0XoLmNGzdCLpdruxhERESkZ6LC3TFtjCuDai+kMzWWkZGBb775BkuXLtV2UYiIiEgPMaj2TjpTa2vWrMGcOXPg5uam7aIQERERkY7QibC6b98+pKam4umnn9Z2UYiIiIhIh2i9z6pUKsWbb76J5557Dv3799fINgVBQE1NjUa2pQ6pVKryL/VerEv9wbrUH6xL/cB61B+aqktBECASiW65ntbD6qZNm2BtbY17771XY9tsaGhASkqKxranrqysrNu+T+oZrEv9wbrUH6xL/cB61B+aqEsjI6NbrqPVsJqbm4vPPvsMH330EaqrqwFA2SJaU1ODmzdvol+/fp3erqGhIYYOHarRsnZEKpUiKysLrq6uMDU1vW37Jc1jXeoP1qX+YF3qB9aj/tBUXaanp6u1nlbDak5ODhoaGrBgwYJWj82fPx9BQUH4/vvvO71dkUgEMzMzTRSxU0xNTbWyX9I81qX+YF3qD9alfmA96o/u1qU6XQAALYdVX19fbN++XWVZSkoK1q1bh9deew0BAQFaKhkRERER6QKthlVzc3OEhoa2+Zifnx/8/Pxuc4mIiIiISJfoxNRVRERERERt0fpsAC2FhobiypUr2i4GEREREekAtqwSERERkc5iWCUiIiIincWwSkREREQ6i2GViIiIiHQWwyoRERER6SyGVSIiIiLSWQyrRERERKSzGFaJiIiISGcxrBIRERGRzmJYJSIiIiKdxbBKRERERDqLYZWIiIiIdBbDKhERERHpLIZVIiIiItJZDKtEREREpLMYVomIiIhIZzGsEhEREZHOYlglIiIiIp3FsEpEREREOothlYiIiIh0FsMqEREREekshlUiIiIi0lkMq0RERESksxhWiYiIiEhnMawSERERkc5iWCUiIiIincWwSkREREQ6i2GViIiIiHQWwyoRERER6SyGVSIiIiLSWQyrRERERKSzGFaJiIiISGcxrBIRERGRzmJYJSIiIiKdxbBKRERERDqLYZWIiIiIdBbDKhERERHpLIZVIiIiItJZDKtEREREpLMYVomIiIhIZzGsEhEREZHOYlglIiIiIp3FsEpEREREOothlYiIiIh0FsMqEREREekshlUiIiIi0lkMq0RERESksxhWiYiIiEhnMawSERERkc5iWCUiIiIincWwSkREREQ6i2GViIiIiHQWwyoRERER6SyGVSIiIiLSWQyrRERERKSzGFaJiIiISGcxrBIRERGRzmJYJSIiIiKdxbBKRERERDqLYZWIiIiIdBbDKhERERHpLIZVIiIiItJZDKtEREREpLMYVomIiIhIZzGsEhEREZHOYlglIiIiIp3FsEpEREREOothlYiIiIh0FsMqEREREekshlUiIiIi0lkMq0RERESksxhWiYiIiEhnMawSERERkc5iWCUiIiIincWwSkREREQ6i2GViIiIiHQWwyoRERER6SyGVSIiIiLSWQyrRERERKSzGFaJiIiISGcxrBIRERGRzmJYJSIiIiKdxbBKRERERDqLYZWIiIiIdBbDKhERERHpLIZVIiIiItJZEm0X4NixY/jkk0+Qnp6O6upq2Nvb44477sCSJUswYMAAbRePiIiIiLRI62G1srISwcHBeOSRR2Bubo60tDTExMQgLS0Nn332mbaLR0RERERapPWwOmPGDMyYMUP5e2hoKIyMjPDyyy+jsLAQ9vb2WiwdEREREWmTTvZZtbS0BADIZDLtFoSIiIiItErrLasKcrkcMpkM6enp+OijjzBp0iQ4OTlpu1hEREREpEU6E1YnTZqEwsJCAEBERAT++9//dnlbgiCgpqZGU0W7JalUqvIv9V6
sS/3ButQfrEv9wHrUH5qqS0EQIBKJbrmeSBAEoVt70pDU1FTU1NQgPT0dGzduhIuLCz7//HOIxeJObScpKQn19fU9VEoiIiIi0hQjIyMEBAR0uI7OtKz6+PgAAEJCQjBs2DBER0fjwIEDmDp1aqe3ZWhoiKFDh2q6iO2SSqXIysqCq6srTE1Nb9t+SfNYl/qDdak/WJf6gfWoPzRVl+np6WqtpzNhtTlfX1+IxWJkZ2d36fkikQhmZmYaLtWtmZqaamW/pHmsS/3ButQfrEv9wHrUH92tS3W6AAA6OhvAuXPnIJfL4ezsrO2iEBEREZEWab1ldcmSJfD394e3tzdMTEyQmpqKrVu3wtvbG3fccYe2i0dEREREWqT1sBoYGIi9e/di8+bNEAQBTk5O+Mc//oHHH38cRkZG2i4eEREREWmR1sPqggULsGDBAm0Xg4iIiIh0kE72WSUiIiIiAhhWiYiIepxM3qjtIhD1WgyrREREPSguIRNPrDmAuIRMbReFqFdiWCUiIuohMnkjYuPTUFpZi9j4NLawEnUBwyoREVEPkYgNEB3pCWsLE0RHekIi5mmXqLO0PhsAERGRPosKd8e0Ma4MqkRdxG8OERHpLH25bM6gStR1bFklIiKdFJeQidj4NMye4IHBA9S7hzgR6R/+qUdERDqn+cCkXUcyIJYYartIRKQlDKtERKRzmg9Mmj3BA3JZg7aLRERawm4ARESkkxQDk+rrapGSUqnt4hCRlrBllYiIdBYHJhERjwJEREREpLMYVomIiIhIZzGsEhEREZHOYlglIiIiIp3FsEpEREREOothlYiIiIh0FsMqEREREekshlUiIiIi0lkMq0RERESksxhWiYiIiEhnMawSERERkc5iWCUiIiIincWwSkREREQ6i2GViIj0hkzeqO0iEJGGMawSEZFeiEvIxBNrDiAuIVPbRSEiDWJYJSKiXk8mb0RsfBpKK2sRG5/GFlYiPcKwSkREvZ5EbIDoSE9YW5ggOtITEjFPb0T6QqLtAhAREWlCVLg7po1xZVAl0jP8RhMRkd5gUCXSP/xWExEREZHOYlglIiIiIp3FsEpEREREOothlYioD+GUTkTU2zCsEhH1EZw0n4h6I4ZVIqI+gJPmE1FvxbBKRNQHcNJ8IuqteFMAIqI+gpPmE1FvxCMWEVEfwqBKRL0Nj1pEREREpLMYVomIiIhIZzGsEhEREZHOYlglIiIiIp3FsEpEREREOothlYiIiIh0VqfmWY2MjIRIJFJ7/d9//73TBSIiIs2SyRs5ZRUR9VqdCqujR49WCauJiYkoLi5GcHAwbG1tUVxcjHPnzsHOzg6hoaEaLywREXVOXEImYuPTEB3piahwd20Xh4io0zoVVt98803l/3fv3o2zZ8/it99+g6Ojo3J5bm4uHnvsMYwePVpzpSQiok6TyRsRG5+G0spaxMan8e5VRNQrdfmotWXLFixdulQlqAKAk5MTnn76aWzevLnbhSMioq6TiA0QHekJawsTREd6MqgSUa/UqZbV5rKzszFgwIA2H7OwsEBubm6XC0VERJoRFe7OFlUi6tW6fPRycnLCjz/+2OZj33//fasWVyIi0g4GVSLqzbrcsrpgwQKsWrUK9913H2bMmAEbGxuUlJRgz549SE5OxhtvvKHJchIRERFRH9TlsHrvvfcCAD744AOVgVe2trZ4/fXXER0d3f3SEREREVGf1uWwCjQF1tmzZyMzMxMVFRWwtLSEu7t7p+ZiJSIiIiJqT7fCKgCIRCJ4eHhooixERERERCq61es+IyMDzz33HMLDw+Hv74/k5GQAwIYNG5CYmKiRAhIRERFR39XlsJqSkoL77rsPp06dwujRoyGXy5WP3bx5E99++61GCkhEREREfVeXw+q7774Lb29vHDhwAG+//TYEQVA+FhgYiKSkJI0UkIiImsjkjdouAhHRbdflsHr27Fk88cQTMDU1bTWgSjGNFRERaUZcQiaeWHMAcQm
Z2i4KEdFt1a0+q4aGhm0ur6yshJGRUXc2TURE/yOTNyI2Pg2llbWIjU9jCysR9SldDqve3t44ePBgm48dO3YMfn5+XS4UERH9TSI2QHSkJ6wtTBAd6ck7UhFRn9Llqavmz5+Pf/3rXzA1NcXMmTMBAPn5+UhMTERsbCzWr1+vsUISEfV1UeHumDbGlUGViPqcLofVu+++G9nZ2diwYQO+/PJLAMDSpUshFouxbNkyREZGaqyQREQEBlUi6pO6dVOAp556CrNmzcKxY8dQWloKKysrhIeHw8nJSVPlIyIiIqI+rMth9fTp0xg2bBgGDRqE+++/X+Wxmzdv4vLlyxg1alS3C0hEREREfVeXrynNnz8fGRkZbT529epVzJ8/v8uFIiIiIiICuhFWm98EoCWZTAYDA/atIiLqLE5LRUSkqlPdAKqrq1FVVaX8vbi4GHl5eSrr1NbWYteuXbCxsdFMCYmI+oi4hEzExqchOtITUeHu2i4OEZFO6FRY/eKLL/DRRx8BAEQiEZYsWdLmeoIgYOHChd0vHRFRH9Fy4v/2pqmSyRs5KwAR9SmdCqvjxo2DmZkZBEHAO++8g7lz58LR0VFlHSMjI3h5eWH06NEaLSgRkT5TTPwfG5+G6EltT/zPllci6os6FVaDg4MRHBwMAJBKpbj//vthb2/fIwUjIuprosLdYWokwY7fUgERVAKpui2vRET6pstHuiVLljCoEhFpkEzeiK/2paCoXIrY+DSVwVa85SoR9VVdnmd13bp1KCkpwXvvvdfqseXLl8PW1hYrVqzoVuGIiPoSla4AbQRS3nKViPqiLh/x4uPjER4e3uZj4eHhiI+P73KhiIj6qqhwd2x9aUq7fVIZVImor+nyUa+wsLDd26o6OjqioKCgy4UiIurLGEiJiP7W5SOiqakp8vPz23wsLy8PxsbGXS4UERERERHQjbAaHByMzz//HA0NDSrLGxoasG3bNuWsAUREREREXdXlAVaLFi3Cww8/jBkzZuC+++6Dvb09CgoKEBsbi7y8PLz22muaLCcRERER9UFdDqtBQUHYtGkTVq9erTIjgIuLCzZt2oTAwECNFJCIiIiI+q4uh1UAiIiIwIEDB5CVlYWysjIMHDgQrq6uGioaEZFu4i1PiYhun26FVQVXV1eGVCLqE3jLUyKi26tTYfX06dMYNmwY+vXrh9OnT99y/VGjRnW5YEREukLRkspbnhIR3X6dCqvz5s3D999/j8DAQMybNw8ikajN9QRBgEgkQkpKikYKSUSkLS1bUpvfYYqIiHpep8Lq9u3b4eHhofw/EZE+a6slVXHL032JWXhizQF2ByAi6mGdCqujR49u8/9ERPpIIjZQaUltfsn/x997tjsAB3ERETXRyACr7vj1118RFxeH5ORkVFZWYvDgwXjwwQcxZ84cGBjwQE1E2hUV7o47R7vA2Ojvw2VHIVYTOIiLiOhvnQqrK1euVHtdkUiEtWvX3nK9zz//HI6OjnjhhRdgbW2NkydPYs2aNbh+/TpWrFjRmeIREWlce8FR0R2gJ1pUOYiLiOhvnQqrJ0+eVPn9xo0buHHjBiQSCSwtLVFRUQGZTIYBAwbA3NxcrW1+/PHHGDhwoPL3sLAw1NTU4Ouvv8azzz4LIyOjzhSRiEhjbhUceyJE9nSrLRFRb9OpsBofH6/8/8WLF7F06VK88sormDZtGsRiMeRyOfbu3Yt33nkH77//vlrbbB5UFXx9fVFXV4eKigrY2dl1pohERBqjreDYU622RES9UZf7rL711lt47LHHMGPGDOUysViMqKgolJaWYu3atfj222+7tO0zZ87A0tIS1tbWXS0eEZFGaCs4MqgSETXpclhNTk7GkiVL2nzMy8sLH3zwQZe2m5SUhJ07d+Lpp5+GWCzu0jYEQUBNTU2XntsVUqlU5V/qvViX+kPTdVmvka1QV/B7qR9Yj/pDU3WpmJf/VrocVvv3748TJ05gzJgxrR47ceIE+vfv3+l
tFhcXY9myZQgICMCTTz7Z1aKhoaFBKzckyMrKuu37pJ7ButQft6suRSIRJBJDyGQNEAThtuyzr+H3Uj+wHvWHJupSnbFJXQ6r99xzDz799FPIZDJERUXBxsYGJSUliIuLw7Zt2/Doo492ans3btzAk08+CRMTE2zatAmGhoZdLRoMDQ0xdOjQLj+/s6RSKbKysuDq6gpTU9Pbtl/SPNal/rjddXngdB52HcnA7AkemDLKscf315fwe6kfWI/6Q1N1mZ6ertZ6XQ6rzz33HMrKyvD555/jiy++UC4XBAH33HMPnnvuObW3VVdXh0WLFqGkpATfffcdrKysulosAE0tHGZmZt3aRleYmppqZb+keaxL/XE76lImb8SuIxkorazFriMZmB7uzj6nPYDfS/3AetQf3a1LdboAAN0IqxKJBG+++SYWLFiAxMREVFZWwtLSEqNHj1beklUdMpkMzzzzDFJTU/HVV1/Bycmpq0UiIuoxHd1RitNNERH1nG7fwcrd3R3u7l2/w8rq1atx6NAhPP/886itrcX58+eVjw0dOrRLfV+JiDRJnTtKcbopIqKe0a2wWl9fj507d+LUqVOoqKjAf/7zH7i6uuLgwYPw9vbG4MGDb7mNhIQEAMA777zT6rHt27cjNDS0O0UkIuqWztxRikGViEjzuhxWy8rK8MgjjyAtLQ02NjYoLS3FzZs3AQC///47EhIS8Oqrr95yO81vNEBEpGt4iZ+ISLu6HFbfeecdVFVVITY2Ft7e3vD391c+Fhoaii1btmikgERE2sZL/ERE2tPlI+/hw4exbNky+Pn5tRrNZW9vj4KCgm4XjoioM2Tyxh7bNoMqEZF2dPnoW11dDUfHtucSlMlkkMvlXS4UEVFnxSVk4ok1BxCXkKntohARkQZ1Oaw6OzurjNxv7uLFi3Bzc+vqpomIOqXlIChFC6tIJEJjI+8mRUTUm3U5rEZFRWHLli04ePCg8taCIpEIFy9exPbt2zFz5kyNFZKIqCOKQVDWFiYqg6CuFIqx5L8JbG0lIurFujzA6sknn8TZs2exZMkSWFhYAAAef/xxVFRUICIiAvPnz9dYIYmIbqXlIKjGRgH7TxeoNeUUERHpri6HVUNDQ2zZsgV79+7F4cOHUVpaCisrK0ycOBHTp0+HgQFPCkR0ezUPowYGItw1ahD2ny7glFNERL1Yl8JqbW0tHn30USxbtgzTp0/H9OnTNV0uIqJu87aXI+q5cPTv30/bRSEioi7qUlODiYkJ/vrrL4jFYk2Xh4hIYwRBgIGBqEentCIiop7V5etiwcHBuHjxoibLQkSkUSKRCAdO53FKKyKiXqzLYXXFihX47rvvsHv3buVtVomIdIlEYohdRzJaTWlFRES9R5cHWD3wwANoaGjAypUrsXLlSpiYmKjcyUokEuHMmTMaKSQR9R0yeaPGBkPJZA2YPcEDu45kcJAVEVEv1eWwOnXqVE2Wg4gIcQmZiI1PQ3SkJ6LC3bu9PUEQMGWUI6aHuzOoEhH1Up0Oq7W1tTh48CDc3NxgZWWFyZMnY+DAgT1RNiLqQ1rehUqT86IyqBIR9V6dCquFhYWYO3cucnJyIAgCRCIR3n77bWzZsgXDhw/voSISUV+guAuVomWVAZOIiIBOhtUPPvgAhYWFWLRoEYKCgnDt2jV8/PHHePXVV7F79+4eKiIR9RUt70LVXYaGRhrZDhERaU+nwuqJEyewcOFCPP3008plLi4uWLRoEUpKSmBjY6PxAhJR36KpoHrgdB52HcnD7AmmmDlhqEa2SUREt1+nzgolJSUYNWqUyrLRo0dDEASUlJRotGBERF0lkzcqp6zadSSDU1YREfVinQqrcrkcJiYmKsuMjY2VjxER9TR1gqdEbIDZEzxgbWGC2RM82P+ViKgX6/RsAJmZmSq3WVWE1MzM1neH8fPz60bRiIhUdWZqqymjHOFqJYXnUMfbVDoiIuoJnQ6rK1eubHP5Cy+8oPy/YqaAlJS
UrpeMiKiZrkxtJWuov02lIyKintKpsLpu3bqeKgcRUYc4tRURUd/UqbA6e/bsnioHEdEtaXpqKyIi0n084hNRr8KgSkTUt/CoT0REREQ6i2GViIiIiHQWwyoRERER6SyGVSIiIiLSWQyrRNSjeKtTIiLqDoZVIuoxcQmZeGLNAcQltL7DHRERkToYVomoR7S845Q6LaxshSUiopYYVomoRyjuOGVtYaLWHae62grLgEtEpN86dQcrIqLOUPeOUy1bYdW9S1VcQqby9qtR4e6aKjYREekQtqwSUY9SJ3R2thUW6Fo3AyIi6n3YskpEOkHdVlgFRcBVtKzyNqxERPqJYZWIdEZnA2dnAy4REfU+PMITUbdp8xI8gyoRkX7jUZ6IbqmjMLrneCZWbUzAnuOcS5WIiDSPYZWIOtTRlFIyeSNSMstQVC5FSmYZBzkREZHGMawSUbvUGXGffLUUpZW1SL5aqoUSEhGRvmNYJaJ23WpKqa5MOUVERNQZnA2AiDp0qxH3HJFPREQ9iWcXImql5eX+lkH0Vo93tC4REVFnMKwSkYqOBlSp83hX1yUiImoLwyoRKcnkjdh9OB3GhmLsPpzeqlW0M7c45e1QiYhIExhWiUhJIjZAxHAn1DXIETHcqVsDqjj4ioiINIEDrIhISSZvxOGzOSitrMXhszmYO823VcjszIAqDr4iIqLu4hmEiAA0BVV1W0MVy9W5tM+gSkRE3cGWVSJC3LFMxB5KQ3Skp9qtoXEJmYiN//s5REREPYFNHkR93MHT2Yg9pDoQ6lZBlYOniIjodmFYJerFuhsSZfJG7NifCi8Xq6ZL/5PUGwjFwVNERHS7sBsAUS+licvwErEBZk0cit2H0zF3qi/uGO2i9nM5eIqIiG4HhlWiXqDlpfmWl+G7Exq7EzoZVImIqKfxTEOk49q6C5SmL8MzdBIRka5iyyqRDuuoBZWX4YmIqC/gWY5Ih92qBbWjoMoR+kREpA/Yskqk47rSgqoYfHXfZE9MDWt6rjpTUhEREekahlWiXqAzIVPRdaCyug4pmWXYdSgdEcOdcPhsDifwJyKiXofNLER6RtF1wHOwJZKvlkIiNsDhszmcwJ+IiHolhlUiPaIIolHh7li7OBzRkZ6QyRsxMcSZE/gTEVGvxG4ARHqi5U0CJGIDlf6uc6f5MqgSEVGvwzMXkR5oOcVV80v9ioDKoEpERL0Rz15EekDTNwkgIiLSFewGQKRjujrFFG8SQERE+ohnNSId0tatVTuDQZWIiPQNz2xEOqKjfqdERER9FcMqkY7QdL9Thl0iItIH7LNKpEM66nfamb6sbd1ulYiIqDfiGYxIx7QVLNvqy9pey2nL2612pw8sERGRtjGsEum4tvqydjQQq+XtVtkHloiIejOGVaLbrLOhURE+7axMMXeaLwDcciBW89utcu5VIiLqzdhnleg2anlLVHVFhbsDAvDVrymQ1slw32RP/Ph7WochtOXtVomIiHojnsGIbpPuTE0lkzci9tDfz50a5oqtL01RK/AyqBIRUW/GsxjRbdKdqanaei5DKBER9QXsBkDUw5pPOdWdy/KauqTf1du5EhERaQPPWEQ9qK1R+90Jit0Nmd29nSsREdHtxrBK1EN07fapulYeIiIidTCsEmmYIgSq9DOdpPmpo7o6BRansiIiot6EfVaJNKjl1FRR4e4wNZJgx2+pgAgdjt7vTF/SPcczcfRsDsaHOGPGuM5NgcWprIiIqDfhGYtIQ9q6zC6TN+KrfSkoKpd2eOm9M31JZfJGpGSWoahcipTMsi61sBIREfUWPGsRaUh700vd6tJ7V/qSKm6jmny1tCdeChERkc5gNwAiDWrrMntby5pf8lcEWkX3gVu1fHZ2/ZY4dRUREfUmDKtEGtZWEJSIDZQhMS4hE7sPp+PBu3xwxygXAJ3vS9rVvqddvd0rERGRtrB5haiHNL+cr+yTeiwTcUcz4OFsia9+TUHcsa7Pv9qVFlVOXUV
ERL0NwypRD2g+YEolJB5Kw7y7h+Gv7HLl77crNHLqKiIi6o3YDYBIw1q2YE4b44r7Jnsqp5qKGO6Eiht1iD3UtT6n3cGpq4iIqLfReli9du0aPv30U1y4cAFpaWlwd3fHnj17tF0soi5rawCUIABF5VIIQtM6URHumDZWO6GRQZWIiHoTrYfVtLQ0HDlyBEFBQWhsbISgOJsT9WLNWzDbamlVTGtFREREHdP62TIyMhJHjhzB+vXr4efnp+3iEHWb4mYALaemYl9RIiKiztN6y6qBAU/cpD8UU0P5uVnD132g8lao7fUV5ZynREREHeNZkkhDml/uT75aiqNnc1RG+rcMpZ25xSoREVFfpfWW1Z4gCAJqampu2/6kUqnKv9R7Na/LxkYBBgaiTj1/9gQP7DqSAT83a3gPsUR9XS3q21ivsVFQ6cc6abh9p/dFHeP3Un+wLvUD61F/aKouBUGASHTrc59I0KERTS+++CIuXbrUrdkAkpKSUF/fVjwgUo9IJMKVQjH2ny7AXaMGwdtervbAP5FIBImhEQBA1lDf7vO6sw8iIiJ9YWRkhICAgA7X0cuWVUNDQwwdOvS27U8qlSIrKwuurq4wNTW9bfslzZNKpcjNzcP+03korazF/tMFiHouvEdaPX18gKiIoWxR7SH8XuoP1qV+YD3qD03VZXp6ulrr6WVYFYlEMDMzu+37NTU11cp+SbNksgbl5fzoSE/079+PA6F6MX4v9QfrUj+wHvVHd+tSnS4AgJ6GVaLuEAQBU0Y5Ynq4OyRiA+UI/+hIT0SFu2u7eERERH2K1sOqVCrFkSNHAAC5ubmorq7Gvn37AACjR4/GwIEDtVk86oMM/9fntKMJ/RUtrWxxJSIi6llaD6ulpaV45plnVJYpft++fTtCQ0O1USzqow6czsOuI3mYPcEUMycMbfPWqYqW1okhzjh2PhezJg5liysREVEP0XpYdXZ2xpUrV7RdDOqjmreMyuSN2HUkA6WVtdh1JEPZDaC9W6cePpsDI0OxSosrERERaRbPrtRntTUpv5+bNawtTODnZq1c1t6tUyeGOEMub+QtVImIiHqQ1ltWibShvb6o3kMsUVxRAx9XSwBoc3BV85bWudN8NRJU2feViIiobTw7Up/UvIU0OtJTufyOkY6YO3EgGhuBVRsTVAJt81unNt9Od/G2q0RERO1jWCW91Va4bC4q3B1bX5oCkQjKsNjY2HQXqZ+PZaK6pgF+7tbKQKsIppoMly1beG9VZiIior6GYZX0UmcC5Y+//x0WV39+BlfyBIQHOaGmTgZ7KzNsfWmKsguAJsOl4tJ/8xZedgUgIiJSxTMj6Z22AmV7oVLR79TOyhR+btZIu16B03+V48i5HJRW1iL+zPVW62siXDYP04oWXk5/RURE1BrDKumdloFyX2JWu62scQmZ+OrXFMyeOBQhvnYYaG6CEZ6WmD3Bo91A2t1w2VaYZosqERFR2zgbAOklxYh9AFi47iCMDcXYfThduUwiNkBdvQyx8WmorK5DSlYZkjNLMXu8O1wsquHt/fftVtvSnXDZ1o0GiIiIqG0Mq6S3FCEwYrgTDp/NwcQQZ+xLzMKPvzfdferExTxMDHHGlWtNQbW0sha7jmbiudlOKs/vCc2nvyIiIqL28UxJvYY6g5lariOTN+Lw2RxUVtfh8tVSHD+fq7z7FEQinLiYh5WPjEKAh02rmwH0NAZVIiKiW+PZknoFdUb3t7WORGyA+yZ7YmyAI4rKpfAeMhB2VqaYGOIMCAJGDRuEZf89Aot+RnCwNoP3EEvIGupvx0siIiIiNTCsks5TZ7qojtaZGuaK5KulyhbVBbMD8OgMP3zw3AQcv5iH0spaJFzMw8uPh+KOkY4QBOF2vjwiIiLqAMMq6Tx1povqaB25vBHRk5oeG+lrj827khB3LBNmJkaYGOIMawsTTAxxhpmJ0e18WURERKQGDrCiXqHlgKTm0z0p/j9tjGurdb7Zl4r4M9c
ROWIwnpzlj89+TkZZVS2Onc/BtLGueHSGHx680xti9h8lIiLSSQyr1Gs0v92pYtonkQjK0f3Hzudi9qShmBrmil//yEJsfBr83KxRWV2H+DPX8cAUL5RX1SHlahmSr5bi1z+yEBXujt9OZSu3NzlkkJZfJRERETXHsEq9Ssu+qXZWpsq+qKbGYqRkluHo2RwUlUtRWlmL5Kul8BxsiWFu1jA2kmDKaBfsOpyufP6do11UtjdpuL22XyIRERE1w2uf1Ov4uVk3TTPlbo3xwc6wszLFjHFusDY3Qeq1MlTX1CunopoY4gyxgQi2A00Rl5CJhW/+jojhTrCzMkV0pCeMjSQqfV0NDETafnlERETUDFtWqVeRiA3g6z4QxRU18HUbiBnj3GFiJMGO31Ixe+JQlFRIcfhsDmytTPHJi5Ox+O14FJVLIW8UlK2th8/m4JMXJ8PYqOnj37w/bE1NTat98naoRERE2sOwSjqvZVicMc4dU8OawmXcsUzEHkqDl4sVjl/IRX5pDUoraxH/53U42vTHhGBnxJ+5Dn93G1hZmCj7piqCqkJ7YbR5/9iocPcefZ1ERETUGpuLSKfFJWRi4bqDOHg6W2W5RGzQ1H/1UFN/07+yyzF5lItyiiovFyvs+C0Vl7NKYWQoxpFzOZg2xhVbX5qiduhUZ35XIiIi6lkMq6SzZPJG7D6cDg9nS+zYn4qDp1oHVsU8qRFBjoiNT0PljVrMneqLjJwKzJo4FOHDnVDfIMesiUMhERt06nK+OvO73g4MyURE1JexGwBpVVvzpSpIxAZ48C4f7NifCg9nS3y1LwXSOhmiItyVAe7Y+VwYGYpxIikfpsYS5JZU4/47vDBxhLNyW4ouA+3ttyMt53e93dgNgYiI+jq2rJLWxCVk4ok1BxCXkIk9x//+f3N3jHLBg3f64K/scpRW1mL3kXTEHWta99c/sjB70lDUN8gxxt8Rw9yskZJVjh2/XWkVetvbrzq02aLKbghERNTXMaySVrQMYkfP5rQbyu4Y7YL7Ij3h62qFB+/yUfZTjY1Pw5RRLrCzMkVhWRVOXS5Qjvavq5eptV9dDoC60g2BiIhIm3j2I61oGcTG/6/vaXuhTABQVC7FhbRijPYbpFzX2EiCiGBnXCu4iXGBjsq5VZuP9m8eSHtbAIwKd+/UoDAiIiJ9wz6rpDUt+4NOGeXSKmQqR/3/rzU0Kb0E4YGOsLMyhajFdvafzILvECvYDjRVbqOtPp/a7ofaWb2lnERERD2BZ0HSKkUQU9xdStGPtGW/UkVr6HAvW/xxKR8pWeX4sdllfLm8ET8cTEPCxXz8cLBpeUeX/NsKgLrcJYCIiKivYlil265lKGwZKqtr6lR+X7UxASIAn7w4GUvuH45ZE4aqXMaPS8jE4rfjldNYKZZ35pK/IhwfOJ0Hkaj1LVc7CrIMuURERD2H3QDotmpvKiY/N2skXy2Fn5s13v7yT4wLdMTxi3nwc7PGiaQ8FJVLkXAhF+OCnBAV4Y47Q5u6DDQPusfO56rcRhVQ75J/823sOpKBZ2c5qlXmWz1GRERE3ceWVepRzVsd27ssvy8xCwZiEeysTGFgIIKZqRFOXy7AhuUTYWAggkV/Y/i5WaOyuh67DzdNXaXoMtC89XTWxKGtbqMK3LrPZ/NtzJ7gAbms4ZZlvtVjREREpBkMq9RjFJfW9xzPVA6Wan5ZHgDq6mX4+UgGqqrrYGAgQsKFXKRmleHO0CHob2YMb1cr2FmZYoCZEeob5K2mrpLJGzUyYl6xjSmjHCEIgnJ5R10JetvMAkRERL0RuwFQj1DcKtXMWIwrWeX48fe/L5UrRu5/+O05JGWUYEyAA04lFyDM3wHF5VLMHO+Bu8KGAGi6+9TUMFcAwBOz/CERG0BaJ1NeelcERE0ERYnYAPVtLO+oK0Fvm1mAiIiot+EZlnqERGyACSHOqKmTo7FRQGV1nbIlVC5vREZ2OZIySlBaWYs/kvJhZGiA4xfzsPG
FSFTcqMPSdw/hs58vNd2p6kSWcsCUplpSu/J6uvIYERERdQ9bVqlHHDydjfg/r6O0shap18owNtABIb722H8yC5czypB6rQyjhw3CqcsF8HOzRlmVFHePcwIAHDmXgxG+9jh6Prfpcv+hNJiZSFDTrEVV3aCq6H5AREREvRPDKnVZe0FQJm/Ejv2p8HKxQkZOBUb52iMxuQAQRDAxFiP1WhmKyqU4k1qIYW5WcB3UH88+FKLc1rQxrvj1jyzl8yOGO+FyZjHOXClR9lVV59I7R+oTERH1fmxyoi5pPmm/YhS84l+J2ACzJg5FRk4F7hg1GInJBSitrEXy1VJcL7yB0P/dLnXWxKHwcR2IfSezcfhsjvL59032wszxHsjKq8TYAAccPpsDI0NDBHjYqD2YiSP1iYiI9ANbVqnTWgbByxklcLDpjyPncjBr4lBEhbsjKtwdk0c6Y/fhdJXL/QYGIhw4dQ1ujha4c7QLFr8dDw9nS3z1awouZZRg6GBLzBjnjnvGe+CO0YOx+O1DTf1aL+XD1FgMOytTTBvjessyKkbqtxyIRURERL0Lz+DUac2nbBo9bBD+ul6BwrIamBqLcexcUwvpF3uSsfjtQxAEEc6kFsLUWIyK6lr0M5Ggn6kRbC3NcPDPbDx4lw/+yi5HaWUtzv9VjKPNWljNTIyUd6UK83dAfUMjIoKd1Q6e2hiIRURERJrFllVSi6J/quJfkQjwc7PCmdRCFJVLIW8sxbggRyQm5ePQn9dx4mIeSitrEX/mOkL9BuFEUj6cbAfg7JUijA1wwIFT15B8tRRbX5oCaa0MsYfS4OdmDV/3gSph9NEZfnjwTm+IxQZ4YqZ/p1tI2aJKRETUuzGs0i3FJWRi9+F0RAx3wuGzOYie5IkTF3ORnlOB0X4OkDeW4o5Rg5GaVQZPFyt8vT8V44c7IfFSPqaPdYWBpOnuVNI6GYorpDiRlA83RwvYWpr97+5VwCBrM4T42GHyKJdW+2/rrlRERETUNzAFUIcU/VONDcU4fDZHOZWU7xArhPk7Nt0OtZ8hpPUy5JfWwKe/CSqr63D8Yh7uGe+OXYcz4OdmjbTrFai4UYepY4ZggJkhLmeW4WpeBVKvlcHawgQpWeUoKK3BhBBnAGwRJSIioiZMBHRL0ZGekMkblf1HFf1UGxsFWPY3hruTJY5fyIeHsyVSr5XBc7AlwgIGYdfhDOUsAAHu1gjxsccfSQWoqZXDxd4cNXVyRAx3woQRzspR/vsSs5SzDBARERExrFK7Dv6ZjZc/Pg4DA+DBu3xw8lI+Ft0XqOynmny1FLnFlTj3VzFKK2vxV3Y5po11hdhABKt+hgjxtoO1hQn83K0xPripW0BpZS0SLuQhPbcCpZW1OHw2B3eFumLrS1MwbYwrfvyd000RERHR3xhWqU1xxzLx1d4U2Fv3w1/XKnD07HV4DLbCp7svYVqYKxyszTB5xGA0NAjwc7OGtYUJhnvZIircHQMtTJCRV40LacUYG+gAkYEIh8/mIDzQUdkya21h2tSaOslTeSvV5rMMcLopIiIiAthnldogkzci9lCacjqpsQEO8HV1xA+/p8HD2RJ7TlxFmP8gHD6XgzEBDpAYGMDOyhQ25ib47VQ2isulKCqXorSyFicvFSBgqA1yim/CysIU44IGofxGA/7KLsfcqb64Y7TqgKqocPcO707F26cSERH1LTzrUysSsQHunTgU1hYm8HKxwsnkApy4kIcJwc7KOVETLxVALDbAH0n5MDOVoKa2AclXSxEbn4areZUY7mULawsTPHiXD87/r5tAUnoJ3J2scCWrDA/e6dMqqDbff1ua3zWLiIiI+ga2rFIrB09n49cTVzEzwh2nLhfAe4gVbMxN8Vd2GQI9bXExrRjhQU64XlAJe+v+2HsiC8Fethhs3x9GhhKkXC1DUnoJJo8ajIkhzpDWyRAbnwY/d2tI62T4ZOUdnW4dbXnXrI5aX4mIiEh/8GxPKgOZ4o5l4qt
fU+Drbo2aOhmKyqUwlIghiATkFN9EY2MjpoUORvmNWkjr5Th1uQCllbU491cx0rIrYGoiRvLVUhSVS/H76euQyxtx52gXDLI2w4mLefjx97QulZH9WYmIiPomtqz2cXEJmYiNT0N0pCemjXFF7KE0VFbXobFRwIFT2fBysUJpRQ2KygSYGTe1msrljUjJKkdldR3GBjgi+WopRg8b1HRbVVND+LlbIzmzFH7u1jhwKhs/xqdhYogzisulmDVxaJeD5q36sxIREZH+4Vm/D2k5FVTLS+tyeSPuGDUYYf6DkJReAmNDMbLyKuEyyAL5pTVwc7TAGH8HWPU3hZ+bNSz6G8Pe2gyfvDgZ1wurUFZViwt/FWNMgAMcrM1gNcAYP/5v+4fP5mDjC5GICnfv1mtgUCUiIupbeObvI1oOTlKMqldcWvdzs8aWny+hsEyKjJxKjB42CHUNcowf7owTSXnKyf3/ul6OzPwKZOZVYpC1GR66ywe/ncxGfmkNxgY6YkKwM7b+dAn21v1w+Gy2cqBVdKSnRm+byjlYiYiI+gZ2A+gDmreg7j6cDqFRwM7D6cpL/1eySnEtvxIW/Y2RU3wTxoZinLpcAGNDMY6ez8HYAEecSMqDn5s1BAgwlIjR2CggItgJh8/mKKe5Ss4sRbFVjXLKq/HDB+NkckGbU1R1R/OuC91tqSUiIiLdxpbVPqB5C+q0Ma7YeThdGVy/+S0VlzLL4OtmjeGe1vD3sIYgCAj1a2pZDfa2Q51cjv6mEpRVSVFVXQdjQzGKyqUwNpRgx/5U+LgOhK+rFYI8bWE/sB+sLUwQObIpqBaVS/HVvhSNtYS27LrAFlYiIiL9xrDaR0SFu2PjC5OQnFmCsAAH2FmZ4sE7vRF/+jpKK2tx6nIhGuSAo7UZ5kzxwsnkplH+J5MLYN3fGNVSGbxcrGA5wFg5A8CO/amYPXEoxAYiFJVLIW9shIGBCKbGYiScz8WsCUM1PnqfswIQERH1LewG0EfsScjElWvlMDMxQmJSPqaEuqC+vgFjAxxwIikfXi5WOHQmB37u1vjt1HXl6P4wfwdkF1XC2bYfruZWwszMCH5u1ki+WopZE4di6hhX5SCqSxmlmDvNFxfSipWX6KeN1fzofc4KQERE1HcwrPYBB09n4+i5HABQ3gb1wMls2FmZwtGuP6aGDcGBU9kI9RuEE0n5/2tpLYDvECtk5lZA3ijAc7AVaupkuJhWjDH+DvjkxckQiw2ULZ2KPqR3jHLBxBBnZZDUdKBUDAxjUCUiIuobGFb1XF29DN8fuAJ3Z0sYig3gbD8AOYU3YG/dDxnXK5CUVoIbgwbgjpGDUVAu/XuOVDdrGBiIYGNlhoTzuSgql+KJmf7ILboBB7t++O1Utsogp+YtnT0VJDmwioiIqO9h85Qe23M8Ey9/cgLhQU64llcJL2dz9DeRoKhciv4mhqhrkGOEjz1c7M1x6GwO/souR8WNWkSOcIa1pTFsLIxh0c8YFv2N4edmjYrqOqxdHI67Ql1bDXLq6ZZODqwiIiLqmxhW9YxM3qj8SblahqJyKQrLa/DOsggUV9bi6PmmOVP/uJQPsdgApy4XIDmrFCN97SGtk8HZbgCOnMtFfUMjBEGE8htS2FmZ4kTS37dK1cYgJw6sIiIi6pvYDUCPKC6T+7lbY9hgCyRnlirnP333qz/hYm+J4V62OP9XMcYGOOBkcgGGe9nCUGKA6joZ7gp1wcHT11FULkXipQKMDXRAalY5RvjYo6hcqhIStTHIiQOriIiI+h6GVT1RVy9DbHwabkrrIYKAG7Uy5ah9Pzdr2FmZoaiiBqlZZZg8cjD+vFyAeyLc8cvxq/AaYgVbC1McPZcLTxcrWFuYwN3RAicvNc2Teia1UDmgqjlthEYGVSIior6FYbUXU/TbPHo+B5nZVRg/3An1DY1ITM6HWCxGP1MJ7KxMYWAgQoNMrmxpPXw2B0Getth1JAPeLlZN3QUsTDDSdxDKq2tRVC7
FUGcrBHvb4c+UQsyaOBQHTmfjx987P7jpdvRnJSIiIv3FsNpL/XIiE5k5lbheeAP2A/uhn4khruVVwsbSDHX1DTAUi3AmtQihfoNw4NQ1mPczxpgAB/yRlI+pYUPwy4kslFbW4kp2OcYFOWLv8asA/p7a6kRSHgZZm2HuVF9MHOGMJ9YcUA5uUvdSPEfvExERUXexyasXkskbkV90E7V1chSVS9HYKCCnqAoezhZwdzDH41H++DO1CEXlUpxIyoebowWGOlvi/JUiTAh2woW0Yvi5WcPawgR+btawtTCBRX9jjA92xsQQZ1hbmGD0sEHIyKnAV/tSAKDTg5s4ep+IiIg0gS2rvYwi9GXkViC/tAallbVIvVaGsQEOOHYhr2l+VLEIEUFOuJJdhqFOlhgyaAC+j09DqN8gHDmXi8rqOoQHOcHOyhTGxmL8mVKI+dN8ceDUNRSVS2FkKMaZ1EK4OVogItgZADBtjGunBje1vFkAuwIQERFRVzCs9iJ7jmfi6NkcTBzpjHFBTsjKq8SfqUWYFOKMg39ebxr5f7UUvkOsIEBAUbkUtlZmKKuqxYxwV5SW1ytvr9rPxBDeQwZCLhPgYm+Bbw9cQXGFFGMDHJF8tRTRkZ6YNsYV+09m4f1vziqXdeZyPkfvExERUXcxRfQSdfUyZFyvQFG5FLmFN2HeT4yhg83hO8QK+05eRajfIOXl+7+uV6Ciug5mxhKkZpXhQkYJ0q5XQGIowsnkAozys8fZK4Uoq6zB1fwK5BRVYWygIyz6G8PXfSC2vjRFGUqPnMlB8tXSLl/OZ1AlIiKi7mDLai8Ql5CJY+dylIOfjl/MQ1Z+JXyHWOOv6xUI9hoEUyMJfIdY4UxqIYrKpZA3lsLOyhT+DjYwFBtggJkE+05mo7SyFqeTCzE2wAE3ahrg62qNB6Z4wdhIgrnTfFXCpURsgPEhzkjJLFO2rDJ8EhER0e3EsKrDFK2YsfFpqKtvwISQwUi8VIAwPwfU1DXg4J/Z8HKxQklFDZIySlBZXYexAY6QNzbNrXoiKQ9F5VJEjXXH3sSrCPN3QOKlfIR426Gw7CZqamUoqajBy5+cQESwc5uX+GeMc8fUMFcAbCUlIiKi249hVUclXMzFvuNXMSbQEaOHDYKZiQTHzufCyFCMawWVysFVf2WX4+4xrrApqkZSRgkaBQFONv1gZiKBRX9j+LlZo7hSiqqbdTh9uQC+Q6xgb2WKC2nF8BkyEAYGIiRcyEVRubTd/qUMqURERKQtDKs66Mff/8Kvf2TBZ8hApOdWwtPRHOfTihEV4Y69x6/CxsoMQ50tcexCHsICHHAxvRg2A/thkLUZKm7UYqCFKdwcB+BaQRVOJOXBor8xRvrYw8DAAD6uVoiK8MCsiUMhFhtgX2IWkjJKeImfiIiIdBLDqo7Z8Vsq9ideg7eLFTJzKxAe5ITsopuwsTTD7iMZCPMfhMzr5SivlODO0S7IL61BTvFNWPQ3wbAh1qiXyVFWXYe9x7MQ4GmLonIpxgY4IL+kGk52A2AgFgEAjI2aql5xmZ9BlYiIiHQRE4oO2Rmfhv2J15R3lrov0gsJF3Lh62KFE0n5KK2sbeqzGuiEnOKbqK2XIymjRDll1dm/CpBwMQ8l5TW4M9QVZ1IK4WTbH3+mFMLESIJfjl/FDwdbj+hnUCUiIiJdxZSiI+rqZUjJKsVwL1tYW5ggPNARB09fQ7C3Hb49eAVjAxyUU1Pt+yNLOStAiLcdrC1MMDbAETdqZPBzs4bXYCskZRRj1LBByC2uRtR4D/gNtYFFf2Ne7iciIqJehd0AdESDTA4Xe3Ok5ZRj3lRvnE8vRX5pDQZamMJuoBkuphXjvkke+OV4Fkb42uOPpHwEDLXBH0m5GGxvDpdB/ZCZZ4rMvErUN8hhZGiAzLwqfPLiZOUlf17uJyIiot6GyUWLFJfjt+6+hE92J6HsRi2yC6tx9koJAGCAqSEqb9Siv5khhnvZYd8f1zAhxBlJacW
YEuqCpPQShHgPgquDBX46kokhg8whrZNh9LBBqG9oxKyJQ5VBFeDlfiIiIup9mF5uM0VAPfhnNt784iSuXC2CmZEBBtv2x7m/ipX9TxvlcoQPd0B+aQ0giFB5sx6TR7nAvL8Ed40ZggMns1FULkXy1VLkFt2Au5MljCVizI5wx/m/ivDgnT6dujUqERERkS5iWL2N4hIysXDdQXyxJxk79qXCc7Alvv89Hf36G2H/yaYJ/q0tTDA20BGp1yqQXXATni5WSL1WhsKymwCAlKxK7D6SiTDF7VX9BqFREHAiKQ9HL+Tij8sFyCm+ia/2pXT61qhEREREuoZ9Vm8TmbwRsfFpMDYU49j5XHi7DkRBmRQZeTdgYmKEET72OJNaiLvHuuJiWjHKqppaWO2sTBHqNwgGIhF+P52NAf2MIBEb4OyVIiy+PxA/H86A95CByC+tQXSkJ0QioOB//+dlfyIiIurtGFZvE4nYAJEjBuOv7DKEeNuhpKoWJy42TUeVnFmK4Z7WWHCPPxJTC5FTfBNjAxxhbCyGXC4gu6AK8kYBk0a4oLiyBvmlBRgX6IjPf0pGTvFN5JfWcCAVERER6SUmmh6muBR/IjkXhhIRnO3N4eo4AOdSixDm7wBfVyuMDXCEAANs2p0EQwMD3JTWI/lqKRwGmuFSRgks+pvAwbY/MvMqkHipQDltVfRkL/i6WmH2pL8HUsnkjQyqREREpDfYstqD4hIysftwOu6Z4A5ZvRx/ZZfD19UaCRfyMTHYGTmlTf1Qb0jrUVJeg9LKWpy6XNB0a1SxCDklN2FrZQqrAca4IW2ARCKGt4sVrmSXI3LkYNTWy1BULoUg/L2/2Pg0REd6cnAVERER6QU2wfUQmbwRuw+nw9PFCtcLq1FeXY/+ZsbY+0cWJGID1NQ3oFEuoKhcika5gKGDrZQ3AxjtZ4e07AoMMDGCj6sVBlmbItjbBkOdLZCeU4FRfvb461oZfvw9DaWVtYiNT0NdvQyx8X//zsFVREREpA8YVnuIRGyAOVO8UHmjFjJZI/5IyscAM0PclNbj1OVC2FmYIflqqXKqqtKKm/Bzs4JYLMK2vVeaBlVBBAkMUHmjHjdrZYiK8MCDd/rgbEoRwgIdER3pCWsLE0RHesLYSKLyO7sCEBERkT5gN4AecupyDozEIozwssevJ7Pg6WKF9JwKRAQ7w9DAAGeuFMLfwxqXMkrh72GN0gopHGz6I+FCHkora3HsQh5G+dqhUSTB/sQs9DM1wl2hrrhjtAsmjnBWhtFpY/4eTBUV7q7yOxEREVFvx1SjYTJ5I3b8ehlJaeXYvu8KxGIR7h7jArFIhKJyKerq5MgpvgE7634QiUSwszKFWGwAQ7EBzIzEGOFjD2sLE/i5WeOJmQEYZNMP/UyNVFpLm4fRlsGUQZWIiIj0CVtWNUQkEuHgn3moq62Hk31//HYyG/4e1vgpIRMh3nZIvVamvOQ/NcwV8X9mI8zfAYNtBmD/qSyM8rWHVCaDu6MlsvIr4es+EMZGEswY586pqIiIiKjP0omwevXqVbzxxhs4c+YMTE1NMX36dCxfvhwmJibaLpraTPsNwM28eggGQEpWOepljbiQVoLSylqcvVKEicHOOHwuB2MDHFFfJ8c/o3xQUyNDQZkU3kMsIa2Xw9PFEtPHueOuFuGUQZWIiIj6Kq2H1aqqKjzyyCNwdHTE+vXrUVZWhnXr1qGiogLvvvuutounlgOn85B6tRRhgfYwMjTE1p8uw8xEjDA/ByQm5yPUbxCu5lVgYrAzGuSNSEjKhb21O76Pz8DsSUPxwJ0+ANDmZX4iIiKivkzrYfXbb79FVVUVdu/ejYEDBwIAxGIxli9fjkWLFsHDw0PLJeyYTN6IispaeDhZ4ExKKXxcrTDGzxEJSbm4Ia2HqbEEf6YUwsjQANcKq2FnZYo7RrmgukaGQQPNeImfiIiIqANaT0lHjx7FmDFjlEEVAO666y4YGRnhyJEjWiyZeiRiAzj
bGaOhESirqkVeSQ0AoekuVJmlGGBmiFC/QahvaISfmzV8hgyEgUiO/aeyERboyKBKRERE1AGtt6xmZGQgOjpaZZmRkRFcXFyQkZGhpVKpr7CsGnllDSgoq0FO8U1Y9DeBibEYo4cNgrW5CZwHmSEnX4rF0f4wlIgwzN0eErEBZk3yYVAlIiIiugWth9WqqiqYm5u3Wm5ubo7KysoubVMQBNTU1HS3aGpLyihBUblUOdrfzsoUT830RVJWBSqrGmBnbQxPZ3NIxAaor6tF/f+eV9/hVkkbpFKpyr/Ue7Eu9QfrUj+wHvWHpupSEASIRKJbrqf1sNoedV9AWxoaGpCSkqLhErXN1NQUPi4DYWtVi+TMUvi5WcPGwgQX0orhO6Q/GuurUVtbg7S/qm5LeUgzsrKytF0E0hDWpf5gXeoH1qP+0ERdGhkZ3XIdrYdVc3NzVFW1DnI3btzo8uAqQ0NDDB06tLtFU5u9vRT1DSKIDN0BkREMAJgZiWBgIAJgc9vKQd0nlUqRlZUFV1dXmJqaars41A2sS/3ButQPrEf9oam6TE9PV2s9rYdVDw+PVn1T6+vrkZ2d3aovq7pEIhHMzMw0UTy1ZWWlwNfX97bvl3qGqakp61JPsC71B+tSP7Ae9Ud361LdK+haH+Ezfvx4JCYmory8XLnswIEDqK+vx4QJE7RYMiIiIiLSNq2H1Tlz5mDAgAFYvHgxjh07ht27d+P1119HVFSUzs+xSkREREQ9S+vdAMzNzbFt2za88cYbWLp0KUxMTDBjxgwsX75c20UjIiIiIi3TelgFADc3N3z66afaLgYRERER6RitdwMgIiIiImoPwyoRERER6SyGVSIiIiLSWQyrRERERKSzGFaJiIiISGcxrBIRERGRzmJYJSIiIiKdxbBKRERERDqLYZWIiIiIdBbDKhERERHpLIZVIiIiItJZDKtEREREpLNEgiAI2i6EJiUlJaGhoQGGhoa3bZ+CICj3KRKJbtt+SfNYl/qDdak/WJf6gfWoPzRVl/X19RCJRAgJCelwPUmX96CjjIyMIJPJbus+RSIRjIyMbus+qWewLvUH61J/sC71A+tRf2iqLkUikVphV+9aVomIiIhIf7DPKhERERHpLIZVIiIiItJZDKtEREREpLMYVomIiIhIZzGsEhEREZHOYlglIiIiIp3FsEpEREREOothlYiIiIh0FsMqEREREekshlUiIiIi0lkMq0RERESksxhWiYiIiEhnMax2w9WrV/H4449j+PDhGDNmDN544w3U1tZqu1jUSdeuXcN//vMfzJw5E8OGDcOMGTO0XSTqol9//RWLFy/GhAkTMHz4cERFReGbb75BY2OjtotGnXTs2DHMnTsXYWFh8Pf3x+TJk7Fu3TrcuHFD20Wjbrh58ybGjx8Pb29vJCUlabs41Ek7d+6Et7d3q5933323R/cr6dGt67Gqqio88sgjcHR0xPr161FWVoZ169ahoqKixyuNNCstLQ1HjhxBUFAQGhsbIQiCtotEXfT555/D0dERL7zwAqytrXHy5EmsWbMG169fx4oVK7RdPOqEyspKBAcH45FHHoG5uTnS0tIQExODtLQ0fPbZZ9ouHnXRxo0bIZfLtV0M6qatW7diwIAByt/t7e17dH8Mq1307bffoqqqCrt378bAgQMBAGKxGMuXL8eiRYvg4eGh5RKSuiIjI3HHHXcAAF588UVcunRJyyWirvr444+V30cACAsLQ01NDb7++ms8++yzMDIy0mLpqDNmzJihcpUjNDQURkZGePnll1FYWNjjJ0fSvIyMDHzzzTdYsWIFXnnlFW0Xh7rBz89P5Vjb09gNoIuOHj2KMWPGqFTWXXfdBSMjIxw5ckSLJaPOMjDg10BftHXw9PX1RV1dHSoqKm5/gUijLC0tAQAymUy7BaEuWbNmDebMmQM3NzdtF4V6GZ6luygjI6NV66mRkRFcXFyQkZGhpVIRUUtnzpyBpaUlrK2ttV0U6gK
5XI66ujokJyfjo48+wqRJk+Dk5KTtYlEn7du3D6mpqXj66ae1XRTSgBkzZsDX1xeTJ0/GJ5980uNdO9gNoIuqqqpgbm7earm5uTkqKyu1UCIiaikpKQk7d+7E008/DbFYrO3iUBdMmjQJhYWFAICIiAj897//1XKJqLOkUinefPNNPPfcc+jfv7+2i0PdYGtri6VLlyIoKAgikQjx8fH44IMPUFhYiP/85z89tl+GVQ0TBAEikUjbxSDq84qLi7Fs2TIEBATgySef1HZxqIs2b96MmpoapKenY+PGjXjqqafw+eef84+PXmTTpk2wtrbGvffeq+2iUDdFREQgIiJC+Xt4eDiMjY2xbds2PPXUU7Czs+uR/bIbQBeZm5ujqqqq1fIbN2602eJKRLfPjRs38OSTT8LExASbNm2CoaGhtotEXeTj44OQkBD84x//wIYNG3Dy5EkcOHBA28UiNeXm5uKzzz7DsmXLUF1djaqqKtTU1AAAampqcPPmTS2XkLpr2rRpkMvlSElJ6bF9sGW1izw8PFr1Ta2vr0d2djaio6O1VCoiqqurw6JFi1BSUoLvvvsOVlZW2i4SaYivry/EYjGys7O1XRRSU05ODhoaGrBgwYJWj82fPx9BQUH4/vvvtVAy6k0YVrto/Pjx2LRpE8rLy5UnwwMHDqC+vh4TJkzQcumI+iaZTIZnnnkGqamp+OqrrzgQR8+cO3cOcrkczs7O2i4KqcnX1xfbt29XWZaSkoJ169bhtddeQ0BAgJZKRpqyd+9eiMViDBs2rMf2wbDaRXPmzMFXX32FxYsXY/HixSgtLcWbb76JqKgozrHay0ilUuV0Y7m5uaiursa+ffsAAKNHj76tc8lR96xevRqHDh3C888/j9raWpw/f1752NChQzm4oxdZsmQJ/P394e3tDRMTE6SmpmLr1q3w9vZWzotMus/c3ByhoaFtPubn5wc/P7/bXCLqjscffxxhYWHw8vICAPz+++/4/vvvMX/+fNja2vbYfkUCb9fTZVevXsUbb7yBM2fOwMTEBDNmzMDy5cthYmKi7aJRJ+Tk5GDy5MltPrZ9+/Z2D7SkeyIjI5Gbm9vmY6zL3mXz5s3Yu3cvsrOzIQgCnJycMGXKFDz++OP8o6OXO3nyJObPn48ff/yRLau9zBtvvIFjx46hoKAAjY2NcHV1xf3334958+b16OByhlUiIiIi0lmcDYCIiIiIdBbDKhERERHpLIZVIiIiItJZDKtEREREpLMYVomIiIhIZzGsEhEREZHOYlglIiIiIp3FsEpE1Ibz589j2bJlCA8Ph7+/P8aNG4dly5bh3LlzKuvFxMTA29sbZWVlWiqp9kRGRuLFF1/UdjGISM8xrBIRtfDll1/iwQcfRGFhIZYvX47PP/8cL7zwAgoLC/HQQw/hq6++0nYRiYj6DIm2C0BEpEvOnDmDtWvXYsKECdiwYQMkkr8Pk9OnT8eSJUuwZs0a+Pr6YsSIEVosaWu1tbW83TMR6R22rBIRNbN582aIRCK8+uqrKkEVACQSCV555RWIRCJs2bJF5bGCggIsWbIEISEhGDFiBJYvX96qa8Aff/yBefPmITQ0FIGBgZg4cSKWLl0KqVSqXKe+vh4bN27E1KlT4e/vj7CwMKxcubLVtiIjI7Fw4UL89ttvmDVrFgICArBhwwbMmjULDz30UKvXJZfLERERgSVLlnR6Xw0NDXj77bcxbtw4BAUF4cEHH8TFixc798YSEXURW1aJiP5HLpfj5MmT8Pf3x6BBg9pcx8HBAX5+fkhMTIRcLlcuX7JkCaZOnYo5c+YgPT0dH374ITIyMvD999/D0NAQOTk5WLhwIUaOHIk1a9bA3NwchYWFOHbsGBoaGmBqaorGxkYsXrwYZ86cweOPP46QkBDk5uYiJiYGFy9eRGxsrErLaXJyMjIyMrBo0SI4OzvD1NQUdnZ2WLNmDbKysuDq6qpcNyEhAUVFRbj33nsBoFP7evnll7F7924
89thjGDduHNLS0rBkyRLcvHmzB2qBiEgVwyoR0f+Ul5dDKpXC2dm5w/WcnZ1x8eJFVFRUKJdNmTIFL7zwAgAgPDwc1tbWWL58OX799Vfcc889SE5ORl1dHV544QX4+PgonxcVFaX8/6+//opjx44hJiYGd955p3K5j48P7rvvPuzcuVOl1bSsrAy//PIL3NzclMsGDx6Mt99+G7t27cKzzz6rXL5r1y7Y2Nhg/PjxndpXRkYGdu3ahUcffVT5+saNG6d8fUREPY3dAIiIOkkQBACASCRSLmseOgFg2rRpkEgkOHnyJADA19cXhoaGePnll7Fr1y5cv3691XYPHToEc3NzTJo0CTKZTPnj6+sLW1tbnDp1SmV9b29vlaAKAFZWVoiMjMSuXbvQ2NgIAKisrMTvv/+OmTNnKrs2qLsvRfnbe31ERD2NRxoiov+xsrKCqakpcnJyOlwvNzcXpqamsLCwUC6ztbVVWUcikcDS0lLZ+uri4oIvvvgCW7duxerVq1FTU4PBgwdj3rx5eOSRRwAApaWlqKqqgr+/f5v7LS8vV/m95T4VoqOjsX//fhw/fhwRERHYs2cP6uvrlV0AOrMvRfnbe31ERD2NYZWI6H/EYjFCQ0Nx7NgxFBQUtNlvtaCgAMnJyRg/fjzEYrFyeXFxMezt7ZW/y2QyVFRUqAS6kSNHYuTIkZDL5bh06RK+/PJLrF27FjY2Npg+fTqsrKxgaWmJrVu3tlm+fv36qfzevGW3ufDwcNjZ2WHnzp2IiIjAzp07ERQUhKFDhyrXUXdfivK39/qIiHoauwEQETWzYMECCIKAV199VWUAFdA0AOvVV1+FIAhYsGCBymNxcXEqv//666+QyWQYPXp0q32IxWIEBQXhlVdeAdA0UAoAJk6ciIqKCjQ2NiIgIKDVj7u7u1qvQSwWY+bMmTh48CD+/PNPXLp0CdHR0SrrqLuv0NDQDl8fEVFPY8sqEVEzI0aMwKpVq7B27Vo89NBDePjhh+Ho6Ii8vDx88803uHDhAlatWoWQkBCV5x04cABisVg5Wv7DDz+Ej48Ppk2bBgDYsWMHEhMTMXHiRDg4OKCurg6xsbEAgLFjxwJomsc1Li4OCxYswLx58xAYGAhDQ0MUFBTg5MmTmDx5MqZMmaLW64iOjsaWLVvwr3/9CyYmJrj77rtVHld3Xx4eHrjnnnuwbds2SCQSjB07Fmlpafj000/Rv3//7r7dRES3xLBKRNTCvHnzEBAQgM8++wxvvfUWKioqYGFhgREjRuCbb75BcHBwq+fExMQgJiYGO3bsgEgkQmRkJFatWgUjIyMATQOsjh8/jpiYGBQXF8PMzAxeXl7YtGkTwsPDATS1iG7atAnbt2/HTz/9hM2bN0MsFmPQoEEYNWoUvLy81H4Nbm5uCA4Oxrlz5xAVFYUBAwaoPN6Zfa1ZswY2NjbYtWsXvvzyS/j6+iImJgbPPfdcV95eIqJOEQmKYa1ERERERDqGfVaJiIiISGcxrBIRERGRzmJYJSIiIiKdxbBKRERERDqLYZWIiIiIdBbDKhERERHpLIZVIiIiItJZDKtEREREpLMYVomIiIhIZzGsEhEREZHOYlglIiIiIp3FsEpEREREOuv/AfeQvi6BLc6XAAAAAElFTkSuQmCC", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Spearman correlation: SignificanceResult(statistic=0.9771781192927307, pvalue=0.0)\n" + ] + } + ], + "source": [ + "# Plot with Seaborn\n", + "plt.figure(figsize=(8, 6))\n", + "sns.set(style=\"whitegrid\")\n", + "\n", + "sns.scatterplot(x=mean_observed, y=mean_reconstruction, s=5)\n", + "plt.title(f\"Observed vs Predicted Mean Expression of {holdout_perts} Perturbed Cells\")\n", + "plt.ylabel(\"Predicted\")\n", + "plt.xlabel(\"Observed\")\n", + "\n", + "plt.show()\n", + "\n", + "# Print Spearman correlation\n", + "print(f\"Spearman correlation: {spearmans}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "cc37848e-715c-4e13-88ee-6299afc776ea", + "metadata": {}, + "outputs": [], + "source": [ + "# get reconstruction for all heldout samples\n", + "X_holdout = torch.from_numpy(holdout_adata.X)\n", + "D_holdout = torch.from_numpy(D[test_idx])\n", + "labels_holdout = labels[test_idx]\n", + "loss_weights_holdout = torch.from_numpy(np.ones(holdout_adata.shape[0]))\n", + "recons = []\n", + "losses = []\n", + "with torch.no_grad():\n", + " for i in range(0, len(X_holdout), 10000):\n", + " loss, recon = wrapper.internal_model.loss(\n", + " X=X_holdout[i : i + 10000],\n", + " D=D_holdout[i : i + 10000],\n", + " labels=labels_holdout[i : i + 10000],\n", + " loss_weights=loss_weights_holdout[i : i + 10000],\n", + " forward=True,\n", + " )\n", + " losses.append(loss[\"TNFA+\"].detach().cpu().numpy())\n", + " recons.append(recon[\"TNFA+\"].detach().cpu().numpy())\n", + "adata.uns[\"recon\"] = np.concatenate(recons)\n", + "adata.uns[\"losses\"] = np.array(losses)" + ] + }, + { + "cell_type": "markdown", + "id": "f51bf3e9-ef18-4dcd-b964-7891bf9346df", + "metadata": {}, + "source": [ + "## Save anndata for downstream analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "973e78cf", + "metadata": {}, + 
"outputs": [], + "source": [ + "# save anndata for downstream analysis\n", + "from scipy import sparse\n", + "\n", + "adata.X = sparse.csr_matrix(adata.X)\n", + "write_adata_to_s3(s3_dir + experiment_name, \"fold_4\", adata)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "511f8aae", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pertspectra", + "language": "python", + "name": "pertspectra" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/PertSpectra_load_checkpoints/pertspectra_norman.ipynb b/PertSpectra_load_checkpoints/pertspectra_norman.ipynb new file mode 100644 index 0000000..f02020a --- /dev/null +++ b/PertSpectra_load_checkpoints/pertspectra_norman.ipynb @@ -0,0 +1,384 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "65f8246a", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "bb2ff34c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Seed set\n" + ] + } + ], + "source": [ + "import sys\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "import torch\n", + "\n", + "sys.path.append(\"..\")\n", + "from utils import (\n", + " filter_noisy_genes,\n", + " generate_k_fold,\n", + " load_model,\n", + " read_aws_h5ad,\n", + " set_seed,\n", + " write_adata_to_s3,\n", + ")\n", + "\n", + "set_seed(0)" + ] + }, + { + "cell_type": "markdown", + "id": "45dd3375-ab1b-423d-9179-7da6da1e6151", + "metadata": {}, + "source": [ + "## Load Model from Checkpoint" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "5280863b", + "metadata": {}, + "outputs": [], + "source": [ + "# use anndata generate by ..data_processing/norman_prior_graph_preprocessing.ipynb\n", + "unfiltered_adata = read_aws_h5ad(\"path to preprocessed h5ad\")\n", + "adata = filter_noisy_genes(unfiltered_adata)\n", + "adata.layers[\"logcounts\"] = adata.X.copy()\n", + "adata.X = adata.X.todense()\n", + "device = torch.device(\"cuda:0\")\n", + "gene_network = adata.uns[\"sparse_gene_network\"].todense()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "78ccb7d2", + "metadata": {}, + "outputs": [], + "source": [ + "# subset to powered perturbations\n", + "obs_df = pd.DataFrame(adata.obs[\"perturbation_name\"])\n", + "category_counts = obs_df[\"perturbation_name\"].value_counts()\n", + "filtered_categories = category_counts[category_counts >= 50].index\n", + "adata = adata[adata.obs[\"perturbation_name\"].isin(filtered_categories)]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "266cc119", + "metadata": {}, + "outputs": [], + "source": [ + "# load model from checkpoint\n", + "s3_dir = \"s3://pert-spectra/PertSpectra_checkpoints/\"\n", + "experiment_name = \"pertspectra_norman/\"\n", + "model_name = \"kfold_4\"\n", + "wrapper, adata = load_model(\n", + " adata=adata,\n", + " s3_dir=s3_dir,\n", + " experiment_name=experiment_name,\n", + " model_name=model_name,\n", + " use_cell_types=False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fba884f4-488e-444c-9421-12068d735c70", + "metadata": {}, + "outputs": [], + "source": [ + "# save trained PertSpectra parameters\n", + "adata.uns[\"SPECTRA_factors\"] = wrapper.factors\n", + "adata.uns[\"SPECTRA_L\"] = wrapper.internal_model.L\n", + "adata.uns[\"SPECTRA_pert_scores\"] = wrapper.cell_scores" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "9292bf1c", + "metadata": {}, + "outputs": [], + "source": [ + "# 
reconstruct binarized perturbation matrix\n", + "pert_idx = list(wrapper.internal_model.pert_idx)\n", + "pert_labels = [x.split(\"_\")[-1] for x in wrapper.internal_model.pert_labels]\n", + "adata.uns[\"Spectra_pert_labels\"] = pert_labels\n", + "D = []\n", + "for i in adata.obs[\"perturbation_name\"]:\n", + " d = [0.0 for _ in range(len(pert_idx))]\n", + " # add intercept\n", + " d[-1] = 1\n", + " if i == \"control\":\n", + " d[-2] = 1.0\n", + " D.append(d)\n", + " continue\n", + " guides = i.split(\"+\")\n", + "\n", + " # pert or intergenic\n", + " one_hot_idx = pert_labels.index(guides[0])\n", + " d[one_hot_idx] = 1.0\n", + "\n", + " if len(guides) > 1:\n", + " one_hot_idx = pert_labels.index(guides[1])\n", + " d[one_hot_idx] = 1.0\n", + " D.append(d)\n", + "D = np.stack(D).astype(np.float32)" + ] + }, + { + "cell_type": "markdown", + "id": "d3fef4ad-89a7-4fb6-a8ba-b865ef6e3cf8", + "metadata": {}, + "source": [ + "## Reconstructed Gene Expression\n", + "- Visualize reconstructed gene expression for a single perturbation\n", + "- Save reconstructed gene expression for all heldout cells" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "4184472e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "SPECTRA()" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wrapper.internal_model.to(torch.device(\"cuda:0\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "a55ef361", + "metadata": {}, + "outputs": [], + "source": [ + "# train-val-test split\n", + "train_idx, val_idx, test_idx = generate_k_fold(\n", + " adata,\n", + " adata.X,\n", + " adata.obs[\"perturbation_name\"],\n", + " fold_idx=4,\n", + " perturbation_key=\"perturbation_name\",\n", + ")\n", + "loss_weights = np.ones(adata.shape[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "671c346f", + "metadata": {}, + "outputs": [], + "source": [ + "holdout_adata = 
adata[test_idx]\n", + "train_adata = adata[train_idx]" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "1328f936", + "metadata": {}, + "outputs": [], + "source": [ + "holdout_perts = \"CBL\"\n", + "hold_idx = [\n", + " i\n", + " for i, x in enumerate(holdout_adata.obs[\"perturbation_name\"])\n", + " if x == holdout_perts\n", + "]\n", + "X_holdout = torch.from_numpy(holdout_adata.X[hold_idx])\n", + "D_holdout = torch.from_numpy(D[hold_idx])\n", + "loss_weights_holdout = torch.from_numpy(loss_weights[hold_idx])" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "9657f513", + "metadata": {}, + "outputs": [], + "source": [ + "with torch.no_grad():\n", + " loss, recon = wrapper.internal_model.loss_no_cell_types(\n", + " X=X_holdout, D=D_holdout, loss_weights=loss_weights_holdout, forward=True\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "f5fc4fdc", + "metadata": {}, + "outputs": [], + "source": [ + "mean_reconstruction = recon.detach().cpu().numpy().mean(axis=0)\n", + "observed = torch.from_numpy(\n", + " holdout_adata[(holdout_adata.obs[\"perturbation_name\"] == holdout_perts)].X\n", + ")\n", + "mean_observed = torch.mean(observed, dim=0).detach().cpu().numpy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dca3a452-df0a-4a84-9573-3511c77bb470", + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.stats import spearmanr\n", + "\n", + "spearmans = spearmanr(mean_reconstruction, mean_observed)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "5f2a50e8", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Spearman correlation: SignificanceResult(statistic=0.9697197539302803, pvalue=9.701068316902012e-35)\n" + ] + } + ], + "source": [ + "plt.scatter(\n", + " mean_observed, mean_reconstruction, s=[5 for _ in range(len(mean_observed))]\n", + ")\n", + "plt.title(f\"Observed vs predicted mean expression of {holdout_perts} perturbed cells\")\n", + "plt.ylabel(\"Predicted\")\n", + "plt.xlabel(\"Observed\")\n", + "plt.show()\n", + "print(f\"Spearman correlation: {spearmans}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "ed5cd7ac-d347-439e-ae0f-9cd99b541be3", + "metadata": {}, + "outputs": [], + "source": [ + "# get reconstruction for all heldout samples\n", + "X_holdout = torch.from_numpy(holdout_adata.X)\n", + "D_holdout = torch.from_numpy(D[test_idx])\n", + "loss_weights_holdout = torch.from_numpy(np.ones(holdout_adata.shape[0]))\n", + "recons = []\n", + "losses = []\n", + "with torch.no_grad():\n", + " for i in range(0, len(X_holdout), 1000):\n", + " loss, recon = wrapper.internal_model.loss_no_cell_types(\n", + " X=X_holdout[i : i + 1000],\n", + " D=D_holdout[i : i + 1000],\n", + " loss_weights=loss_weights_holdout[i : i + 1000],\n", + " forward=True,\n", + " )\n", + " losses.append(loss.detach().cpu().numpy())\n", + " recons.append(recon.detach().cpu().numpy())\n", + "adata.uns[\"recon\"] = np.concatenate(recons)\n", + "adata.uns[\"losses\"] = np.array(losses)" + ] + }, + { + "cell_type": "markdown", + "id": "3754cfd3-adb5-4091-bcbb-44d7739514a7", + "metadata": {}, + "source": [ + "## Save anndata for downstream analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "973e78cf", + "metadata": {}, + "outputs": [], + "source": [ + "# save anndata for downstream analysis\n", + "from scipy import sparse\n", + "\n", + "adata.X = sparse.csr_matrix(adata.X)\n", + "write_adata_to_s3(s3_dir + 
experiment_name, \"fold_4\", adata)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0e80c98-9b9b-4d73-91d8-9b750d1c7e7e", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pertspectra", + "language": "python", + "name": "pertspectra" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/PertSpectra_load_checkpoints/pertspectra_replogle.ipynb b/PertSpectra_load_checkpoints/pertspectra_replogle.ipynb new file mode 100644 index 0000000..8964992 --- /dev/null +++ b/PertSpectra_load_checkpoints/pertspectra_replogle.ipynb @@ -0,0 +1,367 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "65f8246a", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "bb2ff34c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Seed set\n" + ] + } + ], + "source": [ + "import sys\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "import torch\n", + "\n", + "sys.path.append(\"..\")\n", + "from utils import (\n", + " filter_noisy_genes,\n", + " load_model,\n", + " read_aws_h5ad,\n", + " set_seed,\n", + " split_data_by_cell,\n", + " write_adata_to_s3,\n", + ")\n", + "\n", + "set_seed(0)" + ] + }, + { + "cell_type": "markdown", + "id": "1c40154b-7828-4142-9187-170e7909ed72", + "metadata": {}, + "source": [ + "## Load Model from Checkpoint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5280863b", + "metadata": {}, + "outputs": [], + "source": [ + "# use anndata generate by 
..data_processing/replogle_prior_graph_preprocessing.ipynb\n", + "unfilterd_adata = read_aws_h5ad(\"path to preprocessed h5ad\")\n", + "adata = filter_noisy_genes(unfilterd_adata)\n", + "adata.X = adata.X.todense()\n", + "device = torch.device(\"cuda:0\")\n", + "gene_network = adata.uns[\"sparse_gene_network\"].todense()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "5375d253", + "metadata": {}, + "outputs": [], + "source": [ + "# filter adata to perturbations with at least 50 samples\n", + "obs_df = pd.DataFrame(adata.obs[\"gene\"])\n", + "category_counts = obs_df[\"gene\"].value_counts()\n", + "filtered_categories = category_counts[category_counts >= 50].index\n", + "adata = adata[adata.obs[\"gene\"].isin(filtered_categories)]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "1551462a", + "metadata": {}, + "outputs": [], + "source": [ + "# get the filtered 'strong' perturbations, but do not use the limited gene expression\n", + "# references the svae filtered replogle anndata to subset to those cells (https://github.com/Genentech/sVAE/blob/main/entry_points/replogle-preprocessing.ipynb)\n", + "filtered_replogle = read_aws_h5ad(\"path to svae filtered replogle h5ad\")\n", + "filtered_perts = set(filtered_replogle.obs[\"gene\"].unique()).union(\n", + " set([\"SKP2\", \"CUL1\", \"UBE2N\"])\n", + ")\n", + "adata = adata[adata.obs[\"gene\"].isin(filtered_perts)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15bdbc4d", + "metadata": {}, + "outputs": [], + "source": [ + "# load model from checkpoint\n", + "s3_dir = \"s3://pert-spectra/PertSpectra_checkpoints/\"\n", + "experiment_name = \"pertspectra_replogle/\"\n", + "model_name = \"replogle\"\n", + "wrapper, adata = load_model(\n", + " adata=adata,\n", + " s3_dir=s3_dir,\n", + " experiment_name=experiment_name,\n", + " model_name=model_name,\n", + " use_cell_types=False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"b375310a-07eb-47f0-a4d1-c17b19697eae", + "metadata": {}, + "outputs": [], + "source": [ + "# save trained PertSpectra parameters\n", + "adata.uns[\"SPECTRA_factors\"] = wrapper.factors\n", + "adata.uns[\"SPECTRA_L\"] = wrapper.internal_model.L\n", + "adata.uns[\"SPECTRA_pert_scores\"] = wrapper.cell_scores" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "6da069f6", + "metadata": {}, + "outputs": [], + "source": [ + "# reconstruct binarized perturbation matrix\n", + "pert_idx = list(wrapper.internal_model.pert_idx)\n", + "pert_labels = [x.split(\"_\")[-1] for x in wrapper.internal_model.pert_labels]\n", + "adata.uns[\"Spectra_pert_labels\"] = pert_labels\n", + "D = []\n", + "for i in adata.obs[\"gene\"]:\n", + " d = [0.0 for _ in range(len(pert_idx))]\n", + " # add intercept\n", + " d[-1] = 1\n", + " if i == \"non-targeting\":\n", + " d[-2] = 1.0\n", + " D.append(d)\n", + " continue\n", + " guides = i.split(\"+\")\n", + "\n", + " # pert or intergenic\n", + " one_hot_idx = pert_labels.index(guides[0])\n", + " d[one_hot_idx] = 1.0\n", + "\n", + " if len(guides) > 1:\n", + " one_hot_idx = pert_labels.index(guides[1])\n", + " d[one_hot_idx] = 1.0\n", + " D.append(d)\n", + "D = np.stack(D).astype(np.float32)" + ] + }, + { + "cell_type": "markdown", + "id": "e7fd5be1-49a2-47b0-bc6c-7a84d489c1f2", + "metadata": {}, + "source": [ + "## Reconstructed Gene Expression\n", + "- Visualize reconstructed gene expression for a single perturbation\n", + "- Save reconstructed gene expression for all heldout cells" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "4184472e", + "metadata": {}, + "outputs": [], + "source": [ + "# wrap in cpu\n", + "wrapper.internal_model.to(torch.device(\"cpu\"))\n", + "wrapper.internal_model.device = torch.device(\"cpu\")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "83785bb5", + "metadata": {}, + "outputs": [], + "source": [ + "train_idx, val_idx, test_idx = split_data_by_cell(\n", + 
" adata.X, adata.obs[\"gene\"], test_size=0.2, val_size=0.2\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "1328f936", + "metadata": {}, + "outputs": [], + "source": [ + "holdout_perts = \"RIOK2\"\n", + "X_holdout = torch.from_numpy(adata.X[test_idx])\n", + "D_holdout = torch.from_numpy(D[test_idx])\n", + "loss_weights = torch.from_numpy(np.ones(X_holdout.shape[0]))" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "9657f513", + "metadata": {}, + "outputs": [], + "source": [ + "with torch.no_grad():\n", + " loss, recon = wrapper.internal_model.loss_no_cell_types(\n", + " X=X_holdout, D=D_holdout, loss_weights=loss_weights, forward=True\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "f5fc4fdc", + "metadata": {}, + "outputs": [], + "source": [ + "mean_reconstruction = torch.mean(recon, dim=0).detach().cpu().numpy()\n", + "observed = torch.from_numpy(adata[(adata.obs[\"gene\"] == holdout_perts)].X)\n", + "mean_observed = torch.mean(observed, dim=0).detach().cpu().numpy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2cd3319c-e86e-4d1b-bfcf-79b066430efd", + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.stats import spearmanr\n", + "\n", + "spearmans = spearmanr(mean_reconstruction, mean_observed)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "5f2a50e8", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Spearman correlation: SignificanceResult(statistic=0.9748835210506989, pvalue=0.0)\n" + ] + } + ], + "source": [ + "plt.scatter(\n", + " mean_observed, mean_reconstruction, s=[5 for _ in range(len(mean_observed))]\n", + ")\n", + "plt.title(f\"Observed vs predicted mean expression of {holdout_perts} perturbed cells\")\n", + "plt.ylabel(\"Predicted\")\n", + "plt.xlabel(\"Observed\")\n", + "plt.show()\n", + "spearmans = spearmanr(mean_reconstruction, mean_observed)\n", + "print(f\"Spearman correlation: {spearmans}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "d3d8e895", + "metadata": {}, + "outputs": [], + "source": [ + "# get reconstruction for all heldout samples\n", + "X_holdout = torch.from_numpy(adata.X[test_idx])\n", + "D_holdout = torch.from_numpy(D[test_idx])\n", + "loss_weights = torch.from_numpy(np.ones(X_holdout.shape[0]))\n", + "recons = []\n", + "losses = []\n", + "with torch.no_grad():\n", + " for i in range(0, len(X_holdout), 10000):\n", + " loss, recon = wrapper.internal_model.loss_no_cell_types(\n", + " X=X_holdout[i : i + 10000],\n", + " D=D_holdout[i : i + 10000],\n", + " loss_weights=loss_weights[i : i + 10000],\n", + " forward=True,\n", + " )\n", + " losses.append(loss.detach().cpu().numpy())\n", + " recons.append(recon.detach().cpu().numpy())\n", + "adata.uns[\"recon\"] = np.concatenate(recons)\n", + "adata.uns[\"losses\"] = np.array(losses)" + ] + }, + { + "cell_type": "markdown", + "id": "c1312828-84ee-4a2b-81d8-6713cde9ce67", + "metadata": {}, + "source": [ + "## Save anndata for downstream analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "973e78cf", + "metadata": {}, + "outputs": [], + "source": [ + "# save anndata for downstream analysis\n", + "from scipy import sparse\n", + "\n", + "adata.X = sparse.csr_matrix(adata.X)\n", + "write_adata_to_s3(s3_dir + 
experiment_name, \"replogle\", adata)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0e29cc0", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pertspectra", + "language": "python", + "name": "pertspectra" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/PertSpectra_training_scripts/train_PertSpectra_inhouse.py b/PertSpectra_training_scripts/train_PertSpectra_inhouse.py new file mode 100644 index 0000000..5f16689 --- /dev/null +++ b/PertSpectra_training_scripts/train_PertSpectra_inhouse.py @@ -0,0 +1,208 @@ +import argparse +import sys + +import numpy as np +import pandas as pd +import torch + +sys.path.append("..") +from src.Spectra import Spectra_Pert as spc_linear +from src.Spectra.Spectra_Pert import vectorize_perts, vectorize_perts_combinations +from utils import ( + filter_noisy_genes, + generate_k_fold, + generate_loss_weights, + read_aws_h5ad, + set_seed, + write_model_pickle_to_s3, +) + + +def train(args): + # read in anndata containing prior graph + unfilterd_adata = read_aws_h5ad(args.anndata_s3_url) + adata = filter_noisy_genes(unfilterd_adata) + adata.layers["logcounts"] = adata.X.copy() + adata.X = adata.X.todense() + device = torch.device(args.device) + gene_network = adata.uns["sparse_gene_network"].todense() + + # filter adata to perturbations with at least 50 samples for each treatment + adata.obs["condition"] = adata.obs["condition"].astype(str) + adata.obs["Treatment"] = adata.obs["Treatment"].astype(str) + adata.obs["pert_treat"] = adata.obs["condition"] + "+" + adata.obs["Treatment"] + obs_df = pd.DataFrame(adata.obs["pert_treat"]) + category_counts = obs_df["pert_treat"].value_counts() + 
filtered_categories = category_counts[category_counts >= 50].index + adata = adata[adata.obs["pert_treat"].isin(filtered_categories)] + + if args.cell_type_key == "Treatment": + L = {"global": args.global_latent_dim} + adj_matrices = {"global": gene_network.copy()} + for key in adata.obs[args.cell_type_key].unique(): + L[key] = args.celltype_latent_dim + adj_matrices[key] = gene_network.copy() + + # init model parameters + vocab = adata.var_names + labels = adata.obs[args.cell_type_key].values + word2id = dict((v, idx) for idx, v in enumerate(vocab)) + X = adata.X + loss_weights = generate_loss_weights(adata, args.perturbation_key) + + # perturbation labels + if args.perturbation_key in adata.obs: + if not args.encode_combos_as_unique: + D, pert_labels = vectorize_perts( + adata, args.perturbation_key, args.control_key + ) + pert_idx = np.array( + [ + adata.var_names.get_loc(i.split("_")[1]) + if i.split("_")[1] in adata.var_names + else -1 + for i in pert_labels + ] + ) + else: + D, pert_labels = vectorize_perts_combinations( + adata, args.perturbation_key, args.control_key + ) + pert_idx = np.array( + [ + adata.var_names.get_loc(i) if i in adata.var_names else -1 + for i in pert_labels + ] + ) + + # add ctrl one-hot-encoding + ctrl_vector = np.array( + [1.0 if i == "ctrl" else 0.0 for i in adata.obs[args.perturbation_key]] + ) + basal_vector = np.array([1.0 for i in adata.obs[args.perturbation_key]]) + D = np.concatenate( + [D, ctrl_vector.reshape(len(ctrl_vector), 1)], axis=1 + ).astype(np.float32) + D = np.concatenate( + [D, basal_vector.reshape(len(basal_vector), 1)], axis=1 + ).astype(np.float32) + pert_idx = np.append(pert_idx, [-1, -1]) + pert_labels = pert_labels + ["ctrl", "basal"] + else: + print("Perturbation key not found or not given!") + D = np.array([]) + pert_idx = np.array([]) + + # init Spectra wrapper + wrapper = spc_linear.SPECTRA_Model( + X=X, + labels=labels, + pert_idx=pert_idx, + pert_labels=pert_labels, + L=L, + vocab=vocab, + 
adj_matrix=adj_matrices, + use_weights=args.use_weights, + lam=args.lam, + psi=args.psi, + delta=args.delta, + kappa=None, + rho=args.rho, + use_cell_types=args.use_cell_types, + ) + wrapper.initialize(None, word2id, adata.X, 0) + wrapper.internal_model.to(device) + + # train-val-test split + train_idx, val_idx, test_idx = generate_k_fold( + adata, X, adata.obs[args.perturbation_key], fold_idx=args.kfold_idx + ) + X_train = X[train_idx] + D_train = D[train_idx] + loss_weights_train = loss_weights[train_idx] + labels_train = labels[train_idx] + X_val = X[val_idx] + D_val = D[val_idx] + loss_weights_val = loss_weights[val_idx] + labels_val = labels[val_idx] + + # train model + train_loss, val_loss = wrapper.train( + X_train, + D_train, + loss_weights_train, + X_val, + D_val, + loss_weights_val, + labels=labels_train, + labels_val=labels_val, + ) + + write_model_pickle_to_s3( + s3_url=args.model_save_s3_url + args.experiment_name + "/", + model_name=f"kfold_{args.kfold_idx}", + model=wrapper, + ) + + +if __name__ == "__main__": + # set seed for reproducibility + set_seed(0) + parser = argparse.ArgumentParser(description="Run pertspectra experiment.") + # Passing in hyperparameters as arguments. Boolean values are parsed from their text because type=bool maps every non-empty string (even "False") to True. + parser.add_argument("--weight_decay", type=float, default=0.001) + parser.add_argument("--lam", type=float, default=0.01) + parser.add_argument("--psi", type=float, default=0.01) + parser.add_argument("--delta", type=float, default=0.001) + parser.add_argument("--kappa", type=float, default=0.00001) + parser.add_argument("--rho", type=float, default=0.001) + parser.add_argument("--global_latent_dim", type=int, default=20) + parser.add_argument("--celltype_latent_dim", type=int, default=5) + parser.add_argument("--use_highly_variable", type=lambda s: s.lower() not in ("", "0", "false", "no"), default=False) + parser.add_argument("--use_weights", type=lambda s: s.lower() not in ("", "0", "false", "no"), default=True) + parser.add_argument("--kfold_idx", type=int, default=0) + + # cell type (BooleanOptionalAction consumes no value, so no type= is passed) + parser.add_argument( + "--use_cell_types", + help="Enable with --use_cell_types, disable with --no-use_cell_types.", + default=True, + 
action=argparse.BooleanOptionalAction, + ) + # encode combos as unique (BooleanOptionalAction consumes no value, so no type= is passed) + parser.add_argument( + "--encode_combos_as_unique", + help="Enable with --encode_combos_as_unique, disable with --no-encode_combos_as_unique.", + default=False, + action=argparse.BooleanOptionalAction, + ) + # cell type key: Treatment, treatment_and_pert + parser.add_argument("--cell_type_key", type=str, default="Treatment") + # perturbation key if using perturbations + parser.add_argument("--perturbation_key", type=str, default="condition") + # control key(s): one or more labels, always collected into a list so CLI input has the same type as the default + parser.add_argument("--control_key", type=str, nargs="+", default=["ctrl", "nan"]) + # prior to use: None, stringdb + parser.add_argument("--prior", type=str, default="stringdb") + # device to use + parser.add_argument("--device", type=str, default="cuda:0") + # name of wandb run + parser.add_argument( + "--experiment_name", + type=str, + default="pertspectra_inhouse", + ) + + parser.add_argument( + "--anndata_s3_url", + type=str, + default="s3://pert-spectra/inhouse_adata_spectra.h5ad", + ) + parser.add_argument( + "--model_save_s3_url", + type=str, + default="s3://pert-spectra/PertSpectra_checkpoints/", + ) + args = parser.parse_args() + + train(args) diff --git a/PertSpectra_training_scripts/train_PertSpectra_norman.py b/PertSpectra_training_scripts/train_PertSpectra_norman.py new file mode 100644 index 0000000..bace2e1 --- /dev/null +++ b/PertSpectra_training_scripts/train_PertSpectra_norman.py @@ -0,0 +1,183 @@ +import argparse +import sys + +import numpy as np +import pandas as pd +import torch + +sys.path.append("..") +from src.Spectra import Spectra_Pert as spc_linear +from src.Spectra.Spectra_Pert import ( + get_guide_one_hot_cols, + vectorize_perts_combinations, +) +from utils import ( + filter_noisy_genes, + generate_k_fold, + generate_loss_weights, + read_aws_h5ad, + set_seed, + write_model_pickle_to_s3, +) + + +def train(args): + # read in anndata containing prior graph + unfilterd_adata = read_aws_h5ad(args.anndata_s3_url) + adata = filter_noisy_genes(unfilterd_adata) + adata.layers["logcounts"] = adata.X.copy() 
+ adata.X = adata.X.todense() + device = torch.device(args.device) + gene_network = adata.uns["sparse_gene_network"].todense() + if args.prior == "None": + gene_network = np.zeros(gene_network.shape) + + # filter adata to perturbations with at least 50 samples + obs_df = pd.DataFrame(adata.obs[args.perturbation_key]) + category_counts = obs_df[args.perturbation_key].value_counts() + filtered_categories = category_counts[category_counts >= 50].index + adata = adata[adata.obs[args.perturbation_key].isin(filtered_categories)] + + # init model parameters + L = args.global_latent_dim + adj_matrices = gene_network.copy() + vocab = adata.var_names + word2id = dict((v, idx) for idx, v in enumerate(vocab)) + loss_weights = generate_loss_weights(adata, args.perturbation_key) + X = adata.X + + # create binary perturbation label matrix from data + if args.perturbation_key in adata.obs: + if not args.encode_combos_as_unique: + pert_labels = get_guide_one_hot_cols(adata.obs) + adata.obs["num_guides"] = adata.obs[pert_labels].sum(1) + # combinations encoded as application of two individual guides + D = adata.obs[pert_labels].to_numpy().astype(np.float32) + pert_id = [] + for i in pert_labels: + guide = i.split("_")[1] + if guide in adata.var_names: + pert_id.append(adata.var_names.get_loc(guide)) + else: + pert_id.append(-1) + pert_idx = np.array(pert_id) + else: + D, pert_labels = vectorize_perts_combinations( + adata, args.perturbation_key, args.control_key + ) + pert_idx = np.array( + [ + adata.var_names.get_loc(i) if i in adata.var_names else -1 + for i in pert_labels + ] + ) + # add ctrl one-hot-encoding + ctrl_vector = np.array( + [1.0 if i == "control" else 0.0 for i in adata.obs[args.perturbation_key]] + ) + basal_vector = np.array([1.0 for i in adata.obs[args.perturbation_key]]) + D = np.concatenate( + [D, ctrl_vector.reshape(len(ctrl_vector), 1)], axis=1 + ).astype(np.float32) + D = np.concatenate( + [D, basal_vector.reshape(len(basal_vector), 1)], axis=1 + 
).astype(np.float32) + pert_idx = np.append(pert_idx, [-1, -1]) + pert_labels = pert_labels + ["ctrl", "basal"] + else: + print("Perturbation key not found or not given!") + D = np.array([]) + pert_idx = np.array([]) + + # init Spectra wrapper + wrapper = spc_linear.SPECTRA_Model( + X=X, + labels=None, + pert_idx=pert_idx, + pert_labels=pert_labels, + L=L, + vocab=vocab, + adj_matrix=adj_matrices, + use_weights=args.use_weights, + lam=args.lam, + psi=args.psi, + delta=args.delta, + kappa=None, + rho=args.rho, + use_cell_types=args.use_cell_types, + ) + wrapper.initialize(None, word2id, adata.X, 0) + wrapper.internal_model.to(device) + + # train-val-test split + train_idx, val_idx, _ = generate_k_fold( + adata, X, D, fold_idx=args.kfold_idx, perturbation_key=args.perturbation_key + ) + X_train = X[train_idx] + D_train = D[train_idx] + loss_weights_train = loss_weights[train_idx] + X_val = X[val_idx] + D_val = D[val_idx] + loss_weights_val = loss_weights[val_idx] + + # train model + train_loss, val_loss = wrapper.train( + X_train, D_train, loss_weights_train, X_val, D_val, loss_weights_val + ) + # save to s3 + write_model_pickle_to_s3( + s3_url=args.model_save_s3_url + args.experiment_name + "/", + model_name=f"kfold_{args.kfold_idx}", + model=wrapper, + ) + + +if __name__ == "__main__": + # set seed for reproducibility + set_seed(0) + parser = argparse.ArgumentParser(description="Run pertspectra experiment.") + # Passing in hyperparameters as arguments. Boolean values are parsed from their text because type=bool maps every non-empty string (even "False") to True. 
+ parser.add_argument("--weight_decay", type=float, default=0.001) + parser.add_argument("--lam", type=float, default=1) + parser.add_argument("--psi", type=float, default=0.01) + parser.add_argument("--delta", type=float, default=0.001) + parser.add_argument("--kappa", type=float, default=0.00001) + parser.add_argument("--rho", type=float, default=0.05) + parser.add_argument("--use_highly_variable", type=lambda s: s.lower() not in ("", "0", "false", "no"), default=False) + parser.add_argument("--use_weights", type=lambda s: s.lower() not in ("", "0", "false", "no"), default=True) + parser.add_argument("--global_latent_dim", type=int, default=200) + parser.add_argument("--kfold_idx", type=int, default=2) + + # cell type + parser.add_argument("--use_cell_types", type=lambda s: s.lower() not in ("", "0", "false", "no"), default=False) + # encode combos as unique (BooleanOptionalAction consumes no value, so no type= is passed) + parser.add_argument( + "--encode_combos_as_unique", + help="Enable with --encode_combos_as_unique, disable with --no-encode_combos_as_unique.", + default=False, + action=argparse.BooleanOptionalAction, + ) + # perturbation key if using perturbations + parser.add_argument("--perturbation_key", type=str, default="perturbation_name") + # control key(s): one or more labels, always collected into a list so CLI input has the same type as the default + parser.add_argument("--control_key", type=str, nargs="+", default=["control"]) + # prior to use: None, stringdb + parser.add_argument("--prior", type=str, default="stringdb") + # device to use + parser.add_argument("--device", type=str, default="cuda:0") + # name of training run + parser.add_argument("--experiment_name", type=str, default="pertspectra_norman") + + parser.add_argument( + "--anndata_s3_url", + type=str, + default="s3://pert-spectra/norman_adata_spectra.h5ad", + ) + parser.add_argument( + "--model_save_s3_url", + type=str, + default="s3://pert-spectra/PertSpectra_checkpoints/", + ) + args = parser.parse_known_args()[0] + + train(args) diff --git a/PertSpectra_training_scripts/train_PertSpectra_replogle.py b/PertSpectra_training_scripts/train_PertSpectra_replogle.py new file mode 100644 index 0000000..2ddaacf --- /dev/null +++ b/PertSpectra_training_scripts/train_PertSpectra_replogle.py @@ -0,0 +1,172 @@ +import argparse +import sys + +import numpy as np +import pandas as pd 
+import torch + +sys.path.append("..") +from src.Spectra import Spectra_Pert as spc_linear +from src.Spectra.Spectra_Pert import ( + vectorize_perts, +) +from utils import ( + filter_noisy_genes, + generate_loss_weights, + read_aws_h5ad, + set_seed, + split_data_by_cell, + write_model_pickle_to_s3, +) + + +def train(args): + # read in anndata containing prior graph + unfilterd_adata = read_aws_h5ad(args.anndata_s3_url) + adata = filter_noisy_genes(unfilterd_adata) + adata.layers["logcounts"] = adata.X.copy() + adata.X = adata.X.todense() + device = torch.device(args.device) + gene_network = adata.uns["sparse_gene_network"].todense() + if args.prior == "None": + gene_network = np.zeros(gene_network.shape) + + # filter adata to perturbations with at least 50 samples + obs_df = pd.DataFrame(adata.obs[args.perturbation_key]) + category_counts = obs_df[args.perturbation_key].value_counts() + filtered_categories = category_counts[category_counts >= 50].index + adata = adata[adata.obs[args.perturbation_key].isin(filtered_categories)] + + # replogle specific - filter to strong perturbations as done by Lopez et al, 2022 + filtered_replogle = read_aws_h5ad( + "s3://pert-spectra/svae_replogle_k562_gw_dataset_with_nt_guides.h5ad" + ) + filtered_perts = set(filtered_replogle.obs[args.perturbation_key].unique()).union( + set(["SKP2", "CUL1", "UBE2N"]) + ) + adata = adata[adata.obs[args.perturbation_key].isin(filtered_perts)] + + # init model parameters + L = args.global_latent_dim + adj_matrices = gene_network.copy() + vocab = adata.var_names + word2id = dict((v, idx) for idx, v in enumerate(vocab)) + loss_weights = generate_loss_weights(adata, args.perturbation_key) + X = adata.X + + # create binary perturbation label matrix from data + if args.perturbation_key in adata.obs: + D, pert_labels = vectorize_perts(adata, args.perturbation_key, args.control_key) + pert_idx = np.array( + [ + adata.var_names.get_loc(i.split("_")[1]) + if i.split("_")[1] in adata.var_names + else -1 + 
for i in pert_labels + ] + ) + # add ctrl one-hot-encoding; control_key may be a list of labels or a single string, so normalise before matching (comparing a label with a list via == is always False) + control_keys = np.atleast_1d(args.control_key) + ctrl_vector = ( + adata.obs[args.perturbation_key] + .isin(control_keys) + .to_numpy(dtype=np.float64) + ) + basal_vector = np.array([1.0 for i in adata.obs[args.perturbation_key]]) + D = np.concatenate( + [D, ctrl_vector.reshape(len(ctrl_vector), 1)], axis=1 + ).astype(np.float32) + D = np.concatenate( + [D, basal_vector.reshape(len(basal_vector), 1)], axis=1 + ).astype(np.float32) + pert_idx = np.append(pert_idx, [-1, -1]) + pert_labels = pert_labels + ["ctrl", "basal"] + print(D.shape) + else: + print("Perturbation key not found or not given!") + D = np.array([]) + pert_idx = np.array([]) + + # init Spectra wrapper + wrapper = spc_linear.SPECTRA_Model( + X=X, + labels=None, + pert_idx=pert_idx, + pert_labels=pert_labels, + L=L, + vocab=vocab, + adj_matrix=adj_matrices, + use_weights=args.use_weights, + lam=args.lam, + psi=args.psi, + delta=args.delta, + kappa=None, + rho=args.rho, + use_cell_types=args.use_cell_types, + ) + wrapper.initialize(None, word2id, adata.X, 0) + wrapper.internal_model.to(device) + + # train-val-test split + train_idx, val_idx, _ = split_data_by_cell(X, adata.obs[args.perturbation_key]) + X_train = X[train_idx] + D_train = D[train_idx] + loss_weights_train = loss_weights[train_idx] + X_val = X[val_idx] + D_val = D[val_idx] + loss_weights_val = loss_weights[val_idx] + + # train model + train_loss, val_loss = wrapper.train( + X_train, D_train, loss_weights_train, X_val, D_val, loss_weights_val + ) + # save to s3 + write_model_pickle_to_s3( + s3_url=args.model_save_s3_url + args.experiment_name + "/", + model_name="replogle", + model=wrapper, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run pertspectra experiment.") + # Passing in hyperparameters as arguments. Boolean values are parsed from their text because type=bool maps every non-empty string (even "False") to True. 
+ parser.add_argument("--weight_decay", type=float, default=0.001) + parser.add_argument("--lam", type=float, default=1) + parser.add_argument("--psi", type=float, default=0.01) + parser.add_argument("--delta", type=float, default=0.001) + parser.add_argument("--kappa", type=float, default=0.00001) + parser.add_argument("--rho", type=float, default=0.001) + parser.add_argument("--use_highly_variable", type=lambda s: s.lower() not in ("", "0", "false", "no"), default=False) + parser.add_argument("--use_weights", type=lambda s: s.lower() not in ("", "0", "false", "no"), default=True) + parser.add_argument("--global_latent_dim", type=int, default=300) + parser.add_argument("--seed", type=int, default=0) + + # cell type + parser.add_argument("--use_cell_types", type=lambda s: s.lower() not in ("", "0", "false", "no"), default=False) + # perturbation key if using perturbations + parser.add_argument("--perturbation_key", type=str, default="gene") + # control key(s): one or more labels, always collected into a list so CLI input has the same type as the default + parser.add_argument("--control_key", type=str, nargs="+", default=["non-targeting"]) + # prior to use: None, stringdb + parser.add_argument("--prior", type=str, default="stringdb") + # device to use + parser.add_argument("--device", type=str, default="cuda:0") + # name of training run + parser.add_argument("--experiment_name", type=str, default="pertspectra_replogle") + + parser.add_argument( + "--anndata_s3_url", + type=str, + default="s3://pert-spectra/replogle_adata_spectra.h5ad", + ) + parser.add_argument( + "--model_save_s3_url", + type=str, + default="s3://pert-spectra/PertSpectra_checkpoints/", + ) + args = parser.parse_known_args()[0] + # set seed for reproducibility; uses --seed, whose default 0 equals the previously hard-coded value + set_seed(args.seed) + + train(args) diff --git a/README.md b/README.md new file mode 100644 index 0000000..ad9ee79 --- /dev/null +++ b/README.md @@ -0,0 +1,78 @@ +# PertSpectra: Guided triplet factor analysis of perturb-seq data with a prior +Factor analysis model for perturb-seq data, guided by perturbation labels and prior graph regularization +Code accompanying [TODO: paper link here]() + +## Abstract +Computational modeling of interventional data is a rapidly expanding area of machine learning. 
In drug discovery, measuring the effects of genetic +interventions on cells is important for characterizing unknown disease mechanisms, but interpreting the resulting measurements remains a challenging task. Reliable latent space interpretability and modeling interactions between interventions are key areas of improvement for current models +in the literature. Therefore, we propose PertSpectra, an extension of the previously described factor analysis method Spectra (Kunes et al., 2024) to +explicitly support interventional data. PertSpectra leverages Spectra’s differentiable graph regularization to incorporate prior biological knowledge +to generate sparse, biologically relevant latent factors that capture perturbational effects. We assess PertSpectra on three single-cell sequencing +datasets with genetic perturbations, measuring latent space interpretability, predictive ability on unseen combinations of perturbations, and identification of perturbations of similar biological function. We show that PertSpectra provides an integrated modeling approach to understanding +combinatorial interventional data in the context of drug discovery. + +## Directory Overview +Outline of the organization of the codebase. +* `.`: Contains notebooks and helper functions for downstream analyses +* `./src`: Contains the PertSpectra code, edited from the Spectra codebase +* `./data`: Contains instructions for downloading datasets used in analysis +* `./data_preprocessing`: Contains notebooks for preprocessing the input data +* `./PertSpectra_training_scripts`: Contains training scripts for PertSpectra +* `./PertSpectra_load_checkpoints`: Contains notebooks for loading trained PertSpectra models from checkpoints +* `./scETM`: Contains notebooks for training scETM +* `./GSFA`: Contains notebooks for preprocessing and training GSFA +* `./figures`: Contains notebooks for figures + +## Environment Setup +PertSpectra has been tested on Linux systems. 
+ +Please execute: + +``` +make install +``` + +This will generate a conda environment called `pertspectra` and an associated jupyter kernel that +can be used to execute the notebooks in this code repository. + + +## Data / Setup +Data preprocessing and setup. +### Gene expression normalization +The model expects log-normalized count data. Ensure that the log-normalized expression is either in the `.X` field or `.layers` field of the anndata. + +### Stringdb graph pruning +The model accepts an adjacency matrix as a prior to regularize against during training. Currently, the model regularizes against a stringdb prior graph - the notebooks to subset the stringdb graph to the input genes measured in the perturb-seq experiment are located in the `prior_graph_preprocessing/` subdirectory. Otherwise, create any prior graph as desired as a sparse adjacency matrix and store under `.uns["sparse_gene_network"]` in the anndata. + +## Training +Overview of training and saving the model. +### Launching training runs +Training scripts for PertSpectra are located in `./PertSpectra_training_scripts`. Run +``` python3 [training script] [args]``` +to launch a training run. + +### Model loading +The model can be loaded with a helper function in `utils.py` (refer to `utils.py` for details): +``` +wrapper, adata = load_model( + adata=adata, + s3_dir='s3_directory_where_training_runs_are_stored', + experiment_name='folder_in_s3_directory_where_training_run_is_located', + model_name='name_of_saved_model_pickle', + markers_top_n=50, + use_cell_types=False, + ) +``` +From the loaded model, we also need to reconstruct the binarized perturbation matrix (cell x perturbation), as the binarization may differ across different models. For details on reconstructing the binarized perturbation matrix, refer to the analysis notebooks. 
+ +The returned anndata from `load_model` also saves the following outputs from the model: +* `adata.uns['SPECTRA_pert_scores']` stores the learned perturbation-level factors +* `adata.uns['SPECTRA_factors']` stores the learned gene-level factors + +For full details on loading saved PertSpectra models, see `./PertSpectra_load_checkpoints`. + +## Downstream analysis +Please reference the Jupyter notebooks in the main directory for all code relating to downstream analysis. + +## Figure Generation +Code to generate figures based on the downstream analyses located in `./figures`. diff --git a/conda-lock.yml b/conda-lock.yml new file mode 100644 index 0000000..45ce8fe --- /dev/null +++ b/conda-lock.yml @@ -0,0 +1,1254 @@ +# This lock file was generated by conda-lock (https://github.com/conda/conda-lock). DO NOT EDIT! +# +# A "lock file" contains a concrete list of package versions (with checksums) to be installed. Unlike +# e.g. `conda env create`, the resulting environment will not change as new package versions become +# available, unless you explicitly update the lock file. +# +# Install this environment as "YOURENV" with: +# conda-lock install -n YOURENV conda-lock.yml +# To update a single package to the latest version compatible with the version constraints in the source: +# conda-lock lock --lockfile conda-lock.yml --update PACKAGE +# To re-solve the entire environment, e.g. 
after changing a version constraint in the source file: +# conda-lock -f /home/ubuntu/PertSpectra/environment.yaml -f environment.yaml --lockfile conda-lock.yml +version: 1 +metadata: + content_hash: + linux-64: 3ac846264487603b3725735229011bbaed4344c1650661cadb7bb4dcbb5bc09a + osx-64: 57072fde0fa71f96db2a7fc7b3e6423f1288587cadd631dbf726f1e93ceac827 + channels: + - url: conda-forge + used_env_vars: [] + - url: bioconda + used_env_vars: [] + - url: defaults + used_env_vars: [] + platforms: + - linux-64 + - osx-64 + sources: + - /home/ubuntu/PertSpectra/environment.yaml + - environment.yaml +package: +- name: _libgcc_mutex + version: '0.1' + manager: conda + platform: linux-64 + dependencies: {} + url: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 + hash: + md5: d7c89558ba9fa0495403155b64376d81 + sha256: fe51de6107f9edc7aa4f786a70f4a883943bc9d39b3bb7307c04c41410990726 + category: main + optional: false +- name: _openmp_mutex + version: '4.5' + manager: conda + platform: linux-64 + dependencies: + _libgcc_mutex: '0.1' + libgomp: '>=7.5.0' + url: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2 + hash: + md5: 73aaf86a425cc6e73fcf236a5a46396d + sha256: fbe2c5e56a653bebb982eda4876a9178aedfc2b545f25d0ce9c4c0b508253d22 + category: main + optional: false +- name: bzip2 + version: 1.0.8 + manager: conda + platform: linux-64 + dependencies: + __glibc: '>=2.17,<3.0.a0' + libgcc-ng: '>=12' + url: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda + hash: + md5: 62ee74e96c5ebb0af99386de58cf9553 + sha256: 5ced96500d945fb286c9c838e54fa759aa04a7129c59800f0846b4335cee770d + category: main + optional: false +- name: bzip2 + version: 1.0.8 + manager: conda + platform: osx-64 + dependencies: + __osx: '>=10.13' + url: https://conda.anaconda.org/conda-forge/osx-64/bzip2-1.0.8-hfdf4475_7.conda + hash: + md5: 7ed4301d437b59045be7e051a0308211 + sha256: 
cad153608b81fb24fc8c509357daa9ae4e49dfc535b2cb49b91e23dbd68fc3c5 + category: main + optional: false +- name: ca-certificates + version: 2024.12.14 + manager: conda + platform: linux-64 + dependencies: {} + url: https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.12.14-hbcca054_0.conda + hash: + md5: 720523eb0d6a9b0f6120c16b2aa4e7de + sha256: 1afd7274cbc9a334d6d0bc62fa760acc7afdaceb0b91a8df370ec01fd75dc7dd + category: main + optional: false +- name: ca-certificates + version: 2024.12.14 + manager: conda + platform: osx-64 + dependencies: {} + url: https://conda.anaconda.org/conda-forge/osx-64/ca-certificates-2024.12.14-h8857fd0_0.conda + hash: + md5: b7b887091c99ed2e74845e75e9128410 + sha256: ddaafdcd1b8ace6ffeea22b6824ca9db8a64cf0a2652a11d7554ece54935fa06 + category: main + optional: false +- name: cffi + version: 1.17.1 + manager: conda + platform: linux-64 + dependencies: + __glibc: '>=2.17,<3.0.a0' + libffi: '>=3.4,<4.0a0' + libgcc: '>=13' + pycparser: '' + python: '>=3.11,<3.12.0a0' + python_abi: 3.11.* + url: https://conda.anaconda.org/conda-forge/linux-64/cffi-1.17.1-py311hf29c0ef_0.conda + hash: + md5: 55553ecd5328336368db611f350b7039 + sha256: bc47aa39c8254e9e487b8bcd74cfa3b4a3de3648869eb1a0b89905986b668e35 + category: main + optional: false +- name: cffi + version: 1.17.1 + manager: conda + platform: osx-64 + dependencies: + __osx: '>=10.13' + libffi: '>=3.4,<4.0a0' + pycparser: '' + python: '>=3.11,<3.12.0a0' + python_abi: 3.11.* + url: https://conda.anaconda.org/conda-forge/osx-64/cffi-1.17.1-py311h137bacd_0.conda + hash: + md5: a4b0f531064fa3dd5e3afbb782ea2cd5 + sha256: 012ee7b1ed4f9b0490d6e90c72decf148d7575173c7eaf851cd87fd434d2cacc + category: main + optional: false +- name: cfgv + version: 3.3.1 + manager: conda + platform: linux-64 + dependencies: + python: '>=3.9' + url: https://conda.anaconda.org/conda-forge/noarch/cfgv-3.3.1-pyhd8ed1ab_1.conda + hash: + md5: 57df494053e17dce2ac3a0b33e1b2a2e + sha256: 
d5696636733b3c301054b948cdd793f118efacce361d9bd4afb57d5980a9064f + category: main + optional: false +- name: cfgv + version: 3.3.1 + manager: conda + platform: osx-64 + dependencies: + python: '>=3.9' + url: https://conda.anaconda.org/conda-forge/noarch/cfgv-3.3.1-pyhd8ed1ab_1.conda + hash: + md5: 57df494053e17dce2ac3a0b33e1b2a2e + sha256: d5696636733b3c301054b948cdd793f118efacce361d9bd4afb57d5980a9064f + category: main + optional: false +- name: colorama + version: 0.4.6 + manager: conda + platform: linux-64 + dependencies: + python: '>=3.9' + url: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda + hash: + md5: 962b9857ee8e7018c22f2776ffa0b2d7 + sha256: ab29d57dc70786c1269633ba3dff20288b81664d3ff8d21af995742e2bb03287 + category: main + optional: false +- name: colorama + version: 0.4.6 + manager: conda + platform: osx-64 + dependencies: + python: '>=3.9' + url: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda + hash: + md5: 962b9857ee8e7018c22f2776ffa0b2d7 + sha256: ab29d57dc70786c1269633ba3dff20288b81664d3ff8d21af995742e2bb03287 + category: main + optional: false +- name: coverage + version: 7.6.10 + manager: conda + platform: linux-64 + dependencies: + __glibc: '>=2.17,<3.0.a0' + libgcc: '>=13' + python: '>=3.11,<3.12.0a0' + python_abi: 3.11.* + tomli: '' + url: https://conda.anaconda.org/conda-forge/linux-64/coverage-7.6.10-py311h2dc5d0c_0.conda + hash: + md5: 2a772b30e69ba8319651e9f3ab01608f + sha256: c5782231c9255f0492728bfb74ebcddf2dd8f5561d4f792d9d186d9d360242b8 + category: main + optional: false +- name: coverage + version: 7.6.10 + manager: conda + platform: osx-64 + dependencies: + __osx: '>=10.13' + python: '>=3.11,<3.12.0a0' + python_abi: 3.11.* + tomli: '' + url: https://conda.anaconda.org/conda-forge/osx-64/coverage-7.6.10-py311ha3cf9ac_0.conda + hash: + md5: 79facfcf4bd19a0c0beb0041ee307db0 + sha256: b32d8dfaaff9938aed6be84421a7d3499c205347ee177df5dee484c8d4a7e33a + category: main + 
optional: false +- name: distlib + version: 0.3.9 + manager: conda + platform: linux-64 + dependencies: + python: '>=3.9' + url: https://conda.anaconda.org/conda-forge/noarch/distlib-0.3.9-pyhd8ed1ab_1.conda + hash: + md5: 8d88f4a2242e6b96f9ecff9a6a05b2f1 + sha256: 0e160c21776bd881b79ce70053e59736f51036784fa43a50da10a04f0c1b9c45 + category: main + optional: false +- name: distlib + version: 0.3.9 + manager: conda + platform: osx-64 + dependencies: + python: '>=3.9' + url: https://conda.anaconda.org/conda-forge/noarch/distlib-0.3.9-pyhd8ed1ab_1.conda + hash: + md5: 8d88f4a2242e6b96f9ecff9a6a05b2f1 + sha256: 0e160c21776bd881b79ce70053e59736f51036784fa43a50da10a04f0c1b9c45 + category: main + optional: false +- name: exceptiongroup + version: 1.2.2 + manager: conda + platform: linux-64 + dependencies: + python: '>=3.9' + url: https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.2.2-pyhd8ed1ab_1.conda + hash: + md5: a16662747cdeb9abbac74d0057cc976e + sha256: cbde2c64ec317118fc06b223c5fd87c8a680255e7348dd60e7b292d2e103e701 + category: main + optional: false +- name: exceptiongroup + version: 1.2.2 + manager: conda + platform: osx-64 + dependencies: + python: '>=3.9' + url: https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.2.2-pyhd8ed1ab_1.conda + hash: + md5: a16662747cdeb9abbac74d0057cc976e + sha256: cbde2c64ec317118fc06b223c5fd87c8a680255e7348dd60e7b292d2e103e701 + category: main + optional: false +- name: filelock + version: 3.16.1 + manager: conda + platform: linux-64 + dependencies: + python: '>=3.9' + url: https://conda.anaconda.org/conda-forge/noarch/filelock-3.16.1-pyhd8ed1ab_1.conda + hash: + md5: d692e9ba6f92dc51484bf3477e36ce7c + sha256: 18dca6e2194732df7ebf824abaefe999e4765ebe8e8a061269406ab88fc418b9 + category: main + optional: false +- name: filelock + version: 3.16.1 + manager: conda + platform: osx-64 + dependencies: + python: '>=3.9' + url: https://conda.anaconda.org/conda-forge/noarch/filelock-3.16.1-pyhd8ed1ab_1.conda + hash: + 
md5: d692e9ba6f92dc51484bf3477e36ce7c + sha256: 18dca6e2194732df7ebf824abaefe999e4765ebe8e8a061269406ab88fc418b9 + category: main + optional: false +- name: identify + version: 2.6.5 + manager: conda + platform: linux-64 + dependencies: + python: '>=3.9' + ukkonen: '' + url: https://conda.anaconda.org/conda-forge/noarch/identify-2.6.5-pyhd8ed1ab_0.conda + hash: + md5: c1b0f663ff141265d1be1242259063f0 + sha256: e8ea11b8e39a98a9c34efb5c21c3fca718e31e1f41fd9ae5f6918b8eb402da59 + category: main + optional: false +- name: identify + version: 2.6.5 + manager: conda + platform: osx-64 + dependencies: + ukkonen: '' + python: '>=3.9' + url: https://conda.anaconda.org/conda-forge/noarch/identify-2.6.5-pyhd8ed1ab_0.conda + hash: + md5: c1b0f663ff141265d1be1242259063f0 + sha256: e8ea11b8e39a98a9c34efb5c21c3fca718e31e1f41fd9ae5f6918b8eb402da59 + category: main + optional: false +- name: iniconfig + version: 2.0.0 + manager: conda + platform: linux-64 + dependencies: + python: '>=3.9' + url: https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.0.0-pyhd8ed1ab_1.conda + hash: + md5: 6837f3eff7dcea42ecd714ce1ac2b108 + sha256: 0ec8f4d02053cd03b0f3e63168316530949484f80e16f5e2fb199a1d117a89ca + category: main + optional: false +- name: iniconfig + version: 2.0.0 + manager: conda + platform: osx-64 + dependencies: + python: '>=3.9' + url: https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.0.0-pyhd8ed1ab_1.conda + hash: + md5: 6837f3eff7dcea42ecd714ce1ac2b108 + sha256: 0ec8f4d02053cd03b0f3e63168316530949484f80e16f5e2fb199a1d117a89ca + category: main + optional: false +- name: ld_impl_linux-64 + version: '2.43' + manager: conda + platform: linux-64 + dependencies: + __glibc: '>=2.17,<3.0.a0' + url: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.43-h712a8e2_2.conda + hash: + md5: 048b02e3962f066da18efe3a21b77672 + sha256: 7c91cea91b13f4314d125d1bedb9d03a29ebbd5080ccdea70260363424646dbe + category: main + optional: false +- name: libcxx + version: 19.1.6 + 
manager: conda + platform: osx-64 + dependencies: + __osx: '>=10.13' + url: https://conda.anaconda.org/conda-forge/osx-64/libcxx-19.1.6-hf95d169_1.conda + hash: + md5: 1bad6c181a0799298aad42fc5a7e98b7 + sha256: c40661648c34c08e21b69e0eec021ccaf090ffff070d2a9cbcb1519e1b310568 + category: main + optional: false +- name: libexpat + version: 2.6.4 + manager: conda + platform: linux-64 + dependencies: + __glibc: '>=2.17,<3.0.a0' + libgcc: '>=13' + url: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.6.4-h5888daf_0.conda + hash: + md5: db833e03127376d461e1e13e76f09b6c + sha256: 56541b98447b58e52d824bd59d6382d609e11de1f8adf20b23143e353d2b8d26 + category: main + optional: false +- name: libexpat + version: 2.6.4 + manager: conda + platform: osx-64 + dependencies: + __osx: '>=10.13' + url: https://conda.anaconda.org/conda-forge/osx-64/libexpat-2.6.4-h240833e_0.conda + hash: + md5: 20307f4049a735a78a29073be1be2626 + sha256: d10f43d0c5df6c8cf55259bce0fe14d2377eed625956cddce06f58827d288c59 + category: main + optional: false +- name: libffi + version: 3.4.2 + manager: conda + platform: linux-64 + dependencies: + libgcc-ng: '>=9.4.0' + url: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.2-h7f98852_5.tar.bz2 + hash: + md5: d645c6d2ac96843a2bfaccd2d62b3ac3 + sha256: ab6e9856c21709b7b517e940ae7028ae0737546122f83c2aa5d692860c3b149e + category: main + optional: false +- name: libffi + version: 3.4.2 + manager: conda + platform: osx-64 + dependencies: {} + url: https://conda.anaconda.org/conda-forge/osx-64/libffi-3.4.2-h0d85af4_5.tar.bz2 + hash: + md5: ccb34fb14960ad8b125962d3d79b31a9 + sha256: 7a2d27a936ceee6942ea4d397f9c7d136f12549d86f7617e8b6bad51e01a941f + category: main + optional: false +- name: libgcc + version: 14.2.0 + manager: conda + platform: linux-64 + dependencies: + _libgcc_mutex: '0.1' + _openmp_mutex: '>=4.5' + url: https://conda.anaconda.org/conda-forge/linux-64/libgcc-14.2.0-h77fa898_1.conda + hash: + md5: 3cb76c3f10d3bc7f1105b2fc9db984df + 
sha256: 53eb8a79365e58849e7b1a068d31f4f9e718dc938d6f2c03e960345739a03569 + category: main + optional: false +- name: libgcc-ng + version: 14.2.0 + manager: conda + platform: linux-64 + dependencies: + libgcc: 14.2.0 + url: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-14.2.0-h69a702a_1.conda + hash: + md5: e39480b9ca41323497b05492a63bc35b + sha256: 3a76969c80e9af8b6e7a55090088bc41da4cffcde9e2c71b17f44d37b7cb87f7 + category: main + optional: false +- name: libgomp + version: 14.2.0 + manager: conda + platform: linux-64 + dependencies: + _libgcc_mutex: '0.1' + url: https://conda.anaconda.org/conda-forge/linux-64/libgomp-14.2.0-h77fa898_1.conda + hash: + md5: cc3573974587f12dda90d96e3e55a702 + sha256: 1911c29975ec99b6b906904040c855772ccb265a1c79d5d75c8ceec4ed89cd63 + category: main + optional: false +- name: liblzma + version: 5.6.3 + manager: conda + platform: linux-64 + dependencies: + __glibc: '>=2.17,<3.0.a0' + libgcc: '>=13' + url: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.6.3-hb9d3cd8_1.conda + hash: + md5: 2ecf2f1c7e4e21fcfe6423a51a992d84 + sha256: e6e425252f3839e2756e4af1ea2074dffd3396c161bf460629f9dfd6a65f15c6 + category: main + optional: false +- name: liblzma + version: 5.6.3 + manager: conda + platform: osx-64 + dependencies: + __osx: '>=10.13' + url: https://conda.anaconda.org/conda-forge/osx-64/liblzma-5.6.3-hd471939_1.conda + hash: + md5: f9e9205fed9c664421c1c09f0b90ce6d + sha256: c70639ff3cb034a8e31cb081c907879b6a639bb12b0e090069a68eb69125b10e + category: main + optional: false +- name: libnsl + version: 2.0.1 + manager: conda + platform: linux-64 + dependencies: + libgcc-ng: '>=12' + url: https://conda.anaconda.org/conda-forge/linux-64/libnsl-2.0.1-hd590300_0.conda + hash: + md5: 30fd6e37fe21f86f4bd26d6ee73eeec7 + sha256: 26d77a3bb4dceeedc2a41bd688564fe71bf2d149fdcf117049970bc02ff1add6 + category: main + optional: false +- name: libsqlite + version: 3.47.2 + manager: conda + platform: linux-64 + dependencies: + __glibc: 
'>=2.17,<3.0.a0' + libgcc: '>=13' + libzlib: '>=1.3.1,<2.0a0' + url: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.47.2-hee588c1_0.conda + hash: + md5: b58da17db24b6e08bcbf8fed2fb8c915 + sha256: 48af21ebc2cbf358976f1e0f4a0ab9e91dfc83d0ef337cf3837c6f5bc22fb352 + category: main + optional: false +- name: libsqlite + version: 3.47.2 + manager: conda + platform: osx-64 + dependencies: + __osx: '>=10.13' + libzlib: '>=1.3.1,<2.0a0' + url: https://conda.anaconda.org/conda-forge/osx-64/libsqlite-3.47.2-hdb6dae5_0.conda + hash: + md5: 44d9799fda97eb34f6d88ac1e3eb0ea6 + sha256: 4d5e188d921f93c97ce172fc8c4341e8171670ec98d76f9961f65f6306fcda77 + category: main + optional: false +- name: libstdcxx + version: 14.2.0 + manager: conda + platform: linux-64 + dependencies: + libgcc: 14.2.0 + url: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-14.2.0-hc0a3c3a_1.conda + hash: + md5: 234a5554c53625688d51062645337328 + sha256: 4661af0eb9bdcbb5fb33e5d0023b001ad4be828fccdcc56500059d56f9869462 + category: main + optional: false +- name: libuuid + version: 2.38.1 + manager: conda + platform: linux-64 + dependencies: + libgcc-ng: '>=12' + url: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda + hash: + md5: 40b61aab5c7ba9ff276c41cfffe6b80b + sha256: 787eb542f055a2b3de553614b25f09eefb0a0931b0c87dbcce6efdfd92f04f18 + category: main + optional: false +- name: libxcrypt + version: 4.4.36 + manager: conda + platform: linux-64 + dependencies: + libgcc-ng: '>=12' + url: https://conda.anaconda.org/conda-forge/linux-64/libxcrypt-4.4.36-hd590300_1.conda + hash: + md5: 5aa797f8787fe7a17d1b0821485b5adc + sha256: 6ae68e0b86423ef188196fff6207ed0c8195dd84273cb5623b85aa08033a410c + category: main + optional: false +- name: libzlib + version: 1.3.1 + manager: conda + platform: linux-64 + dependencies: + __glibc: '>=2.17,<3.0.a0' + libgcc: '>=13' + url: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda + hash: + md5: 
edb0dca6bc32e4f4789199455a1dbeb8 + sha256: d4bfe88d7cb447768e31650f06257995601f89076080e76df55e3112d4e47dc4 + category: main + optional: false +- name: libzlib + version: 1.3.1 + manager: conda + platform: osx-64 + dependencies: + __osx: '>=10.13' + url: https://conda.anaconda.org/conda-forge/osx-64/libzlib-1.3.1-hd23fc13_2.conda + hash: + md5: 003a54a4e32b02f7355b50a837e699da + sha256: 8412f96504fc5993a63edf1e211d042a1fd5b1d51dedec755d2058948fcced09 + category: main + optional: false +- name: ncurses + version: '6.5' + manager: conda + platform: linux-64 + dependencies: + __glibc: '>=2.17,<3.0.a0' + libgcc-ng: '>=12' + url: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-he02047a_1.conda + hash: + md5: 70caf8bb6cf39a0b6b7efc885f51c0fe + sha256: 6a1d5d8634c1a07913f1c525db6455918cbc589d745fac46d9d6e30340c8731a + category: main + optional: false +- name: ncurses + version: '6.5' + manager: conda + platform: osx-64 + dependencies: + __osx: '>=10.13' + url: https://conda.anaconda.org/conda-forge/osx-64/ncurses-6.5-hf036a51_1.conda + hash: + md5: e102bbf8a6ceeaf429deab8032fc8977 + sha256: b0b3180039ef19502525a2abd5833c00f9624af830fd391f851934d57bffb9af + category: main + optional: false +- name: nodeenv + version: 1.9.1 + manager: conda + platform: linux-64 + dependencies: + python: '>=3.9' + setuptools: '' + url: https://conda.anaconda.org/conda-forge/noarch/nodeenv-1.9.1-pyhd8ed1ab_1.conda + hash: + md5: 7ba3f09fceae6a120d664217e58fe686 + sha256: 3636eec0e60466a00069b47ce94b6d88b01419b6577d8e393da44bb5bc8d3468 + category: main + optional: false +- name: nodeenv + version: 1.9.1 + manager: conda + platform: osx-64 + dependencies: + setuptools: '' + python: '>=3.9' + url: https://conda.anaconda.org/conda-forge/noarch/nodeenv-1.9.1-pyhd8ed1ab_1.conda + hash: + md5: 7ba3f09fceae6a120d664217e58fe686 + sha256: 3636eec0e60466a00069b47ce94b6d88b01419b6577d8e393da44bb5bc8d3468 + category: main + optional: false +- name: openssl + version: 3.4.0 + manager: conda + 
platform: linux-64 + dependencies: + __glibc: '>=2.17,<3.0.a0' + ca-certificates: '' + libgcc: '>=13' + url: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.4.0-h7b32b05_1.conda + hash: + md5: 4ce6875f75469b2757a65e10a5d05e31 + sha256: f62f6bca4a33ca5109b6d571b052a394d836956d21b25b7ffd03376abf7a481f + category: main + optional: false +- name: openssl + version: 3.4.0 + manager: conda + platform: osx-64 + dependencies: + __osx: '>=10.13' + ca-certificates: '' + url: https://conda.anaconda.org/conda-forge/osx-64/openssl-3.4.0-hc426f3f_1.conda + hash: + md5: eaae23dbfc9ec84775097898526c72ea + sha256: 879a960d586cf8a64131ac0c060ef575cfb8aa9f6813093cba92042a86ee867c + category: main + optional: false +- name: packaging + version: '24.2' + manager: conda + platform: linux-64 + dependencies: + python: '>=3.8' + url: https://conda.anaconda.org/conda-forge/noarch/packaging-24.2-pyhd8ed1ab_2.conda + hash: + md5: 3bfed7e6228ebf2f7b9eaa47f1b4e2aa + sha256: da157b19bcd398b9804c5c52fc000fcb8ab0525bdb9c70f95beaa0bb42f85af1 + category: main + optional: false +- name: packaging + version: '24.2' + manager: conda + platform: osx-64 + dependencies: + python: '>=3.8' + url: https://conda.anaconda.org/conda-forge/noarch/packaging-24.2-pyhd8ed1ab_2.conda + hash: + md5: 3bfed7e6228ebf2f7b9eaa47f1b4e2aa + sha256: da157b19bcd398b9804c5c52fc000fcb8ab0525bdb9c70f95beaa0bb42f85af1 + category: main + optional: false +- name: pip + version: 24.3.1 + manager: conda + platform: linux-64 + dependencies: + python: '>=3.9,<3.13.0a0' + setuptools: '' + wheel: '' + url: https://conda.anaconda.org/conda-forge/noarch/pip-24.3.1-pyh8b19718_2.conda + hash: + md5: 04e691b9fadd93a8a9fad87a81d4fd8f + sha256: da8c8888de10c1e4234ebcaa1550ac2b4b5408ac20f093fe641e4bc8c9c9f3eb + category: main + optional: false +- name: pip + version: 24.3.1 + manager: conda + platform: osx-64 + dependencies: + setuptools: '' + wheel: '' + python: '>=3.9,<3.13.0a0' + url: 
https://conda.anaconda.org/conda-forge/noarch/pip-24.3.1-pyh8b19718_2.conda + hash: + md5: 04e691b9fadd93a8a9fad87a81d4fd8f + sha256: da8c8888de10c1e4234ebcaa1550ac2b4b5408ac20f093fe641e4bc8c9c9f3eb + category: main + optional: false +- name: platformdirs + version: 4.3.6 + manager: conda + platform: linux-64 + dependencies: + python: '>=3.9' + url: https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.3.6-pyhd8ed1ab_1.conda + hash: + md5: 577852c7e53901ddccc7e6a9959ddebe + sha256: bb50f6499e8bc1d1a26f17716c97984671121608dc0c3ecd34858112bce59a27 + category: main + optional: false +- name: platformdirs + version: 4.3.6 + manager: conda + platform: osx-64 + dependencies: + python: '>=3.9' + url: https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.3.6-pyhd8ed1ab_1.conda + hash: + md5: 577852c7e53901ddccc7e6a9959ddebe + sha256: bb50f6499e8bc1d1a26f17716c97984671121608dc0c3ecd34858112bce59a27 + category: main + optional: false +- name: pluggy + version: 1.5.0 + manager: conda + platform: linux-64 + dependencies: + python: '>=3.9' + url: https://conda.anaconda.org/conda-forge/noarch/pluggy-1.5.0-pyhd8ed1ab_1.conda + hash: + md5: e9dcbce5f45f9ee500e728ae58b605b6 + sha256: 122433fc5318816b8c69283aaf267c73d87aa2d09ce39f64c9805c9a3b264819 + category: main + optional: false +- name: pluggy + version: 1.5.0 + manager: conda + platform: osx-64 + dependencies: + python: '>=3.9' + url: https://conda.anaconda.org/conda-forge/noarch/pluggy-1.5.0-pyhd8ed1ab_1.conda + hash: + md5: e9dcbce5f45f9ee500e728ae58b605b6 + sha256: 122433fc5318816b8c69283aaf267c73d87aa2d09ce39f64c9805c9a3b264819 + category: main + optional: false +- name: pre-commit + version: 4.0.1 + manager: conda + platform: linux-64 + dependencies: + cfgv: '>=2.0.0' + identify: '>=1.0.0' + nodeenv: '>=0.11.1' + python: '>=3.9' + pyyaml: '>=5.1' + virtualenv: '>=20.10.0' + url: https://conda.anaconda.org/conda-forge/noarch/pre-commit-4.0.1-pyha770c72_1.conda + hash: + md5: d0ea6ed474bf7f6db88fc85e6dc809b1 + 
sha256: 3cfe4c777f1bb3f869cefd732357c7c657df7f0bba5c11cd64ced21e0b0a2b5b + category: main + optional: false +- name: pre-commit + version: 4.0.1 + manager: conda + platform: osx-64 + dependencies: + python: '>=3.9' + pyyaml: '>=5.1' + identify: '>=1.0.0' + nodeenv: '>=0.11.1' + cfgv: '>=2.0.0' + virtualenv: '>=20.10.0' + url: https://conda.anaconda.org/conda-forge/noarch/pre-commit-4.0.1-pyha770c72_1.conda + hash: + md5: d0ea6ed474bf7f6db88fc85e6dc809b1 + sha256: 3cfe4c777f1bb3f869cefd732357c7c657df7f0bba5c11cd64ced21e0b0a2b5b + category: main + optional: false +- name: pre_commit + version: 4.0.1 + manager: conda + platform: linux-64 + dependencies: + pre-commit: '>=4.0.1,<4.0.2.0a0' + url: https://conda.anaconda.org/conda-forge/noarch/pre_commit-4.0.1-hd8ed1ab_1.conda + hash: + md5: 6767ba12c472102bf249d00ee023ac5d + sha256: 7c57984df8a0d936b1f81762d557bc79e98769c06b79631b86980327cac2a538 + category: main + optional: false +- name: pre_commit + version: 4.0.1 + manager: conda + platform: osx-64 + dependencies: + pre-commit: '>=4.0.1,<4.0.2.0a0' + url: https://conda.anaconda.org/conda-forge/noarch/pre_commit-4.0.1-hd8ed1ab_1.conda + hash: + md5: 6767ba12c472102bf249d00ee023ac5d + sha256: 7c57984df8a0d936b1f81762d557bc79e98769c06b79631b86980327cac2a538 + category: main + optional: false +- name: pycparser + version: '2.22' + manager: conda + platform: linux-64 + dependencies: + python: '' + url: https://conda.anaconda.org/conda-forge/noarch/pycparser-2.22-pyh29332c3_1.conda + hash: + md5: 12c566707c80111f9799308d9e265aef + sha256: 79db7928d13fab2d892592223d7570f5061c192f27b9febd1a418427b719acc6 + category: main + optional: false +- name: pycparser + version: '2.22' + manager: conda + platform: osx-64 + dependencies: + python: '>=3.9' + url: https://conda.anaconda.org/conda-forge/noarch/pycparser-2.22-pyh29332c3_1.conda + hash: + md5: 12c566707c80111f9799308d9e265aef + sha256: 79db7928d13fab2d892592223d7570f5061c192f27b9febd1a418427b719acc6 + category: main + 
optional: false +- name: pytest + version: 8.3.4 + manager: conda + platform: linux-64 + dependencies: + colorama: '' + exceptiongroup: '>=1.0.0rc8' + iniconfig: '' + packaging: '' + pluggy: <2,>=1.5 + python: '>=3.9' + tomli: '>=1' + url: https://conda.anaconda.org/conda-forge/noarch/pytest-8.3.4-pyhd8ed1ab_1.conda + hash: + md5: 799ed216dc6af62520f32aa39bc1c2bb + sha256: 75245ca9d0cbd6d38bb45ec02430189a9d4c21c055c5259739d738a2298d61b3 + category: main + optional: false +- name: pytest + version: 8.3.4 + manager: conda + platform: osx-64 + dependencies: + packaging: '' + colorama: '' + iniconfig: '' + python: '>=3.9' + exceptiongroup: '>=1.0.0rc8' + tomli: '>=1' + pluggy: <2,>=1.5 + url: https://conda.anaconda.org/conda-forge/noarch/pytest-8.3.4-pyhd8ed1ab_1.conda + hash: + md5: 799ed216dc6af62520f32aa39bc1c2bb + sha256: 75245ca9d0cbd6d38bb45ec02430189a9d4c21c055c5259739d738a2298d61b3 + category: main + optional: false +- name: pytest-cov + version: 6.0.0 + manager: conda + platform: linux-64 + dependencies: + coverage: '>=7.5' + pytest: '>=4.6' + python: '>=3.9' + toml: '' + url: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-6.0.0-pyhd8ed1ab_1.conda + hash: + md5: 79963c319d1be62c8fd3e34555816e01 + sha256: 09acac1974e10a639415be4be326dd21fa6d66ca51a01fb71532263fba6dccf6 + category: main + optional: false +- name: pytest-cov + version: 6.0.0 + manager: conda + platform: osx-64 + dependencies: + toml: '' + python: '>=3.9' + pytest: '>=4.6' + coverage: '>=7.5' + url: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-6.0.0-pyhd8ed1ab_1.conda + hash: + md5: 79963c319d1be62c8fd3e34555816e01 + sha256: 09acac1974e10a639415be4be326dd21fa6d66ca51a01fb71532263fba6dccf6 + category: main + optional: false +- name: python + version: 3.11.11 + manager: conda + platform: linux-64 + dependencies: + __glibc: '>=2.17,<3.0.a0' + bzip2: '>=1.0.8,<2.0a0' + ld_impl_linux-64: '>=2.36.1' + libexpat: '>=2.6.4,<3.0a0' + libffi: '>=3.4,<4.0a0' + libgcc: '>=13' + liblzma: 
'>=5.6.3,<6.0a0' + libnsl: '>=2.0.1,<2.1.0a0' + libsqlite: '>=3.47.0,<4.0a0' + libuuid: '>=2.38.1,<3.0a0' + libxcrypt: '>=4.4.36' + libzlib: '>=1.3.1,<2.0a0' + ncurses: '>=6.5,<7.0a0' + openssl: '>=3.4.0,<4.0a0' + readline: '>=8.2,<9.0a0' + tk: '>=8.6.13,<8.7.0a0' + tzdata: '' + url: https://conda.anaconda.org/conda-forge/linux-64/python-3.11.11-h9e4cc4f_1_cpython.conda + hash: + md5: 8387070aa413ce9a8cc35a509fae938b + sha256: b29ce0836fce55bdff8d5c5b71c4921a23f87d3b950aea89a9e75784120b06b0 + category: main + optional: false +- name: python + version: 3.11.11 + manager: conda + platform: osx-64 + dependencies: + __osx: '>=10.13' + bzip2: '>=1.0.8,<2.0a0' + libexpat: '>=2.6.4,<3.0a0' + libffi: '>=3.4,<4.0a0' + liblzma: '>=5.6.3,<6.0a0' + libsqlite: '>=3.47.0,<4.0a0' + libzlib: '>=1.3.1,<2.0a0' + ncurses: '>=6.5,<7.0a0' + openssl: '>=3.4.0,<4.0a0' + readline: '>=8.2,<9.0a0' + tk: '>=8.6.13,<8.7.0a0' + tzdata: '' + url: https://conda.anaconda.org/conda-forge/osx-64/python-3.11.11-h9ccd52b_1_cpython.conda + hash: + md5: 9b20fb7c571405d29f33ae2fc5990d8d + sha256: 4c53c4c48a0f42577ae405553ab899b3ef5ee23b2a1bf4fbbc694c46f884f6fc + category: main + optional: false +- name: python_abi + version: '3.11' + manager: conda + platform: linux-64 + dependencies: {} + url: https://conda.anaconda.org/conda-forge/linux-64/python_abi-3.11-5_cp311.conda + hash: + md5: 139a8d40c8a2f430df31048949e450de + sha256: 2660b8059b3ee854bc5d3c6b1fce946e5bd2fe8fbca7827de2c5885ead6209de + category: main + optional: false +- name: python_abi + version: '3.11' + manager: conda + platform: osx-64 + dependencies: {} + url: https://conda.anaconda.org/conda-forge/osx-64/python_abi-3.11-5_cp311.conda + hash: + md5: e6d62858c06df0be0e6255c753d74787 + sha256: 9b092850a268aca99600b724bae849f51209ecd5628e609b4699debc59ff1945 + category: main + optional: false +- name: pyyaml + version: 6.0.2 + manager: conda + platform: linux-64 + dependencies: + __glibc: '>=2.17,<3.0.a0' + libgcc: '>=13' + python: 
'>=3.11,<3.12.0a0' + python_abi: 3.11.* + yaml: '>=0.2.5,<0.3.0a0' + url: https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.2-py311h9ecbd09_1.conda + hash: + md5: abeb54d40f439b86f75ea57045ab8496 + sha256: e721e5ff389a7b2135917c04b27391be3d3382e261bb60a369b1620655365c3d + category: main + optional: false +- name: pyyaml + version: 6.0.2 + manager: conda + platform: osx-64 + dependencies: + __osx: '>=10.13' + python: '>=3.11,<3.12.0a0' + python_abi: 3.11.* + yaml: '>=0.2.5,<0.3.0a0' + url: https://conda.anaconda.org/conda-forge/osx-64/pyyaml-6.0.2-py311h3336109_1.conda + hash: + md5: b0132bec7165a53403dcc393ff761a9e + sha256: d8f4513c53a7c0be9f1cdb9d1af31ac85cf8a6f0e4194715e36e915c03104662 + category: main + optional: false +- name: readline + version: '8.2' + manager: conda + platform: linux-64 + dependencies: + libgcc-ng: '>=12' + ncurses: '>=6.3,<7.0a0' + url: https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8228510_1.conda + hash: + md5: 47d31b792659ce70f470b5c82fdfb7a4 + sha256: 5435cf39d039387fbdc977b0a762357ea909a7694d9528ab40f005e9208744d7 + category: main + optional: false +- name: readline + version: '8.2' + manager: conda + platform: osx-64 + dependencies: + ncurses: '>=6.3,<7.0a0' + url: https://conda.anaconda.org/conda-forge/osx-64/readline-8.2-h9e318b2_1.conda + hash: + md5: f17f77f2acf4d344734bda76829ce14e + sha256: 41e7d30a097d9b060037f0c6a2b1d4c4ae7e942c06c943d23f9d481548478568 + category: main + optional: false +- name: setuptools + version: 75.6.0 + manager: conda + platform: linux-64 + dependencies: + python: '>=3.9' + url: https://conda.anaconda.org/conda-forge/noarch/setuptools-75.6.0-pyhff2d567_1.conda + hash: + md5: fc80f7995e396cbaeabd23cf46c413dc + sha256: abb12e1dd515b13660aacb5d0fd43835bc2186cab472df25b7716cd65e095111 + category: main + optional: false +- name: setuptools + version: 75.6.0 + manager: conda + platform: osx-64 + dependencies: + python: '>=3.9' + url: 
https://conda.anaconda.org/conda-forge/noarch/setuptools-75.6.0-pyhff2d567_1.conda + hash: + md5: fc80f7995e396cbaeabd23cf46c413dc + sha256: abb12e1dd515b13660aacb5d0fd43835bc2186cab472df25b7716cd65e095111 + category: main + optional: false +- name: tk + version: 8.6.13 + manager: conda + platform: linux-64 + dependencies: + libgcc-ng: '>=12' + libzlib: '>=1.2.13,<2.0.0a0' + url: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h4845f30_101.conda + hash: + md5: d453b98d9c83e71da0741bb0ff4d76bc + sha256: e0569c9caa68bf476bead1bed3d79650bb080b532c64a4af7d8ca286c08dea4e + category: main + optional: false +- name: tk + version: 8.6.13 + manager: conda + platform: osx-64 + dependencies: + libzlib: '>=1.2.13,<2.0.0a0' + url: https://conda.anaconda.org/conda-forge/osx-64/tk-8.6.13-h1abcd95_1.conda + hash: + md5: bf830ba5afc507c6232d4ef0fb1a882d + sha256: 30412b2e9de4ff82d8c2a7e5d06a15f4f4fef1809a72138b6ccb53a33b26faf5 + category: main + optional: false +- name: toml + version: 0.10.2 + manager: conda + platform: linux-64 + dependencies: + python: '>=3.9' + url: https://conda.anaconda.org/conda-forge/noarch/toml-0.10.2-pyhd8ed1ab_1.conda + hash: + md5: b0dd904de08b7db706167240bf37b164 + sha256: 34f3a83384ac3ac30aefd1309e69498d8a4aa0bf2d1f21c645f79b180e378938 + category: main + optional: false +- name: toml + version: 0.10.2 + manager: conda + platform: osx-64 + dependencies: + python: '>=3.9' + url: https://conda.anaconda.org/conda-forge/noarch/toml-0.10.2-pyhd8ed1ab_1.conda + hash: + md5: b0dd904de08b7db706167240bf37b164 + sha256: 34f3a83384ac3ac30aefd1309e69498d8a4aa0bf2d1f21c645f79b180e378938 + category: main + optional: false +- name: tomli + version: 2.2.1 + manager: conda + platform: linux-64 + dependencies: + python: '>=3.9' + url: https://conda.anaconda.org/conda-forge/noarch/tomli-2.2.1-pyhd8ed1ab_1.conda + hash: + md5: ac944244f1fed2eb49bae07193ae8215 + sha256: 18636339a79656962723077df9a56c0ac7b8a864329eb8f847ee3d38495b863e + category: main + 
optional: false +- name: tomli + version: 2.2.1 + manager: conda + platform: osx-64 + dependencies: + python: '>=3.9' + url: https://conda.anaconda.org/conda-forge/noarch/tomli-2.2.1-pyhd8ed1ab_1.conda + hash: + md5: ac944244f1fed2eb49bae07193ae8215 + sha256: 18636339a79656962723077df9a56c0ac7b8a864329eb8f847ee3d38495b863e + category: main + optional: false +- name: tzdata + version: 2024b + manager: conda + platform: linux-64 + dependencies: {} + url: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024b-hc8b5060_0.conda + hash: + md5: 8ac3367aafb1cc0a068483c580af8015 + sha256: 4fde5c3008bf5d2db82f2b50204464314cc3c91c1d953652f7bd01d9e52aefdf + category: main + optional: false +- name: tzdata + version: 2024b + manager: conda + platform: osx-64 + dependencies: {} + url: https://conda.anaconda.org/conda-forge/noarch/tzdata-2024b-hc8b5060_0.conda + hash: + md5: 8ac3367aafb1cc0a068483c580af8015 + sha256: 4fde5c3008bf5d2db82f2b50204464314cc3c91c1d953652f7bd01d9e52aefdf + category: main + optional: false +- name: ukkonen + version: 1.0.1 + manager: conda + platform: linux-64 + dependencies: + __glibc: '>=2.17,<3.0.a0' + cffi: '' + libgcc: '>=13' + libstdcxx: '>=13' + python: '>=3.11,<3.12.0a0' + python_abi: 3.11.* + url: https://conda.anaconda.org/conda-forge/linux-64/ukkonen-1.0.1-py311hd18a35c_5.conda + hash: + md5: 4e8447ca8558a203ec0577b4730073f3 + sha256: 4542cc3093f480c7fa3e104bfd9e5b7daeff32622121be6847f9e839341b0790 + category: main + optional: false +- name: ukkonen + version: 1.0.1 + manager: conda + platform: osx-64 + dependencies: + __osx: '>=10.13' + cffi: '' + libcxx: '>=17' + python: '>=3.11,<3.12.0a0' + python_abi: 3.11.* + url: https://conda.anaconda.org/conda-forge/osx-64/ukkonen-1.0.1-py311hf2f7c97_5.conda + hash: + md5: 1b576e5588d90b82f96e3e21490b085d + sha256: d1aaec2edf78eeb79407d907679a78ecc0c97f7390046a45d561e22b348de553 + category: main + optional: false +- name: virtualenv + version: 20.28.1 + manager: conda + platform: linux-64 + 
dependencies: + distlib: '>=0.3.7,<1' + filelock: '>=3.12.2,<4' + platformdirs: '>=3.9.1,<5' + python: '>=3.9' + url: https://conda.anaconda.org/conda-forge/noarch/virtualenv-20.28.1-pyhd8ed1ab_0.conda + hash: + md5: 680b1c287b10cefc8bda0530b217229f + sha256: c8bde4547ddbd21ea89e483a7c65d8a5e442c0db494b0b977e389b75b9d03d62 + category: main + optional: false +- name: virtualenv + version: 20.28.1 + manager: conda + platform: osx-64 + dependencies: + python: '>=3.9' + distlib: '>=0.3.7,<1' + filelock: '>=3.12.2,<4' + platformdirs: '>=3.9.1,<5' + url: https://conda.anaconda.org/conda-forge/noarch/virtualenv-20.28.1-pyhd8ed1ab_0.conda + hash: + md5: 680b1c287b10cefc8bda0530b217229f + sha256: c8bde4547ddbd21ea89e483a7c65d8a5e442c0db494b0b977e389b75b9d03d62 + category: main + optional: false +- name: wheel + version: 0.45.1 + manager: conda + platform: linux-64 + dependencies: + python: '>=3.9' + url: https://conda.anaconda.org/conda-forge/noarch/wheel-0.45.1-pyhd8ed1ab_1.conda + hash: + md5: 75cb7132eb58d97896e173ef12ac9986 + sha256: 1b34021e815ff89a4d902d879c3bd2040bc1bd6169b32e9427497fa05c55f1ce + category: main + optional: false +- name: wheel + version: 0.45.1 + manager: conda + platform: osx-64 + dependencies: + python: '>=3.9' + url: https://conda.anaconda.org/conda-forge/noarch/wheel-0.45.1-pyhd8ed1ab_1.conda + hash: + md5: 75cb7132eb58d97896e173ef12ac9986 + sha256: 1b34021e815ff89a4d902d879c3bd2040bc1bd6169b32e9427497fa05c55f1ce + category: main + optional: false +- name: yaml + version: 0.2.5 + manager: conda + platform: linux-64 + dependencies: + libgcc-ng: '>=9.4.0' + url: https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h7f98852_2.tar.bz2 + hash: + md5: 4cb3ad778ec2d5a7acbdf254eb1c42ae + sha256: a4e34c710eeb26945bdbdaba82d3d74f60a78f54a874ec10d373811a5d217535 + category: main + optional: false +- name: yaml + version: 0.2.5 + manager: conda + platform: osx-64 + dependencies: {} + url: 
https://conda.anaconda.org/conda-forge/osx-64/yaml-0.2.5-h0d85af4_2.tar.bz2 + hash: + md5: d7e08fcf8259d742156188e8762b4d20 + sha256: 5301417e2c8dea45b401ffee8df3957d2447d4ce80c83c5ff151fc6bfe1c4148 + category: main + optional: false diff --git a/data/inhouse/README.md b/data/inhouse/README.md new file mode 100644 index 0000000..224d421 --- /dev/null +++ b/data/inhouse/README.md @@ -0,0 +1,3 @@ +# Inhouse NF-kB pathway combinatorial screening dataset +Inhouse dataset specifically targeting NF-kB pathway genes\ +Data download available at s3://pert-spectra/data/inhouse.h5ad diff --git a/data/norman/README.md b/data/norman/README.md new file mode 100644 index 0000000..c68f145 --- /dev/null +++ b/data/norman/README.md @@ -0,0 +1,10 @@ +# Exploring genetic interaction manifolds constructed from rich single-cell phenotypes +Paper: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6746554/.\ +``` +Norman TM, Horlbeck MA, Replogle JM, Ge AY, Xu A, Jost M, Gilbert LA, Weissman JS. +Exploring genetic interaction manifolds constructed from rich single-cell phenotypes. +Science. 2019 Aug 23;365(6455):786-793. doi: 10.1126/science.aax4438. Epub 2019 Aug 8. +PMID: 31395745; PMCID: PMC6746554. +``` +We use the preprocessed data used by SAMS-VAE ([Bereket & Karaletsos](https://github.com/insitro/sams-vae)), originally from Theis Lab (https://github.com/theislab/sc-pert). \ +URL for data download: https://ndownloader.figshare.com/files/34027562. diff --git a/data/replogle/README.md b/data/replogle/README.md new file mode 100644 index 0000000..b98e065 --- /dev/null +++ b/data/replogle/README.md @@ -0,0 +1,12 @@ +# Mapping information-rich genotype-phenotype landscapes with genome-scale Perturb-seq +Paper: https://www.cell.com/cell/fulltext/S0092-8674(22)00597-9.\ +``` +J. M. Replogle, R. A. Saunders, A. N. Pogson, J. A. Hussmann, A. Lenail, A. Guna, L. Mascibroda, +E. J. Wagner, K. Adelman, G. Lithwick-Yanai, N. Iremadze, F. Oberstrass, D. Lipson, +J. L. Bonnar, M. Jost, T. M. 
Norman, and J. S. Weissman. Mapping +information-rich genotype-phenotype landscapes with genome-scale Perturb-seq. Cell, +185(14):2559–2575.e28, 2022. ISSN 0092-8674. doi: https://doi.org/10.1016/j.cell.2022.05.013. URL +https://www.sciencedirect.com/science/article/pii/S0092867422005979. +``` +Raw data `K562_essential_raw_singlecell_01.h5ad` can be downloaded from [FigShare](https://plus.figshare.com/articles/dataset/_Mapping_information-rich_genotype-phenotype_landscapes_with_genome-scale_Perturb-seq_Replogle_et_al_2022_processed_Perturb-seq_datasets/20029387).\ +We filter down to strong perturbations using sVAE's ([Lopez et al, 2022](https://arxiv.org/abs/2211.03553)) preprocessed anndata, which can be generated using their preprocessing [notebook](https://github.com/Genentech/sVAE/blob/main/entry_points/replogle-preprocessing.ipynb). diff --git a/data_preprocessing/inhouse_prior_graph_preprocessing.ipynb b/data_preprocessing/inhouse_prior_graph_preprocessing.ipynb new file mode 100644 index 0000000..36086bd --- /dev/null +++ b/data_preprocessing/inhouse_prior_graph_preprocessing.ipynb @@ -0,0 +1,638 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "3cc45e64", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2d35f5c7", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "import pandas as pd\n", + "import scanpy as sc\n", + "\n", + "sys.path.append(\"..\")\n", + "from utils import inhouse_preprocess, read_aws_csv, read_aws_h5ad" + ] + }, + { + "cell_type": "markdown", + "id": "7fc8f323-ab37-4a50-95c7-91f6d1876eeb", + "metadata": {}, + "source": [ + "## Data Preprocessing\n", + "- Ensure that the data is downloaded, or fetch it directly from S3 (see `../data` for instructions)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "b5f89eea", + "metadata": {}, + "outputs": [], + 
"source": [ + "data_path = \"s3://pert-spectra/data/inhouse.h5ad\"\n", + "adata = read_aws_h5ad(data_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0ab6141-ecf4-4f76-94ed-9ec1aeabf2ae", + "metadata": {}, + "outputs": [], + "source": [ + "adata = inhouse_preprocess(adata)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "b45a7b29", + "metadata": {}, + "outputs": [], + "source": [ + "# subset to hvg\n", + "sc.pp.highly_variable_genes(adata, n_top_genes=5000, subset=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "8813d26b", + "metadata": {}, + "outputs": [], + "source": [ + "# ensure all perts are in adata\n", + "idx = []\n", + "pert_list = adata.obs[\"condition\"].unique()\n", + "for i in range(len(adata.var_names)):\n", + " if (adata.var[\"highly_variable\"][i]) or (adata.var_names[i] in pert_list):\n", + " idx.append(i)\n", + "adata = adata[:, idx]" + ] + }, + { + "cell_type": "markdown", + "id": "6cb93634-955e-4de4-9364-29fd1efee349", + "metadata": {}, + "source": [ + "## Generating StringDB prior graph" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "cade1ddc", + "metadata": {}, + "outputs": [], + "source": [ + "# stringdb prior\n", + "stringdb_hq = read_aws_csv(\"s3://pert-spectra/references/StringDB.HQ.txt\", sep=\"\\t\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "837bd514", + "metadata": {}, + "outputs": [], + "source": [ + "# ensg mapping\n", + "gene_name_df = read_aws_csv(\n", + " \"s3://pert-spectra/references/Homo_sapiens.gene_info\", sep=\"\\t\"\n", + ")\n", + "gene_to_ensg = {}\n", + "gene_name_mapping = {}\n", + "for symbol, synonyms, ref in zip(\n", + " gene_name_df[\"Symbol\"], gene_name_df[\"Synonyms\"], gene_name_df[\"dbXrefs\"]\n", + "):\n", + " syn_set = set(synonyms.split(\"|\")).union(set([symbol]))\n", + " refs = ref.split(\"|\")\n", + " ensg = None\n", + " for r in refs:\n", + " label = r.split(\":\")\n", + " if 
label[0] == \"Ensembl\":\n", + " ensg = label[1]\n", + " break\n", + " if ensg is None:\n", + " continue\n", + " assert ensg[:4] == \"ENSG\"\n", + " gene_to_ensg[symbol] = ensg\n", + " gene_name_mapping[symbol] = syn_set\n", + " # make sure name mapping goes both ways\n", + " for syn in syn_set:\n", + " if syn in gene_name_mapping:\n", + " gene_name_mapping[syn].add(symbol)\n", + " else:\n", + " gene_name_mapping[syn] = set([symbol])" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "4c10530a", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_11863/2928678895.py:1: ImplicitModificationWarning: Trying to modify attribute `.var` of view, initializing view as actual.\n", + " adata.var['gene_symbols'] = adata.var_names\n" + ] + } + ], + "source": [ + "adata.var[\"gene_symbols\"] = adata.var_names\n", + "adata.var_names = [i.split(\".\")[0] for i in adata.var[\"gene_id\"]]\n", + "inhouse_ensg_to_gene = {}\n", + "for symbol, ensgid in zip(adata.var[\"gene_symbols\"], adata.var_names):\n", + " inhouse_ensg_to_gene[ensgid] = symbol" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "5a884a01", + "metadata": {}, + "outputs": [], + "source": [ + "dataset_measured_genes = set(adata.var[\"gene_symbols\"])\n", + "dataset_measured_ensg = set(adata.var_names)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "73bd2fda", + "metadata": {}, + "outputs": [], + "source": [ + "# first, check if gene name has ensembl id\n", + "# if it does, use that to match\n", + "# else, use gene name mapping\n", + "edges = []\n", + "for index, row in stringdb_hq.iterrows():\n", + " gene1 = row[\"i_genes\"]\n", + " gene2 = row[\"j_genes\"]\n", + " if gene1 not in gene_to_ensg:\n", + " # neither gene has ensg\n", + " if gene2 not in gene_to_ensg:\n", + " gene1_syn = set([gene1])\n", + " gene2_syn = set([gene2])\n", + " if gene1 in gene_name_mapping:\n", + " gene1_syn = 
gene_name_mapping[gene1].union(set([gene1]))\n", + " if gene2 in gene_name_mapping:\n", + " gene2_syn = gene_name_mapping[gene2].union(set([gene2]))\n", + "\n", + " alias_intersect_1 = gene1_syn.intersection(dataset_measured_genes)\n", + " alias_intersect_2 = gene2_syn.intersection(dataset_measured_genes)\n", + "\n", + " if (len(alias_intersect_1) > 0) and (len(alias_intersect_2) > 0):\n", + " gene1 = (\n", + " gene1\n", + " if gene1 in dataset_measured_genes\n", + " else list(alias_intersect_1)[0]\n", + " )\n", + " gene2 = (\n", + " gene2\n", + " if gene2 in dataset_measured_genes\n", + " else list(alias_intersect_2)[0]\n", + " )\n", + " edges.append([gene1, gene2, row[\"x\"]])\n", + "\n", + " # gene1 does not have ensemblid, gene2 does have ensemblid\n", + " else:\n", + " ensg2 = gene_to_ensg[gene2]\n", + " gene1_syn = set([gene1])\n", + " gene2_syn = set([ensg2])\n", + " if gene1 in gene_name_mapping:\n", + " gene1_syn = gene_name_mapping[gene1].union(set([gene1]))\n", + "\n", + " alias_intersect_1 = gene1_syn.intersection(dataset_measured_genes)\n", + " alias_intersect_2 = gene2_syn.intersection(dataset_measured_ensg)\n", + "\n", + " if (len(alias_intersect_1) > 0) and (len(alias_intersect_2) > 0):\n", + " gene1 = (\n", + " gene1\n", + " if gene1 in dataset_measured_genes\n", + " else list(alias_intersect_1)[0]\n", + " )\n", + " gene2 = inhouse_ensg_to_gene[list(alias_intersect_2)[0]]\n", + " edges.append([gene1, gene2, row[\"x\"]])\n", + "\n", + " else:\n", + " # gene1 has ensemblid, gene2 does not have ensemblid\n", + " if gene2 not in gene_to_ensg:\n", + " ensg1 = gene_to_ensg[gene1]\n", + " gene1_syn = set([ensg1])\n", + " gene2_syn = set([gene2])\n", + " if gene2 in gene_name_mapping:\n", + " gene2_syn = gene_name_mapping[gene2].union(set([gene2]))\n", + "\n", + " alias_intersect_1 = gene1_syn.intersection(dataset_measured_ensg)\n", + " alias_intersect_2 = gene2_syn.intersection(dataset_measured_genes)\n", + "\n", + " if (len(alias_intersect_1) > 0) and 
(len(alias_intersect_2) > 0):\n", + " gene1 = inhouse_ensg_to_gene[list(alias_intersect_1)[0]]\n", + " gene2 = (\n", + " gene2\n", + " if gene2 in dataset_measured_genes\n", + " else list(alias_intersect_2)[0]\n", + " )\n", + " edges.append([gene1, gene2, row[\"x\"]])\n", + "\n", + " # both genes have ensmblid\n", + " else:\n", + " ensg1 = gene_to_ensg[gene1]\n", + " ensg2 = gene_to_ensg[gene2]\n", + " gene1_syn = set([ensg1])\n", + " gene2_syn = set([ensg2])\n", + "\n", + " alias_intersect_1 = gene1_syn.intersection(dataset_measured_ensg)\n", + " alias_intersect_2 = gene2_syn.intersection(dataset_measured_ensg)\n", + "\n", + " if (len(alias_intersect_1) > 0) and (len(alias_intersect_2) > 0):\n", + " gene1 = inhouse_ensg_to_gene[list(alias_intersect_1)[0]]\n", + " gene2 = inhouse_ensg_to_gene[list(alias_intersect_2)[0]]\n", + " edges.append([gene1, gene2, row[\"x\"]])" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "9e2e5c9c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
gene1gene2score
0GPC1HS3ST10.949
1SLCO2B1HS3ST10.172
2SDC4HS3ST10.930
3HSPG2HS3ST10.941
4GPC6HS3ST10.949
............
210287KRT8MUC5AC0.575
210288GANMUC5AC0.270
210289MYCMUC5AC0.266
210290GCNT7MUC5AC0.905
210291MUC13MUC5AC0.974
\n", + "

210292 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " gene1 gene2 score\n", + "0 GPC1 HS3ST1 0.949\n", + "1 SLCO2B1 HS3ST1 0.172\n", + "2 SDC4 HS3ST1 0.930\n", + "3 HSPG2 HS3ST1 0.941\n", + "4 GPC6 HS3ST1 0.949\n", + "... ... ... ...\n", + "210287 KRT8 MUC5AC 0.575\n", + "210288 GAN MUC5AC 0.270\n", + "210289 MYC MUC5AC 0.266\n", + "210290 GCNT7 MUC5AC 0.905\n", + "210291 MUC13 MUC5AC 0.974\n", + "\n", + "[210292 rows x 3 columns]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inhouse_network = pd.DataFrame(edges, columns=[\"gene1\", \"gene2\", \"score\"])\n", + "inhouse_network" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "b6bd6d19", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
gene1gene2score
0ENSG00000228541ENSG000002285410.000
1STARD13-ASSTARD13-AS0.000
2CLDND2CLDND20.000
3ENSG00000230333ENSG000002303330.000
4ENSG00000246090ENSG000002460900.000
............
211669KRT8MUC5AC0.575
211670GANMUC5AC0.270
211671MYCMUC5AC0.266
211672GCNT7MUC5AC0.905
211673MUC13MUC5AC0.974
\n", + "

211674 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " gene1 gene2 score\n", + "0 ENSG00000228541 ENSG00000228541 0.000\n", + "1 STARD13-AS STARD13-AS 0.000\n", + "2 CLDND2 CLDND2 0.000\n", + "3 ENSG00000230333 ENSG00000230333 0.000\n", + "4 ENSG00000246090 ENSG00000246090 0.000\n", + "... ... ... ...\n", + "211669 KRT8 MUC5AC 0.575\n", + "211670 GAN MUC5AC 0.270\n", + "211671 MYC MUC5AC 0.266\n", + "211672 GCNT7 MUC5AC 0.905\n", + "211673 MUC13 MUC5AC 0.974\n", + "\n", + "[211674 rows x 3 columns]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# add genes measured in inhouse with no connections found in StringDB\n", + "missing_genes = dataset_measured_genes.difference(\n", + " set(inhouse_network[\"gene1\"].unique())\n", + ")\n", + "for g in missing_genes:\n", + " inhouse_network = pd.concat(\n", + " [pd.DataFrame([[g, g, 0]], columns=inhouse_network.columns), inhouse_network],\n", + " ignore_index=True,\n", + " )\n", + "inhouse_network" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "05005170", + "metadata": {}, + "outputs": [], + "source": [ + "# create one-hot-encoding mapping based on adata var structure\n", + "adata.var_names = adata.var[\"gene_symbols\"]\n", + "\n", + "\n", + "def map_gene_to_onehot(name: str):\n", + " return adata.var_names.get_loc(name)\n", + "\n", + "\n", + "inhouse_network[\"gene1\"] = inhouse_network[\"gene1\"].apply(map_gene_to_onehot)\n", + "inhouse_network[\"gene2\"] = inhouse_network[\"gene2\"].apply(map_gene_to_onehot)\n", + "inhouse_network = inhouse_network.sort_values(by=[\"gene1\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "b884a780", + "metadata": {}, + "outputs": [], + "source": [ + "# create weighted adj matrix\n", + "import networkx\n", + "\n", + "edgeList = inhouse_network.values.tolist()\n", + "G = networkx.DiGraph()\n", + "for i in range(len(edgeList)):\n", + " G.add_edge(edgeList[i][0], edgeList[i][1], weight=edgeList[i][2])\n", + 
"A = networkx.adjacency_matrix(\n", + " G, nodelist=[i for i in range(len(adata.var_names))]\n", + ").toarray()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "cbd774ea", + "metadata": {}, + "outputs": [], + "source": [ + "# turn to sparse\n", + "from scipy import sparse\n", + "\n", + "sA = sparse.csr_matrix(A)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3f262408", + "metadata": {}, + "outputs": [], + "source": [ + "# write to adata\n", + "adata.uns[\"sparse_gene_network\"] = sA\n", + "adata.write_h5ad(\"../inhouse_adata_spectra.h5ad\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a40a09e3", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pertspectra", + "language": "python", + "name": "pertspectra" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/data_preprocessing/norman_filtering.ipynb b/data_preprocessing/norman_filtering.ipynb new file mode 100644 index 0000000..cc8fac3 --- /dev/null +++ b/data_preprocessing/norman_filtering.ipynb @@ -0,0 +1,316 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "fe4ff200-83d2-43db-9409-416aea41f2ff", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "ea22a349", + "metadata": {}, + "outputs": [], + "source": [ + "import anndata as an\n", + "import matplotlib.pyplot as plt\n", + "import scanpy as sc\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "markdown", + "id": "f1b7d7e3-dfc9-45f4-b313-a19db12bcc7d", + "metadata": {}, + "source": [ + "## Data Filtering\n", + "- 
Ensure that the data is downloaded (see `../data` for instructions)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "4f643f95", + "metadata": {}, + "outputs": [], + "source": [ + "data_path = \"norman.h5ad\"\n", + "adata = an.read_h5ad(data_path)\n", + "sc.pp.calculate_qc_metrics(adata, percent_top=None, log1p=True, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "45a2d041", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Min read count: 1\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# read_count\n", + "read_count_filter = 100\n", + "print(f\"Min read count: {min(adata.obs['read_count'])}\")\n", + "sns.histplot(adata.obs[\"read_count\"])\n", + "plt.axvline(read_count_filter, 0, 6000, color=\"red\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "591bf5b5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Min UMI count: 1\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# UMI_count\n", + "umi_count_filter = 10\n", + "print(f\"Min UMI count: {min(adata.obs['UMI_count'])}\")\n", + "sns.histplot(adata.obs[\"UMI_count\"])\n", + "plt.axvline(umi_count_filter, 0, 6000, color=\"red\")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "bfdaa7ee", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Min total counts: 1633.098876953125\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# total_counts\n", + "total_count_filter = 2000\n", + "print(f\"Min total counts: {min(adata.obs['total_counts'])}\")\n", + "sns.histplot(adata.obs[\"total_counts\"])\n", + "plt.axvline(total_count_filter, 0, 6000, color=\"red\")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "bed06790", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Min n_genes_by_counts: 974\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# n_genes_by_counts\n", + "print(f\"Min n_genes_by_counts: {min(adata.obs['n_genes_by_counts'])}\")\n", + "sns.histplot(adata.obs[\"n_genes_by_counts\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "8fb84285", + "metadata": {}, + "outputs": [], + "source": [ + "# UMAP projections\n", + "sc.pp.neighbors(adata)\n", + "sc.tl.umap(adata)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "6a127931", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sc.pl.umap(\n", + " adata, color=[\"read_count\", \"UMI_count\", \"total_counts\", \"n_genes_by_counts\"]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "1cbf9e8f", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sc.pl.umap(adata, color=\"leiden\")" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "14b6e48c", + "metadata": {}, + "outputs": [], + "source": [ + "# filter the anndata and save\n", + "filtered_adata = adata[\n", + " (adata.obs[\"read_count\"] >= read_count_filter)\n", + " & (adata.obs[\"UMI_count\"] >= umi_count_filter)\n", + " & (adata.obs[\"total_counts\"] >= total_count_filter)\n", + "]\n", + "filtered_adata.write(\"norman_filtered.h5ad\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f4feaa85", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pertspectra", + "language": "python", + "name": "pertspectra" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/data_preprocessing/norman_prior_graph_preprocessing.ipynb b/data_preprocessing/norman_prior_graph_preprocessing.ipynb new file mode 100644 index 0000000..0fe08a5 --- /dev/null +++ b/data_preprocessing/norman_prior_graph_preprocessing.ipynb @@ -0,0 +1,1034 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "3cc45e64", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2d35f5c7", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "import pandas as pd\n", + "import scanpy as sc\n", + "\n", + "sys.path.append(\"..\")\n", + "from utils import read_aws_csv, read_aws_h5ad" + ] + }, + { + "cell_type": "markdown", + "id": "f0a90a7f-b171-48e1-83c9-ab73ec9af158", + "metadata": {}, + "source": [ + "## 
Data Preprocessing\n", + "- Ensure that the data is downloaded (see `../data` for instructions)\n", + "- Ensure that the data was filtered using `norman_filtering.ipynb`" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "b5f89eea", + "metadata": {}, + "outputs": [], + "source": [ + "data_path = \"path to filtered norman.h5ad\"\n", + "adata = read_aws_h5ad(data_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "cb93a4f5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "105" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# get perturbation genes\n", + "pert_list = set()\n", + "for t in adata.obs[\"perturbation_name\"]:\n", + " if \"+\" not in t and t != \"control\":\n", + " pert_list.add(t)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "c0cc203f", + "metadata": {}, + "outputs": [], + "source": [ + "# subset to hvg\n", + "sc.pp.highly_variable_genes(adata, n_top_genes=5000, subset=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "37d7ff87", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "View of AnnData object with n_obs × n_vars = 101578 × 5035\n", + " obs: 'guide_identity', 'read_count', 'UMI_count', 'coverage', 'gemgroup', 'good_coverage', 'number_of_cells', 'guide_AHR', 'guide_ARID1A', 'guide_ARRDC3', 'guide_ATL1', 'guide_BAK1', 'guide_BCL2L11', 'guide_BCORL1', 'guide_BPGM', 'guide_C19orf26', 'guide_C3orf72', 'guide_CBFA2T3', 'guide_CBL', 'guide_CDKN1A', 'guide_CDKN1B', 'guide_CDKN1C', 'guide_CEBPA', 'guide_CEBPB', 'guide_CEBPE', 'guide_CELF2', 'guide_CITED1', 'guide_CKS1B', 'guide_CLDN6', 'guide_CNN1', 'guide_CNNM4', 'guide_COL1A1', 'guide_COL2A1', 'guide_CSRNP1', 'guide_DLX2', 'guide_DUSP9', 'guide_EGR1', 'guide_ELMSAN1', 'guide_ETS2', 'guide_FEV', 'guide_FOSB', 'guide_FOXA1', 'guide_FOXA3', 'guide_FOXF1', 'guide_FOXL2', 'guide_FOXO4', 'guide_GLB1L2', 
'guide_HES7', 'guide_HK2', 'guide_HNF4A', 'guide_HOXA13', 'guide_HOXB9', 'guide_HOXC13', 'guide_IER5L', 'guide_IGDCC3', 'guide_IKZF3', 'guide_IRF1', 'guide_ISL2', 'guide_JUN', 'guide_KIAA1804', 'guide_KIF18B', 'guide_KIF2C', 'guide_KLF1', 'guide_KMT2A', 'guide_LHX1', 'guide_LYL1', 'guide_MAML2', 'guide_MAP2K3', 'guide_MAP2K6', 'guide_MAP4K3', 'guide_MAP4K5', 'guide_MAP7D1', 'guide_MAPK1', 'guide_MEIS1', 'guide_MIDN', 'guide_NCL', 'guide_NIT1', 'guide_OSR2', 'guide_PLK4', 'guide_POU3F2', 'guide_PRDM1', 'guide_PRTG', 'guide_PTPN1', 'guide_PTPN12', 'guide_PTPN13', 'guide_PTPN9', 'guide_RHOXF2', 'guide_RREB1', 'guide_RUNX1T1', 'guide_S1PR2', 'guide_SAMD1', 'guide_SET', 'guide_SGK1', 'guide_SLC38A2', 'guide_SLC4A1', 'guide_SLC6A9', 'guide_SNAI1', 'guide_SPI1', 'guide_STIL', 'guide_TBX2', 'guide_TBX3', 'guide_TGFBR2', 'guide_TMSB4X', 'guide_TP73', 'guide_TSC22D1', 'guide_UBASH3A', 'guide_UBASH3B', 'guide_ZBTB1', 'guide_ZBTB10', 'guide_ZBTB25', 'guide_ZC3HAV1', 'guide_ZNF318', 'guide_ids', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'perturbation_name', 'perturbation_type', 'perturbation_value', 'perturbation_unit', 'log1p_n_genes_by_counts', 'log1p_total_counts'\n", + " var: 'index', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'log1p_mean_counts', 'log1p_total_counts'\n", + " uns: 'doi', 'guide_ids_colors', 'hvg', 'leiden', 'leiden_colors', 'neighbors', 'pca', 'perturbation_type_colors', 'preprocessing_nb_link', 'umap'\n", + " obsm: 'X_pca', 'X_umap'\n", + " varm: 'PCs'\n", + " layers: 'counts'\n", + " obsp: 'connectivities', 'distances'" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# ensure all perts are in adata\n", + "idx = []\n", + "for i in range(len(adata.var_names)):\n", + " if (adata.var[\"highly_variable\"][i]) or (adata.var_names[i] in 
pert_list):\n", + " idx.append(i)\n", + "adata_subset = adata[:, idx]\n", + "adata_subset" + ] + }, + { + "cell_type": "markdown", + "id": "99cd823e-a774-4e07-a6b1-d1586e5411e3", + "metadata": {}, + "source": [ + "## StringDB prior graph" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "cade1ddc", + "metadata": {}, + "outputs": [], + "source": [ + "# stringdb prior\n", + "stringdb_hq = read_aws_csv(\"s3://pert-spectra/references/StringDB.HQ.txt\", sep=\"\\t\")" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "9410883b", + "metadata": {}, + "outputs": [], + "source": [ + "# ensg mapping\n", + "gene_name_df = read_aws_csv(\n", + " \"s3://pert-spectra/references/Homo_sapiens.gene_info\", sep=\"\\t\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "977803fa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
#tax_idGeneIDSymbolLocusTagSynonymsdbXrefschromosomemap_locationdescriptiontype_of_geneSymbol_from_nomenclature_authorityFull_name_from_nomenclature_authorityNomenclature_statusOther_designationsModification_dateFeature_type
096061A1BG-A1B|ABG|GAB|HYST2477MIM:138670|HGNC:HGNC:5|Ensembl:ENSG00000121410...1919q13.43alpha-1-B glycoproteinprotein-codingA1BGalpha-1-B glycoproteinOalpha-1B-glycoprotein|HEL-S-163pA|epididymis s...20240617-
196062A2M-A2MD|CPAMD5|FWP007|S863-7MIM:103950|HGNC:HGNC:7|Ensembl:ENSG00000175899...1212p13.31alpha-2-macroglobulinprotein-codingA2Malpha-2-macroglobulinOalpha-2-macroglobulin|C3 and PZP-like alpha-2-...20240617-
296063A2MP1-A2MPHGNC:HGNC:8|Ensembl:ENSG00000291190|AllianceGe...1212p13.31alpha-2-macroglobulin pseudogene 1pseudoA2MP1alpha-2-macroglobulin pseudogene 1Opregnancy-zone protein pseudogene20240617-
396069NAT1-AAC1|MNAT|NAT-1|NATIMIM:108345|HGNC:HGNC:7645|Ensembl:ENSG00000171...88p22N-acetyltransferase 1protein-codingNAT1N-acetyltransferase 1Oarylamine N-acetyltransferase 1|N-acetyltransf...20240617-
4960610NAT2-AAC2|NAT-2|PNATMIM:612182|HGNC:HGNC:7646|Ensembl:ENSG00000156...88p22N-acetyltransferase 2protein-codingNAT2N-acetyltransferase 2Oarylamine N-acetyltransferase 2|N-acetyltransf...20240617-
...................................................
1934517411588923215trnD---MT-tRNA-AsptRNA----20200909-
1934527411588923216trnP---MT-tRNA-ProtRNA----20200909-
1934537411588923217trnA---MT-tRNA-AlatRNA----20200909-
1934547411588923218COX1---MT-cytochrome c oxidase subunit Iprotein-coding---cytochrome c oxidase subunit I20230818-
193455741158892321916S rRNA---MT-l-rRNArRNA----20200909-
\n", + "

193456 rows × 16 columns

\n", + "
" + ], + "text/plain": [ + " #tax_id GeneID Symbol LocusTag Synonyms \\\n", + "0 9606 1 A1BG - A1B|ABG|GAB|HYST2477 \n", + "1 9606 2 A2M - A2MD|CPAMD5|FWP007|S863-7 \n", + "2 9606 3 A2MP1 - A2MP \n", + "3 9606 9 NAT1 - AAC1|MNAT|NAT-1|NATI \n", + "4 9606 10 NAT2 - AAC2|NAT-2|PNAT \n", + "... ... ... ... ... ... \n", + "193451 741158 8923215 trnD - - \n", + "193452 741158 8923216 trnP - - \n", + "193453 741158 8923217 trnA - - \n", + "193454 741158 8923218 COX1 - - \n", + "193455 741158 8923219 16S rRNA - - \n", + "\n", + " dbXrefs chromosome \\\n", + "0 MIM:138670|HGNC:HGNC:5|Ensembl:ENSG00000121410... 19 \n", + "1 MIM:103950|HGNC:HGNC:7|Ensembl:ENSG00000175899... 12 \n", + "2 HGNC:HGNC:8|Ensembl:ENSG00000291190|AllianceGe... 12 \n", + "3 MIM:108345|HGNC:HGNC:7645|Ensembl:ENSG00000171... 8 \n", + "4 MIM:612182|HGNC:HGNC:7646|Ensembl:ENSG00000156... 8 \n", + "... ... ... \n", + "193451 - MT \n", + "193452 - MT \n", + "193453 - MT \n", + "193454 - MT \n", + "193455 - MT \n", + "\n", + " map_location description type_of_gene \\\n", + "0 19q13.43 alpha-1-B glycoprotein protein-coding \n", + "1 12p13.31 alpha-2-macroglobulin protein-coding \n", + "2 12p13.31 alpha-2-macroglobulin pseudogene 1 pseudo \n", + "3 8p22 N-acetyltransferase 1 protein-coding \n", + "4 8p22 N-acetyltransferase 2 protein-coding \n", + "... ... ... ... \n", + "193451 - tRNA-Asp tRNA \n", + "193452 - tRNA-Pro tRNA \n", + "193453 - tRNA-Ala tRNA \n", + "193454 - cytochrome c oxidase subunit I protein-coding \n", + "193455 - l-rRNA rRNA \n", + "\n", + " Symbol_from_nomenclature_authority \\\n", + "0 A1BG \n", + "1 A2M \n", + "2 A2MP1 \n", + "3 NAT1 \n", + "4 NAT2 \n", + "... ... 
\n", + "193451 - \n", + "193452 - \n", + "193453 - \n", + "193454 - \n", + "193455 - \n", + "\n", + " Full_name_from_nomenclature_authority Nomenclature_status \\\n", + "0 alpha-1-B glycoprotein O \n", + "1 alpha-2-macroglobulin O \n", + "2 alpha-2-macroglobulin pseudogene 1 O \n", + "3 N-acetyltransferase 1 O \n", + "4 N-acetyltransferase 2 O \n", + "... ... ... \n", + "193451 - - \n", + "193452 - - \n", + "193453 - - \n", + "193454 - - \n", + "193455 - - \n", + "\n", + " Other_designations Modification_date \\\n", + "0 alpha-1B-glycoprotein|HEL-S-163pA|epididymis s... 20240617 \n", + "1 alpha-2-macroglobulin|C3 and PZP-like alpha-2-... 20240617 \n", + "2 pregnancy-zone protein pseudogene 20240617 \n", + "3 arylamine N-acetyltransferase 1|N-acetyltransf... 20240617 \n", + "4 arylamine N-acetyltransferase 2|N-acetyltransf... 20240617 \n", + "... ... ... \n", + "193451 - 20200909 \n", + "193452 - 20200909 \n", + "193453 - 20200909 \n", + "193454 cytochrome c oxidase subunit I 20230818 \n", + "193455 - 20200909 \n", + "\n", + " Feature_type \n", + "0 - \n", + "1 - \n", + "2 - \n", + "3 - \n", + "4 - \n", + "... ... 
\n", + "193451 - \n", + "193452 - \n", + "193453 - \n", + "193454 - \n", + "193455 - \n", + "\n", + "[193456 rows x 16 columns]" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gene_name_df" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "837bd514", + "metadata": {}, + "outputs": [], + "source": [ + "gene_to_ensg = {}\n", + "gene_name_mapping = {}\n", + "for symbol, synonyms, ref in zip(\n", + " gene_name_df[\"Symbol\"], gene_name_df[\"Synonyms\"], gene_name_df[\"dbXrefs\"]\n", + "):\n", + " syn_set = set(synonyms.split(\"|\")).union(set([symbol]))\n", + " refs = ref.split(\"|\")\n", + " ensg = None\n", + " for r in refs:\n", + " label = r.split(\":\")\n", + " if label[0] == \"Ensembl\":\n", + " ensg = label[1]\n", + " break\n", + " if ensg is None:\n", + " continue\n", + " assert ensg[:4] == \"ENSG\"\n", + " gene_to_ensg[symbol] = ensg\n", + " gene_name_mapping[symbol] = syn_set\n", + " # make sure name mapping goes both ways\n", + " for syn in syn_set:\n", + " if syn in gene_name_mapping:\n", + " gene_name_mapping[syn].add(symbol)\n", + " else:\n", + " gene_name_mapping[syn] = set([symbol])" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "4c10530a", + "metadata": {}, + "outputs": [], + "source": [ + "adata_subset.var[\"gene_symbols\"] = adata_subset.var_names.copy()\n", + "adata_subset.var_names = adata_subset.var[\"index\"].copy()\n", + "norman_ensg_to_gene = {}\n", + "for symbol, ensgid in zip(adata_subset.var[\"gene_symbols\"], adata_subset.var_names):\n", + " norman_ensg_to_gene[ensgid] = symbol" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "5a884a01", + "metadata": {}, + "outputs": [], + "source": [ + "dataset_measured_genes = set(adata_subset.var[\"gene_symbols\"])\n", + "dataset_measured_ensg = set(adata_subset.var_names)" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "73bd2fda", + "metadata": {}, + 
"outputs": [], + "source": [ + "# first, check if gene name has ensembl id\n", + "# if it does, use that to match\n", + "# else, use gene name mapping\n", + "edges = []\n", + "for index, row in stringdb_hq.iterrows():\n", + " gene1 = row[\"i_genes\"]\n", + " gene2 = row[\"j_genes\"]\n", + " if gene1 not in gene_to_ensg:\n", + " # neither gene has ensg\n", + " if gene2 not in gene_to_ensg:\n", + " gene1_syn = set([gene1])\n", + " gene2_syn = set([gene2])\n", + " if gene1 in gene_name_mapping:\n", + " gene1_syn = gene_name_mapping[gene1].union(set([gene1]))\n", + " if gene2 in gene_name_mapping:\n", + " gene2_syn = gene_name_mapping[gene2].union(set([gene2]))\n", + "\n", + " alias_intersect_1 = gene1_syn.intersection(dataset_measured_genes)\n", + " alias_intersect_2 = gene2_syn.intersection(dataset_measured_genes)\n", + "\n", + " if (len(alias_intersect_1) > 0) and (len(alias_intersect_2) > 0):\n", + " gene1 = (\n", + " gene1\n", + " if gene1 in dataset_measured_genes\n", + " else list(alias_intersect_1)[0]\n", + " )\n", + " gene2 = (\n", + " gene2\n", + " if gene2 in dataset_measured_genes\n", + " else list(alias_intersect_2)[0]\n", + " )\n", + " edges.append([gene1, gene2, row[\"x\"]])\n", + "\n", + " # gene1 does not have ensemblid, gene2 does have ensemblid\n", + " else:\n", + " ensg2 = gene_to_ensg[gene2]\n", + " gene1_syn = set([gene1])\n", + " gene2_syn = set([ensg2])\n", + " if gene1 in gene_name_mapping:\n", + " gene1_syn = gene_name_mapping[gene1].union(set([gene1]))\n", + "\n", + " alias_intersect_1 = gene1_syn.intersection(dataset_measured_genes)\n", + " alias_intersect_2 = gene2_syn.intersection(dataset_measured_ensg)\n", + "\n", + " if (len(alias_intersect_1) > 0) and (len(alias_intersect_2) > 0):\n", + " gene1 = (\n", + " gene1\n", + " if gene1 in dataset_measured_genes\n", + " else list(alias_intersect_1)[0]\n", + " )\n", + " gene2 = norman_ensg_to_gene[list(alias_intersect_2)[0]]\n", + " edges.append([gene1, gene2, row[\"x\"]])\n", + "\n", + " 
else:\n", + " # gene1 has ensemblid, gene2 does not have ensemblid\n", + " if gene2 not in gene_to_ensg:\n", + " ensg1 = gene_to_ensg[gene1]\n", + " gene1_syn = set([ensg1])\n", + " gene2_syn = set([gene2])\n", + " if gene2 in gene_name_mapping:\n", + " gene2_syn = gene_name_mapping[gene2].union(set([gene2]))\n", + "\n", + " alias_intersect_1 = gene1_syn.intersection(dataset_measured_ensg)\n", + " alias_intersect_2 = gene2_syn.intersection(dataset_measured_genes)\n", + "\n", + " if (len(alias_intersect_1) > 0) and (len(alias_intersect_2) > 0):\n", + " gene1 = norman_ensg_to_gene[list(alias_intersect_1)[0]]\n", + " gene2 = (\n", + " gene2\n", + " if gene2 in dataset_measured_genes\n", + " else list(alias_intersect_2)[0]\n", + " )\n", + " edges.append([gene1, gene2, row[\"x\"]])\n", + "\n", + " # both genes have ensemblid\n", + " else:\n", + " ensg1 = gene_to_ensg[gene1]\n", + " ensg2 = gene_to_ensg[gene2]\n", + " gene1_syn = set([ensg1])\n", + " gene2_syn = set([ensg2])\n", + "\n", + " alias_intersect_1 = gene1_syn.intersection(dataset_measured_ensg)\n", + " alias_intersect_2 = gene2_syn.intersection(dataset_measured_ensg)\n", + "\n", + " if (len(alias_intersect_1) > 0) and (len(alias_intersect_2) > 0):\n", + " gene1 = norman_ensg_to_gene[list(alias_intersect_1)[0]]\n", + " gene2 = norman_ensg_to_gene[list(alias_intersect_2)[0]]\n", + " edges.append([gene1, gene2, row[\"x\"]])" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "9e2e5c9c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
gene1gene2score
0MISPUSH1C0.212
1TUBB1USH1C0.184
2CRYMUSH1C0.289
3LIN7BUSH1C0.152
4TAX1BP3USH1C0.919
............
223491N4BP2ZNF518A0.179
223492PHIPZNF518A0.160
223493BRD2ZNF518A0.228
223494GSTP1ZNF518A0.165
223495MYL12BZNF518A0.189
\n", + "

223496 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " gene1 gene2 score\n", + "0 MISP USH1C 0.212\n", + "1 TUBB1 USH1C 0.184\n", + "2 CRYM USH1C 0.289\n", + "3 LIN7B USH1C 0.152\n", + "4 TAX1BP3 USH1C 0.919\n", + "... ... ... ...\n", + "223491 N4BP2 ZNF518A 0.179\n", + "223492 PHIP ZNF518A 0.160\n", + "223493 BRD2 ZNF518A 0.228\n", + "223494 GSTP1 ZNF518A 0.165\n", + "223495 MYL12B ZNF518A 0.189\n", + "\n", + "[223496 rows x 3 columns]" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "norman_network = pd.DataFrame(edges, columns=[\"gene1\", \"gene2\", \"score\"])\n", + "norman_network" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "id": "b6bd6d19", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
gene1gene2score
0ZNF720ZNF7200.000
1LINC00467LINC004670.000
2RP3-473L9.4RP3-473L9.40.000
3RP11-380O24.1RP11-380O24.10.000
4LINC01597LINC015970.000
............
225103N4BP2ZNF518A0.179
225104PHIPZNF518A0.160
225105BRD2ZNF518A0.228
225106GSTP1ZNF518A0.165
225107MYL12BZNF518A0.189
\n", + "

225108 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " gene1 gene2 score\n", + "0 ZNF720 ZNF720 0.000\n", + "1 LINC00467 LINC00467 0.000\n", + "2 RP3-473L9.4 RP3-473L9.4 0.000\n", + "3 RP11-380O24.1 RP11-380O24.1 0.000\n", + "4 LINC01597 LINC01597 0.000\n", + "... ... ... ...\n", + "225103 N4BP2 ZNF518A 0.179\n", + "225104 PHIP ZNF518A 0.160\n", + "225105 BRD2 ZNF518A 0.228\n", + "225106 GSTP1 ZNF518A 0.165\n", + "225107 MYL12B ZNF518A 0.189\n", + "\n", + "[225108 rows x 3 columns]" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# add genes measured in Norman with no connections found in StringDB\n", + "missing_genes = dataset_measured_genes.difference(set(norman_network[\"gene1\"].unique()))\n", + "for g in missing_genes:\n", + " norman_network = pd.concat(\n", + " [pd.DataFrame([[g, g, 0]], columns=norman_network.columns), norman_network],\n", + " ignore_index=True,\n", + " )\n", + "norman_network" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "id": "05005170", + "metadata": {}, + "outputs": [], + "source": [ + "# create one-hot-encoding mapping based on adata var structure\n", + "adata_subset.var_names = adata_subset.var[\"gene_symbols\"]\n", + "\n", + "\n", + "def map_gene_to_onehot(name: str):\n", + " return adata_subset.var_names.get_loc(name)\n", + "\n", + "\n", + "norman_network[\"gene1\"] = norman_network[\"gene1\"].apply(map_gene_to_onehot)\n", + "norman_network[\"gene2\"] = norman_network[\"gene2\"].apply(map_gene_to_onehot)\n", + "norman_network = norman_network.sort_values(by=[\"gene1\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "b884a780", + "metadata": {}, + "outputs": [], + "source": [ + "# create weighted adj matrix\n", + "import networkx\n", + "\n", + "edgeList = norman_network.values.tolist()\n", + "G = networkx.DiGraph()\n", + "for i in range(len(edgeList)):\n", + " G.add_edge(edgeList[i][0], edgeList[i][1], weight=edgeList[i][2])\n", + "A = 
networkx.adjacency_matrix(\n", + " G, nodelist=[i for i in range(len(adata_subset.var_names))]\n", + ").toarray()" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "cbd774ea", + "metadata": {}, + "outputs": [], + "source": [ + "# turn to sparse\n", + "from scipy import sparse\n", + "\n", + "sA = sparse.csr_matrix(A)" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "id": "3f262408", + "metadata": {}, + "outputs": [], + "source": [ + "# write to adata\n", + "adata_subset.uns[\"sparse_gene_network\"] = sA\n", + "adata_subset.write_h5ad(\"../norman_adata_spectra.h5ad\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "93494551", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pertspectra", + "language": "python", + "name": "pertspectra" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/data_preprocessing/replogle_prior_graph_preprocessing.ipynb b/data_preprocessing/replogle_prior_graph_preprocessing.ipynb new file mode 100644 index 0000000..8d5f174 --- /dev/null +++ b/data_preprocessing/replogle_prior_graph_preprocessing.ipynb @@ -0,0 +1,1102 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "3cc45e64", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2d35f5c7", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "import pandas as pd\n", + "import scanpy as sc\n", + "from matplotlib import pyplot as plt\n", + "\n", + "sys.path.append(\"..\")\n", + "from utils import read_aws_csv, read_aws_h5ad" + ] + }, + { + 
"cell_type": "markdown", + "id": "0da5cab4-9744-4009-8c38-4623a5b26279", + "metadata": {}, + "source": [ + "## Data Preprocessing\n", + "- Ensure that the data is downloaded (see `../data` for instructions)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b5f89eea", + "metadata": {}, + "outputs": [], + "source": [ + "data_path = \"path to raw replogle.h5ad\"\n", + "adata = read_aws_h5ad(data_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "05562663", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "adata[adata.obs[\"gene\"] != \"non-targeting\"].obs[\"gene\"].value_counts().hist(bins=200)\n", + "plt.title(\"Number of perturbations by number of cells\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "92623d3e", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "adata[adata.obs[\"gene_transcript\"] != \"non-targeting\"].obs[\n", + " \"gene_transcript\"\n", + "].value_counts().hist(bins=200)\n", + "plt.title(\"Number of gene transcripts by number of cells\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "7b643a2f", + "metadata": {}, + "outputs": [], + "source": [ + "sc.pp.filter_cells(adata, min_counts=100)\n", + "sc.pp.filter_genes(adata, min_cells=100)\n", + "sc.pp.normalize_total(adata, target_sum=1e4)\n", + "sc.pp.log1p(adata)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "cb93a4f5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2057" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# get perturbation genes\n", + "pert_list = set()\n", + "for t in adata.obs[\"gene\"]:\n", + " if \"+\" not in t and t != \"non-targeting\":\n", + " pert_list.add(t)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "c0cc203f", + "metadata": {}, + "outputs": [], + "source": [ + "# subset to hvg\n", + "sc.pp.highly_variable_genes(adata, n_top_genes=5000, subset=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "37d7ff87", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "View of AnnData object with n_obs × n_vars = 310385 × 5000\n", + " obs: 'gem_group', 'gene', 'gene_id', 'transcript', 'gene_transcript', 'sgID_AB', 'mitopercent', 'UMI_count', 'z_gemgroup_UMI', 'core_scale_factor', 'core_adjusted_UMI_count'\n", + " var: 'gene_name', 'chr', 'start', 'end', 'class', 'strand', 'length', 'in_matrix', 'mean', 'std', 'cv', 'fano', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'\n", + " uns: 'log1p', 'hvg'" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# ensure all 
perts are in adata\n", + "idx = []\n", + "for i in range(len(adata.var_names)):\n", + " if (adata.var[\"highly_variable\"][i]) or (adata.var_names[i] in pert_list):\n", + " idx.append(i)\n", + "adata_subset = adata[:, idx]\n", + "adata_subset" + ] + }, + { + "cell_type": "markdown", + "id": "2f2ad658-a6cb-465c-b826-f34fd4fbdbed", + "metadata": {}, + "source": [ + "## StringDB prior graph" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "cade1ddc", + "metadata": {}, + "outputs": [], + "source": [ + "# stringdb prior\n", + "stringdb_hq = read_aws_csv(\"s3://pert-spectra/references/StringDB.HQ.txt\", sep=\"\\t\")" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "9410883b", + "metadata": {}, + "outputs": [], + "source": [ + "# ensg mapping\n", + "gene_name_df = read_aws_csv(\n", + " \"s3://pert-spectra/references/Homo_sapiens.gene_info\", sep=\"\\t\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "977803fa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
#tax_idGeneIDSymbolLocusTagSynonymsdbXrefschromosomemap_locationdescriptiontype_of_geneSymbol_from_nomenclature_authorityFull_name_from_nomenclature_authorityNomenclature_statusOther_designationsModification_dateFeature_type
096061A1BG-A1B|ABG|GAB|HYST2477MIM:138670|HGNC:HGNC:5|Ensembl:ENSG00000121410...1919q13.43alpha-1-B glycoproteinprotein-codingA1BGalpha-1-B glycoproteinOalpha-1B-glycoprotein|HEL-S-163pA|epididymis s...20240617-
196062A2M-A2MD|CPAMD5|FWP007|S863-7MIM:103950|HGNC:HGNC:7|Ensembl:ENSG00000175899...1212p13.31alpha-2-macroglobulinprotein-codingA2Malpha-2-macroglobulinOalpha-2-macroglobulin|C3 and PZP-like alpha-2-...20240617-
296063A2MP1-A2MPHGNC:HGNC:8|Ensembl:ENSG00000291190|AllianceGe...1212p13.31alpha-2-macroglobulin pseudogene 1pseudoA2MP1alpha-2-macroglobulin pseudogene 1Opregnancy-zone protein pseudogene20240617-
396069NAT1-AAC1|MNAT|NAT-1|NATIMIM:108345|HGNC:HGNC:7645|Ensembl:ENSG00000171...88p22N-acetyltransferase 1protein-codingNAT1N-acetyltransferase 1Oarylamine N-acetyltransferase 1|N-acetyltransf...20240617-
4960610NAT2-AAC2|NAT-2|PNATMIM:612182|HGNC:HGNC:7646|Ensembl:ENSG00000156...88p22N-acetyltransferase 2protein-codingNAT2N-acetyltransferase 2Oarylamine N-acetyltransferase 2|N-acetyltransf...20240617-
...................................................
1934517411588923215trnD---MT-tRNA-AsptRNA----20200909-
1934527411588923216trnP---MT-tRNA-ProtRNA----20200909-
1934537411588923217trnA---MT-tRNA-AlatRNA----20200909-
1934547411588923218COX1---MT-cytochrome c oxidase subunit Iprotein-coding---cytochrome c oxidase subunit I20230818-
193455741158892321916S rRNA---MT-l-rRNArRNA----20200909-
\n", + "

193456 rows × 16 columns

\n", + "
" + ], + "text/plain": [ + " #tax_id GeneID Symbol LocusTag Synonyms \\\n", + "0 9606 1 A1BG - A1B|ABG|GAB|HYST2477 \n", + "1 9606 2 A2M - A2MD|CPAMD5|FWP007|S863-7 \n", + "2 9606 3 A2MP1 - A2MP \n", + "3 9606 9 NAT1 - AAC1|MNAT|NAT-1|NATI \n", + "4 9606 10 NAT2 - AAC2|NAT-2|PNAT \n", + "... ... ... ... ... ... \n", + "193451 741158 8923215 trnD - - \n", + "193452 741158 8923216 trnP - - \n", + "193453 741158 8923217 trnA - - \n", + "193454 741158 8923218 COX1 - - \n", + "193455 741158 8923219 16S rRNA - - \n", + "\n", + " dbXrefs chromosome \\\n", + "0 MIM:138670|HGNC:HGNC:5|Ensembl:ENSG00000121410... 19 \n", + "1 MIM:103950|HGNC:HGNC:7|Ensembl:ENSG00000175899... 12 \n", + "2 HGNC:HGNC:8|Ensembl:ENSG00000291190|AllianceGe... 12 \n", + "3 MIM:108345|HGNC:HGNC:7645|Ensembl:ENSG00000171... 8 \n", + "4 MIM:612182|HGNC:HGNC:7646|Ensembl:ENSG00000156... 8 \n", + "... ... ... \n", + "193451 - MT \n", + "193452 - MT \n", + "193453 - MT \n", + "193454 - MT \n", + "193455 - MT \n", + "\n", + " map_location description type_of_gene \\\n", + "0 19q13.43 alpha-1-B glycoprotein protein-coding \n", + "1 12p13.31 alpha-2-macroglobulin protein-coding \n", + "2 12p13.31 alpha-2-macroglobulin pseudogene 1 pseudo \n", + "3 8p22 N-acetyltransferase 1 protein-coding \n", + "4 8p22 N-acetyltransferase 2 protein-coding \n", + "... ... ... ... \n", + "193451 - tRNA-Asp tRNA \n", + "193452 - tRNA-Pro tRNA \n", + "193453 - tRNA-Ala tRNA \n", + "193454 - cytochrome c oxidase subunit I protein-coding \n", + "193455 - l-rRNA rRNA \n", + "\n", + " Symbol_from_nomenclature_authority \\\n", + "0 A1BG \n", + "1 A2M \n", + "2 A2MP1 \n", + "3 NAT1 \n", + "4 NAT2 \n", + "... ... 
\n", + "193451 - \n", + "193452 - \n", + "193453 - \n", + "193454 - \n", + "193455 - \n", + "\n", + " Full_name_from_nomenclature_authority Nomenclature_status \\\n", + "0 alpha-1-B glycoprotein O \n", + "1 alpha-2-macroglobulin O \n", + "2 alpha-2-macroglobulin pseudogene 1 O \n", + "3 N-acetyltransferase 1 O \n", + "4 N-acetyltransferase 2 O \n", + "... ... ... \n", + "193451 - - \n", + "193452 - - \n", + "193453 - - \n", + "193454 - - \n", + "193455 - - \n", + "\n", + " Other_designations Modification_date \\\n", + "0 alpha-1B-glycoprotein|HEL-S-163pA|epididymis s... 20240617 \n", + "1 alpha-2-macroglobulin|C3 and PZP-like alpha-2-... 20240617 \n", + "2 pregnancy-zone protein pseudogene 20240617 \n", + "3 arylamine N-acetyltransferase 1|N-acetyltransf... 20240617 \n", + "4 arylamine N-acetyltransferase 2|N-acetyltransf... 20240617 \n", + "... ... ... \n", + "193451 - 20200909 \n", + "193452 - 20200909 \n", + "193453 - 20200909 \n", + "193454 cytochrome c oxidase subunit I 20230818 \n", + "193455 - 20200909 \n", + "\n", + " Feature_type \n", + "0 - \n", + "1 - \n", + "2 - \n", + "3 - \n", + "4 - \n", + "... ... 
\n", + "193451 - \n", + "193452 - \n", + "193453 - \n", + "193454 - \n", + "193455 - \n", + "\n", + "[193456 rows x 16 columns]" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gene_name_df" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "837bd514", + "metadata": {}, + "outputs": [], + "source": [ + "gene_to_ensg = {}\n", + "gene_name_mapping = {}\n", + "for symbol, synonyms, ref in zip(\n", + " gene_name_df[\"Symbol\"], gene_name_df[\"Synonyms\"], gene_name_df[\"dbXrefs\"]\n", + "):\n", + " syn_set = set(synonyms.split(\"|\")).union(set([symbol]))\n", + " refs = ref.split(\"|\")\n", + " ensg = None\n", + " for r in refs:\n", + " label = r.split(\":\")\n", + " if label[0] == \"Ensembl\":\n", + " ensg = label[1]\n", + " break\n", + " if ensg is None:\n", + " continue\n", + " assert ensg[:4] == \"ENSG\"\n", + " gene_to_ensg[symbol] = ensg\n", + " gene_name_mapping[symbol] = syn_set\n", + " # make sure name mapping goes both ways\n", + " for syn in syn_set:\n", + " if syn in gene_name_mapping:\n", + " gene_name_mapping[syn].add(symbol)\n", + " else:\n", + " gene_name_mapping[syn] = set([symbol])" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "4c10530a", + "metadata": {}, + "outputs": [], + "source": [ + "replogle_ensg_to_gene = {}\n", + "for symbol, ensgid in zip(adata_subset.var[\"gene_name\"], adata_subset.var_names):\n", + " replogle_ensg_to_gene[ensgid] = symbol" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "5a884a01", + "metadata": {}, + "outputs": [], + "source": [ + "dataset_measured_genes = set(adata_subset.var[\"gene_name\"])\n", + "dataset_measured_ensg = set(adata_subset.var_names)" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "73bd2fda", + "metadata": {}, + "outputs": [], + "source": [ + "# first, check if gene name has ensembl id\n", + "# if it does, use that to match\n", + "# else, use gene name 
mapping\n", + "edges = []\n", + "for index, row in stringdb_hq.iterrows():\n", + " gene1 = row[\"i_genes\"]\n", + " gene2 = row[\"j_genes\"]\n", + " if gene1 not in gene_to_ensg:\n", + " # neither gene has ensg\n", + " if gene2 not in gene_to_ensg:\n", + " gene1_syn = set([gene1])\n", + " gene2_syn = set([gene2])\n", + " if gene1 in gene_name_mapping:\n", + " gene1_syn = gene_name_mapping[gene1].union(set([gene1]))\n", + " if gene2 in gene_name_mapping:\n", + " gene2_syn = gene_name_mapping[gene2].union(set([gene2]))\n", + "\n", + " alias_intersect_1 = gene1_syn.intersection(dataset_measured_genes)\n", + " alias_intersect_2 = gene2_syn.intersection(dataset_measured_genes)\n", + "\n", + " if (len(alias_intersect_1) > 0) and (len(alias_intersect_2) > 0):\n", + " gene1 = (\n", + " gene1\n", + " if gene1 in dataset_measured_genes\n", + " else list(alias_intersect_1)[0]\n", + " )\n", + " gene2 = (\n", + " gene2\n", + " if gene2 in dataset_measured_genes\n", + " else list(alias_intersect_2)[0]\n", + " )\n", + " edges.append([gene1, gene2, row[\"x\"]])\n", + "\n", + " # gene1 does not have ensemblid, gene2 does have ensemblid\n", + " else:\n", + " ensg2 = gene_to_ensg[gene2]\n", + " gene1_syn = set([gene1])\n", + " gene2_syn = set([ensg2])\n", + " if gene1 in gene_name_mapping:\n", + " gene1_syn = gene_name_mapping[gene1].union(set([gene1]))\n", + "\n", + " alias_intersect_1 = gene1_syn.intersection(dataset_measured_genes)\n", + " alias_intersect_2 = gene2_syn.intersection(dataset_measured_ensg)\n", + "\n", + " if (len(alias_intersect_1) > 0) and (len(alias_intersect_2) > 0):\n", + " gene1 = (\n", + " gene1\n", + " if gene1 in dataset_measured_genes\n", + " else list(alias_intersect_1)[0]\n", + " )\n", + " gene2 = replogle_ensg_to_gene[list(alias_intersect_2)[0]]\n", + " edges.append([gene1, gene2, row[\"x\"]])\n", + "\n", + " else:\n", + " # gene1 has ensemblid, gene2 does not have ensemblid\n", + " if gene2 not in gene_to_ensg:\n", + " ensg1 = gene_to_ensg[gene1]\n", + 
" gene1_syn = set([ensg1])\n", + " gene2_syn = set([gene2])\n", + " if gene2 in gene_name_mapping:\n", + " gene2_syn = gene_name_mapping[gene2].union(set([gene2]))\n", + "\n", + " alias_intersect_1 = gene1_syn.intersection(dataset_measured_ensg)\n", + " alias_intersect_2 = gene2_syn.intersection(dataset_measured_genes)\n", + "\n", + " if (len(alias_intersect_1) > 0) and (len(alias_intersect_2) > 0):\n", + " gene1 = replogle_ensg_to_gene[list(alias_intersect_1)[0]]\n", + " gene2 = (\n", + " gene2\n", + " if gene2 in dataset_measured_genes\n", + " else list(alias_intersect_2)[0]\n", + " )\n", + " edges.append([gene1, gene2, row[\"x\"]])\n", + "\n", + " # both genes have ensmblid\n", + " else:\n", + " ensg1 = gene_to_ensg[gene1]\n", + " ensg2 = gene_to_ensg[gene2]\n", + " gene1_syn = set([ensg1])\n", + " gene2_syn = set([ensg2])\n", + "\n", + " alias_intersect_1 = gene1_syn.intersection(dataset_measured_ensg)\n", + " alias_intersect_2 = gene2_syn.intersection(dataset_measured_ensg)\n", + "\n", + " if (len(alias_intersect_1) > 0) and (len(alias_intersect_2) > 0):\n", + " gene1 = replogle_ensg_to_gene[list(alias_intersect_1)[0]]\n", + " gene2 = replogle_ensg_to_gene[list(alias_intersect_2)[0]]\n", + " edges.append([gene1, gene2, row[\"x\"]])" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "9e2e5c9c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
gene1gene2score
0M6PRARF50.157
1MIFARF50.164
2AP4S1ARF50.250
3CHMP4BARF50.327
4NME4ARF50.212
............
801575PTGES3EIF3L0.150
801576NR2C2EIF3L0.270
801577NSA2EIF3L0.315
801578PSMB3EIF3L0.169
801579ADSLEIF3L0.161
\n", + "

801580 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " gene1 gene2 score\n", + "0 M6PR ARF5 0.157\n", + "1 MIF ARF5 0.164\n", + "2 AP4S1 ARF5 0.250\n", + "3 CHMP4B ARF5 0.327\n", + "4 NME4 ARF5 0.212\n", + "... ... ... ...\n", + "801575 PTGES3 EIF3L 0.150\n", + "801576 NR2C2 EIF3L 0.270\n", + "801577 NSA2 EIF3L 0.315\n", + "801578 PSMB3 EIF3L 0.169\n", + "801579 ADSL EIF3L 0.161\n", + "\n", + "[801580 rows x 3 columns]" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "replogle_network = pd.DataFrame(edges, columns=[\"gene1\", \"gene2\", \"score\"])\n", + "replogle_network" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "b6bd6d19", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
gene1gene2score
0SEPTIN2SEPTIN20.000
1ZNF433-AS1ZNF433-AS10.000
2SLC16A1-AS1SLC16A1-AS10.000
3AP000873.2AP000873.20.000
4CHKB-DTCHKB-DT0.000
............
802163PTGES3EIF3L0.150
802164NR2C2EIF3L0.270
802165NSA2EIF3L0.315
802166PSMB3EIF3L0.169
802167ADSLEIF3L0.161
\n", + "

802168 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " gene1 gene2 score\n", + "0 SEPTIN2 SEPTIN2 0.000\n", + "1 ZNF433-AS1 ZNF433-AS1 0.000\n", + "2 SLC16A1-AS1 SLC16A1-AS1 0.000\n", + "3 AP000873.2 AP000873.2 0.000\n", + "4 CHKB-DT CHKB-DT 0.000\n", + "... ... ... ...\n", + "802163 PTGES3 EIF3L 0.150\n", + "802164 NR2C2 EIF3L 0.270\n", + "802165 NSA2 EIF3L 0.315\n", + "802166 PSMB3 EIF3L 0.169\n", + "802167 ADSL EIF3L 0.161\n", + "\n", + "[802168 rows x 3 columns]" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# add genes measured in Replogle with no connections found in StringDB\n", + "missing_genes = dataset_measured_genes.difference(\n", + " set(replogle_network[\"gene1\"].unique())\n", + ")\n", + "for g in missing_genes:\n", + " replogle_network = pd.concat(\n", + " [pd.DataFrame([[g, g, 0]], columns=replogle_network.columns), replogle_network],\n", + " ignore_index=True,\n", + " )\n", + "replogle_network" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05005170", + "metadata": {}, + "outputs": [], + "source": [ + "# create one-hot-encoding mapping based on adata var structure\n", + "adata_subset.var_names = adata_subset.var[\"gene_name\"]\n", + "\n", + "\n", + "def map_gene_to_onehot(name: str):\n", + " return adata_subset.var_names.get_loc(name)\n", + "\n", + "\n", + "replogle_network[\"gene1\"] = replogle_network[\"gene1\"].apply(map_gene_to_onehot)\n", + "replogle_network[\"gene2\"] = replogle_network[\"gene2\"].apply(map_gene_to_onehot)\n", + "replogle_network = replogle_network.sort_values(by=[\"gene1\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "b884a780", + "metadata": {}, + "outputs": [], + "source": [ + "# create weighted adj matrix\n", + "import networkx\n", + "\n", + "edgeList = replogle_network.values.tolist()\n", + "G = networkx.DiGraph()\n", + "for i in range(len(edgeList)):\n", + " G.add_edge(edgeList[i][0], edgeList[i][1], weight=edgeList[i][2])\n", + 
"A = networkx.adjacency_matrix(\n", + " G, nodelist=[i for i in range(len(adata_subset.var_names))]\n", + ").toarray()" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "cbd774ea", + "metadata": {}, + "outputs": [], + "source": [ + "# turn to sparse\n", + "from scipy import sparse\n", + "\n", + "sA = sparse.csr_matrix(A)" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "8a481c1a", + "metadata": {}, + "outputs": [], + "source": [ + "adata_subset.X = sparse.csr_matrix(adata_subset.X)\n", + "adata_subset.layers[\"logcounts\"] = adata_subset.X.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "3f262408", + "metadata": {}, + "outputs": [], + "source": [ + "# write to adata\n", + "adata_subset.uns[\"sparse_gene_network\"] = sA\n", + "adata_subset.write_h5ad(\"../replogle_adata_spectra.h5ad\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "93494551", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pertspectra", + "language": "python", + "name": "pertspectra" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/environment.yaml b/environment.yaml new file mode 100755 index 0000000..a3f6159 --- /dev/null +++ b/environment.yaml @@ -0,0 +1,11 @@ +name: pertspectra +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - python>=3.11,<3.12 + - pip>=23.2.0 + - pre_commit>=2.20.0 + - pytest-cov>=2.7.1 + - pytest>=7.2.0 diff --git a/figures/figures.ipynb b/figures/figures.ipynb new file mode 100644 index 0000000..3a1d0da --- /dev/null +++ b/figures/figures.ipynb @@ -0,0 +1,813 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + 
"id": "78401ded-13c1-40c3-b027-9b71f36ed06e", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "b8dbb8d4-7a06-4c04-8eaa-4b57734bc4de", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "\n", + "sys.path.append(\"..\")\n", + "from utils import read_aws_csv" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "244869a4-dfa7-4b2d-b8e1-f87c79e2349e", + "metadata": {}, + "outputs": [], + "source": [ + "# aws s3 path to figures data\n", + "figure_data_path = \"s3://pert-spectra/figures/\"" + ] + }, + { + "cell_type": "markdown", + "id": "198bd21e-c8da-4c40-9bd9-927a402ce0af", + "metadata": {}, + "source": [ + "# Time complexity" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "46c09872-9f78-401d-9678-6c6eaaefc14e", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Load the data\n", + "time_df = read_aws_csv(figure_data_path + \"runtime/time_complexity_cells.txt\", sep=\",\")\n", + "time_df[\" time\"] = time_df[\" time\"] / 60 # Convert time to minutes\n", + "\n", + "# Initialize the matplotlib figure\n", + "plt.figure(figsize=(10, 6))\n", + "\n", + "# Plot with Seaborn\n", + "sns.set(style=\"whitegrid\")\n", + "sns.lineplot(data=time_df, x=\" cells\", y=\" time\", hue=\"model\", marker=\"o\")\n", + "\n", + "# Add labels and title\n", + "plt.xlabel(\"Number of Cells\")\n", + "plt.ylabel(\"Minutes\")\n", + "plt.title(\"Training Time vs. Number of Cells for Different Models\")\n", + "plt.legend(title=\"Model\")\n", + "plt.grid(True)\n", + "plt.yscale(\"log\") # Optional: Use logarithmic scale for y-axis for better visualization\n", + "\n", + "# Show plot\n", + "plt.savefig(\"figure_pngs/training_runtime_cells.png\", dpi=600, bbox_inches=\"tight\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "89468d72-019e-4c28-8fd5-d5da29e01699", + "metadata": {}, + "source": [ + "# Reconstruction - Spearman Correlation" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "853ccb5a-7507-4b96-aec1-f40f2e76a87d", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "recon_path = \"reconstruction_spearmans/\"\n", + "data = {\n", + " \"Inhouse\": {\n", + " \"PertSpectra\": list(\n", + " read_aws_csv(\n", + " figure_data_path\n", + " + recon_path\n", + " + \"pertspectra_inhouse_spearman_correlations.csv\",\n", + " header=None,\n", + " )[0]\n", + " ),\n", + " \"scETM\": list(\n", + " read_aws_csv(\n", + " figure_data_path\n", + " + recon_path\n", + " + \"scETM_inhouse_spearman_correlations.csv\",\n", + " header=None,\n", + " )[0]\n", + " ),\n", + " \"GSFA\": list(\n", + " read_aws_csv(\n", + " figure_data_path\n", + " + recon_path\n", + " + \"GSFA_inhouse_spearman_correlations.csv\",\n", + " header=None,\n", + " )[0]\n", + " ),\n", + " },\n", + " \"Norman\": {\n", + " \"PertSpectra\": list(\n", + " read_aws_csv(\n", + " figure_data_path\n", + " + recon_path\n", + " + \"pertspectra_norman_spearman_correlations.csv\",\n", + " header=None,\n", + " )[0]\n", + " ),\n", + " \"scETM\": list(\n", + " read_aws_csv(\n", + " figure_data_path\n", + " + recon_path\n", + " + \"scETM_norman_spearman_correlations.csv\",\n", + " header=None,\n", + " )[0]\n", + " ),\n", + " \"GSFA\": list(\n", + " read_aws_csv(\n", + " figure_data_path + recon_path + \"GSFA_norman_spearman_correlations.csv\",\n", + " header=None,\n", + " )[0]\n", + " ),\n", + " },\n", + " \"Replogle\": {\n", + " \"PertSpectra\": list(\n", + " read_aws_csv(\n", + " figure_data_path\n", + " + recon_path\n", + " + \"pertspectra_replogle_spearman_correlations.csv\",\n", + " header=None,\n", + " )[0]\n", + " ),\n", + " \"scETM\": list(\n", + " read_aws_csv(\n", + " figure_data_path\n", + " + recon_path\n", + " + \"scETM_replogle_spearman_correlations.csv\",\n", + " header=None,\n", + " )[0]\n", + " ),\n", + " \"GSFA\": [\n", + " 0\n", + " for _ in range(\n", + " len(\n", + " list(\n", + " read_aws_csv(\n", + " 
figure_data_path\n", + " + recon_path\n", + " + \"scETM_replogle_spearman_correlations.csv\",\n", + " header=None,\n", + " )[0]\n", + " )\n", + " )\n", + " )\n", + " ],\n", + " },\n", + "}\n", + "\n", + "# Prepare data for Seaborn\n", + "records = []\n", + "for dataset in data:\n", + " for model in data[dataset]:\n", + " for value in data[dataset][model]:\n", + " records.append(\n", + " {\"Dataset\": dataset, \"Model\": model, \"Spearman Correlation\": value}\n", + " )\n", + "\n", + "df = pd.DataFrame(records)\n", + "\n", + "# Initialize the matplotlib figure\n", + "plt.figure(figsize=(10, 6))\n", + "\n", + "# Plot with Seaborn\n", + "sns.set(style=\"whitegrid\")\n", + "ax = sns.barplot(\n", + " x=\"Dataset\",\n", + " y=\"Spearman Correlation\",\n", + " hue=\"Model\",\n", + " data=df,\n", + " errorbar=\"sd\",\n", + " capsize=0.1,\n", + ")\n", + "\n", + "# Optionally add text labels above the bars\n", + "for i, p in enumerate(ax.patches):\n", + " if i == 6:\n", + " break\n", + " height = p.get_height()\n", + " ax.annotate(\n", + " f\"{height:.2f}\",\n", + " xy=(p.get_x() + p.get_width() / 2, height),\n", + " xytext=(0, 3), # 3 points vertical offset\n", + " textcoords=\"offset points\",\n", + " ha=\"center\",\n", + " va=\"bottom\",\n", + " )\n", + "\n", + "# Add a title and labels\n", + "ax.set_title(\"Reconstruction of Gene Expression\")\n", + "ax.set_ylabel(\"Spearman Correlation\")\n", + "\n", + "# Show legend and plot\n", + "plt.legend(title=\"Model\")\n", + "plt.savefig(\"figure_pngs/recon.png\", dpi=600, bbox_inches=\"tight\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "58ddcb38-4830-4dbf-8b63-d740632b02e6", + "metadata": {}, + "source": [ + "# Signal Recovery/Interpretability of Factors" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "a7cbb85a-aa7a-4ec2-b6f6-a5f7575cf08a", + "metadata": {}, + "outputs": [], + "source": [ + "# csvs containing pvalues per perturbation from the hypergeometric tests\n", + 
"model_recovery_inhouse = read_aws_csv(\n", + " figure_data_path\n", + " + \"process_recovery_hypergeo_pvals/pertspectra_inhouse_hypergeo_neighbors_recovery_pvalues.csv\",\n", + " header=None,\n", + ")\n", + "gsfa_recovery_inhouse = read_aws_csv(\n", + " figure_data_path\n", + " + \"process_recovery_hypergeo_pvals/GSFA_inhouse_hypergeo_neighbors_recovery_pvalues.csv\",\n", + " header=None,\n", + ")\n", + "model_recovery_norman = read_aws_csv(\n", + " figure_data_path\n", + " + \"process_recovery_hypergeo_pvals/pertspectra_norman_hypergeo_neighbors_recovery_pvalues.csv\",\n", + " header=None,\n", + ")\n", + "gsfa_recovery_norman = read_aws_csv(\n", + " figure_data_path\n", + " + \"process_recovery_hypergeo_pvals/GSFA_norman_hypergeo_neighbors_recovery_pvalues.csv\",\n", + " header=None,\n", + ")\n", + "model_recovery_replogle = read_aws_csv(\n", + " figure_data_path\n", + " + \"process_recovery_hypergeo_pvals/pertspectra_replogle_hypergeo_neighbors_recovery_pvalues.csv\",\n", + " header=None,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "54086b31-2a3e-4caf-8eda-22ae35b52dff", + "metadata": {}, + "outputs": [], + "source": [ + "# inhouse\n", + "assert len(model_recovery_inhouse) == len(gsfa_recovery_inhouse)\n", + "df = model_recovery_inhouse.merge(gsfa_recovery_inhouse, left_on=0, right_on=0)\n", + "model_inhouse_recovered = 0\n", + "gsfa_inhouse_recovered = 0\n", + "for i, row in df.iterrows():\n", + " if row[\"1_x\"] < 0.001:\n", + " model_inhouse_recovered += 1\n", + " if row[\"1_y\"] < 0.001:\n", + " gsfa_inhouse_recovered += 1\n", + "model_inhouse_recovered /= len(model_recovery_inhouse)\n", + "gsfa_inhouse_recovered /= len(gsfa_recovery_inhouse)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "2b19d42a-8061-49de-8c51-c5a7f3d2371a", + "metadata": {}, + "outputs": [], + "source": [ + "# norman\n", + "assert len(model_recovery_norman) == len(gsfa_recovery_norman)\n", + "df = 
model_recovery_norman.merge(gsfa_recovery_norman, left_on=0, right_on=0)\n", + "model_norman_recovered = 0\n", + "gsfa_norman_recovered = 0\n", + "for i, row in df.iterrows():\n", + " if row[\"1_x\"] < 0.001:\n", + " model_norman_recovered += 1\n", + " if row[\"1_y\"] < 0.001:\n", + " gsfa_norman_recovered += 1\n", + "model_norman_recovered /= len(model_recovery_norman)\n", + "gsfa_norman_recovered /= len(gsfa_recovery_norman)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "ebec88ad-54f0-4e8a-96e4-5c2a87308721", + "metadata": {}, + "outputs": [], + "source": [ + "# replogle\n", + "model_replogle_recovered = 0\n", + "for i, row in model_recovery_replogle.iterrows():\n", + " if row[1] < 0.001:\n", + " model_replogle_recovered += 1\n", + "model_replogle_recovered /= len(model_recovery_replogle)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "bcebc9de-b1b0-4393-9f6b-6b9680feecdb", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# plot fraction of perturbations recovered\n", + "# Define the metrics\n", + "datasets = [\"Inhouse\", \"Norman\", \"Replogle\"]\n", + "models = [\"PertSpectra\", \"GSFA\"]\n", + "values = {\n", + " \"Inhouse\": [model_inhouse_recovered, gsfa_inhouse_recovered],\n", + " \"Norman\": [model_norman_recovered, gsfa_norman_recovered],\n", + " \"Replogle\": [model_replogle_recovered, 0.0],\n", + "}\n", + "\n", + "# Prepare data in long-form for Seaborn\n", + "records = []\n", + "for dataset in datasets:\n", + " for i, model in enumerate(models):\n", + " records.append(\n", + " {\n", + " \"Dataset\": dataset,\n", + " \"Model\": model,\n", + " \"Fraction of Perturbations\": values[dataset][i],\n", + " }\n", + " )\n", + "\n", + "df = pd.DataFrame(records)\n", + "\n", + "# Initialize the matplotlib figure\n", + "plt.figure(figsize=(8, 6))\n", + "\n", + "# Plot with Seaborn\n", + "sns.set(style=\"whitegrid\")\n", + "ax = sns.barplot(\n", + " x=\"Dataset\", y=\"Fraction of Perturbations\", hue=\"Model\", data=df, errorbar=None\n", + ")\n", + "\n", + "# Add a title and labels\n", + "ax.set_title(\"Fraction of Perturbations With Recovered Signal\")\n", + "ax.set_ylabel(\"Fraction of Perturbations\")\n", + "\n", + "# Optionally add text labels above the bars\n", + "for i, p in enumerate(ax.patches):\n", + " if i == 6:\n", + " break\n", + " height = p.get_height()\n", + " ax.annotate(\n", + " f\"{height:.2f}\",\n", + " xy=(p.get_x() + p.get_width() / 2, height),\n", + " xytext=(0, 3), # 3 points vertical offset\n", + " textcoords=\"offset points\",\n", + " ha=\"center\",\n", + " va=\"bottom\",\n", + " )\n", + "\n", + "# Show legend and plot\n", + "plt.legend(title=\"Model\")\n", + "plt.savefig(\"recovered_GO.png\", dpi=600, bbox_inches=\"tight\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "333b3171-2090-428c-a92c-fd0943a03675", + "metadata": {}, + "source": [ + "# AUPRC" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "41226468-7e0a-44dc-a6bb-6c85e5d51b26", + "metadata": {}, + "outputs": [], + "source": [ + "# csvs containing recall metrics\n", + "model_aucpr_inhouse = read_aws_csv(\n", + " figure_data_path + \"pert_embedding_recall/pertspectra_inhouse_aucpr.csv\"\n", + ")\n", + "scETM_aucpr_inhouse = read_aws_csv(\n", + " figure_data_path + \"pert_embedding_recall/scETM_inhouse_aucpr.csv\"\n", + ")\n", + "gsfa_aucpr_inhouse = read_aws_csv(\n", + " figure_data_path + \"pert_embedding_recall/GSFA_inhouse_aucpr.csv\"\n", + ")\n", + "model_aucpr_norman = read_aws_csv(\n", + " figure_data_path + \"pert_embedding_recall/pertspectra_norman_aucpr.csv\"\n", + ")\n", + "scETM_aucpr_norman = read_aws_csv(\n", + " figure_data_path + \"pert_embedding_recall/scETM_norman_aucpr.csv\"\n", + ")\n", + "gsfa_aucpr_norman = read_aws_csv(\n", + " figure_data_path + \"pert_embedding_recall/GSFA_norman_aucpr.csv\"\n", + ")\n", + "model_aucpr_replogle = read_aws_csv(\n", + " figure_data_path + \"pert_embedding_recall/pertspectra_replogle_aucpr.csv\"\n", + ")\n", + "scETM_aucpr_replogle = read_aws_csv(\n", + " figure_data_path + \"pert_embedding_recall/scETM_replogle_aucpr.csv\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "1acc7695-5308-4fb5-964c-c999e3999aed", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# StringDb recall\n", + "data = {\n", + " \"Inhouse\": {\n", + " \"PertSpectra\": model_aucpr_inhouse[[\"AUC\"]].iloc[0].values[0],\n", + " \"scETM\": scETM_aucpr_inhouse[[\"AUC\"]].iloc[0].values[0],\n", + " \"GSFA\": gsfa_aucpr_inhouse[[\"AUC\"]].iloc[0].values[0],\n", + " },\n", + " \"Norman\": {\n", + " \"PertSpectra\": model_aucpr_norman[[\"AUC\"]].iloc[0].values[0],\n", + " \"scETM\": scETM_aucpr_norman[[\"AUC\"]].iloc[0].values[0],\n", + " \"GSFA\": gsfa_aucpr_norman[[\"AUC\"]].iloc[0].values[0],\n", + " },\n", + " \"Replogle\": {\n", + " \"PertSpectra\": model_aucpr_replogle[[\"AUC\"]].iloc[0].values[0],\n", + " \"scETM\": scETM_aucpr_replogle[[\"AUC\"]].iloc[0].values[0],\n", + " \"GSFA\": 0,\n", + " },\n", + "}\n", + "\n", + "datasets = data.keys()\n", + "model_labels = [\"PertSpectra\", \"scETM\", \"GSFA\"]\n", + "\n", + "# Prepare data in long-form for Seaborn\n", + "records = []\n", + "for dataset in datasets:\n", + " for model in model_labels:\n", + " records.append(\n", + " {\"Dataset\": dataset, \"Model\": model, \"AUPRC\": data[dataset][model]}\n", + " )\n", + "\n", + "df = pd.DataFrame(records)\n", + "\n", + "# Initialize the matplotlib figure\n", + "plt.figure(figsize=(8, 6))\n", + "\n", + "# Plot with Seaborn\n", + "sns.set(style=\"whitegrid\")\n", + "ax = sns.barplot(x=\"Dataset\", y=\"AUPRC\", hue=\"Model\", data=df, errorbar=None)\n", + "\n", + "# Add a title and labels\n", + "ax.set_title(\"AUPRC for StringDB\")\n", + "ax.set_ylabel(\"AUPRC\")\n", + "\n", + "# Show legend and plot\n", + "plt.legend(title=\"Model\")\n", + "plt.savefig(\"figure_pngs/auprc_stringdb.png\", dpi=600, bbox_inches=\"tight\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "8b9f6ef1-ff4a-4951-8885-177b8246c1fe", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "\n", + "# CORUM recall (row 1 of each model's AUC table)\n", + "data = {\n", + " \"Inhouse\": {\n", + " \"PertSpectra\": model_aucpr_inhouse[[\"AUC\"]].iloc[1].values[0],\n", + " \"scETM\": scETM_aucpr_inhouse[[\"AUC\"]].iloc[1].values[0],\n", + " \"GSFA\": gsfa_aucpr_inhouse[[\"AUC\"]].iloc[1].values[0],\n", + " },\n", + " \"Norman\": {\n", + " \"PertSpectra\": model_aucpr_norman[[\"AUC\"]].iloc[1].values[0],\n", + " \"scETM\": scETM_aucpr_norman[[\"AUC\"]].iloc[1].values[0],\n", + " \"GSFA\": gsfa_aucpr_norman[[\"AUC\"]].iloc[1].values[0],\n", + " },\n", + " \"Replogle\": {\n", + " \"PertSpectra\": model_aucpr_replogle[[\"AUC\"]].iloc[1].values[0],\n", + " \"scETM\": scETM_aucpr_replogle[[\"AUC\"]].iloc[1].values[0],\n", + " \"GSFA\": 0,\n", + " },\n", + "}\n", + "\n", + "datasets = data.keys()\n", + "model_labels = [\"PertSpectra\", \"scETM\", \"GSFA\"]\n", + "\n", + "# Prepare data in long-form for Seaborn\n", + "records = []\n", + "for dataset in datasets:\n", + " for model in model_labels:\n", + " records.append(\n", + " {\"Dataset\": dataset, \"Model\": model, \"AUPRC\": data[dataset][model]}\n", + " )\n", + "\n", + "df = pd.DataFrame(records)\n", + "\n", + "# Initialize the matplotlib figure\n", + "plt.figure(figsize=(8, 6))\n", + "\n", + "# Plot with Seaborn\n", + "sns.set(style=\"whitegrid\")\n", + "ax = sns.barplot(x=\"Dataset\", y=\"AUPRC\", hue=\"Model\", data=df, errorbar=None)\n", + "\n", + "# Add a title and labels\n", + "ax.set_title(\"AUPRC for CORUM\")\n", + "ax.set_ylabel(\"AUPRC\")\n", + "\n", + "# Show legend and plot\n", + "plt.legend(title=\"Model\")\n", + "plt.savefig(\"figure_pngs/auprc_corum.png\", dpi=600, bbox_inches=\"tight\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id":
"4305070d-375f-4374-9583-07fd867a4514", + "metadata": {}, + "source": [ + "# Best F1" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "83c08895-fe85-4423-9cab-8563c10fa695", + "metadata": {}, + "outputs": [], + "source": [ + "# csvs containing recall metrics\n", + "pertspectra_f1_inhouse = read_aws_csv(\n", + " figure_data_path + \"pert_embedding_recall/pertspectra_inhouse_f1.csv\"\n", + ")\n", + "scETM_f1_inhouse = read_aws_csv(\n", + " figure_data_path + \"pert_embedding_recall/scETM_inhouse_f1.csv\"\n", + ")\n", + "gsfa_f1_inhouse = read_aws_csv(\n", + " figure_data_path + \"pert_embedding_recall/GSFA_inhouse_f1.csv\"\n", + ")\n", + "pertspectra_f1_norman = read_aws_csv(\n", + " figure_data_path + \"pert_embedding_recall/pertspectra_norman_f1.csv\"\n", + ")\n", + "scETM_f1_norman = read_aws_csv(\n", + " figure_data_path + \"pert_embedding_recall/scETM_norman_f1.csv\"\n", + ")\n", + "gsfa_f1_norman = read_aws_csv(\n", + " figure_data_path + \"pert_embedding_recall/GSFA_norman_f1.csv\"\n", + ")\n", + "pertspectra_f1_replogle = read_aws_csv(\n", + " figure_data_path + \"pert_embedding_recall/pertspectra_replogle_f1.csv\"\n", + ")\n", + "scETM_f1_replogle = read_aws_csv(\n", + " figure_data_path + \"pert_embedding_recall/scETM_replogle_f1.csv\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "97cc2c12-2bf0-4045-9b7c-ce5ee0c03807", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "data = {\n", + " \"Inhouse\": {\n", + " \"PertSpectra\": pertspectra_f1_inhouse[[\"F1\"]].iloc[0].values[0],\n", + " \"scETM\": scETM_f1_inhouse[[\"F1\"]].iloc[0].values[0],\n", + " \"GSFA\": gsfa_f1_inhouse[[\"F1\"]].iloc[0].values[0],\n", + " },\n", + " \"Norman\": {\n", + " \"PertSpectra\": pertspectra_f1_norman[[\"F1\"]].iloc[0].values[0],\n", + " \"scETM\": scETM_f1_norman[[\"F1\"]].iloc[0].values[0],\n", + " \"GSFA\": gsfa_f1_norman[[\"F1\"]].iloc[0].values[0],\n", + " },\n", + " \"Replogle\": {\n", + " \"PertSpectra\": pertspectra_f1_replogle[[\"F1\"]].iloc[0].values[0],\n", + " \"scETM\": scETM_f1_replogle[[\"F1\"]].iloc[0].values[0],\n", + " \"GSFA\": 0,\n", + " },\n", + "}\n", + "\n", + "datasets = data.keys()\n", + "model_labels = [\"PertSpectra\", \"scETM\", \"GSFA\"]\n", + "\n", + "# Prepare data in long-form for Seaborn\n", + "records = []\n", + "for dataset in datasets:\n", + " for model in model_labels:\n", + " records.append({\"Dataset\": dataset, \"Model\": model, \"F1\": data[dataset][model]})\n", + "\n", + "df = pd.DataFrame(records)\n", + "\n", + "# Initialize the matplotlib figure\n", + "plt.figure(figsize=(8, 6))\n", + "\n", + "# Plot with Seaborn\n", + "sns.set(style=\"whitegrid\")\n", + "ax = sns.barplot(x=\"Dataset\", y=\"F1\", hue=\"Model\", data=df, errorbar=None)\n", + "\n", + "# Add a title and labels\n", + "ax.set_title(\"Best F1 for StringDB\")\n", + "ax.set_ylabel(\"F1 Score\")\n", + "\n", + "# Show legend and plot\n", + "plt.legend(title=\"Model\")\n", + "# plt.savefig('figure_pngs/f1_stringdb.png', dpi=600, bbox_inches='tight')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "e456118c-735f-40aa-b5b8-7f405dfaf23d", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# CORUM best F1 (row 1 of each model's F1 table)\n", + "data = {\n", + " \"Inhouse\": {\n", + " \"PertSpectra\": pertspectra_f1_inhouse[[\"F1\"]].iloc[1].values[0],\n", + " \"scETM\": scETM_f1_inhouse[[\"F1\"]].iloc[1].values[0],\n", + " \"GSFA\": gsfa_f1_inhouse[[\"F1\"]].iloc[1].values[0],\n", + " },\n", + " \"Norman\": {\n", + " \"PertSpectra\": pertspectra_f1_norman[[\"F1\"]].iloc[1].values[0],\n", + " \"scETM\": scETM_f1_norman[[\"F1\"]].iloc[1].values[0],\n", + " \"GSFA\": gsfa_f1_norman[[\"F1\"]].iloc[1].values[0],\n", + " },\n", + " \"Replogle\": {\n", + " \"PertSpectra\": pertspectra_f1_replogle[[\"F1\"]].iloc[1].values[0],\n", + " \"scETM\": scETM_f1_replogle[[\"F1\"]].iloc[1].values[0],\n", + " \"GSFA\": 0,\n", + " },\n", + "}\n", + "\n", + "datasets = data.keys()\n", + "model_labels = [\"PertSpectra\", \"scETM\", \"GSFA\"]\n", + "\n", + "# Prepare data in long-form for Seaborn\n", + "records = []\n", + "for dataset in datasets:\n", + " for model in model_labels:\n", + " records.append({\"Dataset\": dataset, \"Model\": model, \"F1\": data[dataset][model]})\n", + "\n", + "df = pd.DataFrame(records)\n", + "\n", + "# Initialize the matplotlib figure\n", + "plt.figure(figsize=(8, 6))\n", + "\n", + "# Plot with Seaborn\n", + "sns.set(style=\"whitegrid\")\n", + "ax = sns.barplot(x=\"Dataset\", y=\"F1\", hue=\"Model\", data=df, errorbar=None)\n", + "\n", + "# Add a title and labels\n", + "ax.set_title(\"Best F1 for CORUM\")\n", + "ax.set_ylabel(\"F1 Score\")\n", + "\n", + "# Show legend and plot\n", + "plt.legend(title=\"Model\")\n", + "# plt.savefig('figure_pngs/f1_corum.png', dpi=600, bbox_inches='tight')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4f3608bd-815f-4efc-acc3-1769ea75b18f", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3
(ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/figures/summary_radar_plot.ipynb b/figures/summary_radar_plot.ipynb new file mode 100644 index 0000000..a1f17a4 --- /dev/null +++ b/figures/summary_radar_plot.ipynb @@ -0,0 +1,1852 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "d93d54a2", + "metadata": {}, + "outputs": [], + "source": [ + "from os import listdir\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import plotly.express as px" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d9d985c5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "download: s3://pert-spectra/figures/pert_embedding_cluster_gprofiler/pertspectra_inhouse_pert_emb_gprofiler.pickle to pert_embedding_cluster_gprofiler/pertspectra_inhouse_pert_emb_gprofiler.pickle\n", + "download: s3://pert-spectra/figures/pert_embedding_cluster_gprofiler/pertspectra_norman_pert_emb_gprofiler.pickle to pert_embedding_cluster_gprofiler/pertspectra_norman_pert_emb_gprofiler.pickle\n", + "download: s3://pert-spectra/figures/factor_enrichments/GSFA_norman_factor_enrichment.pickle to factor_enrichments/GSFA_norman_factor_enrichment.pickle\n", + "download: s3://pert-spectra/figures/pert_embedding_cluster_gprofiler/GSFA_norman_pert_emb_gprofiler.pickle to pert_embedding_cluster_gprofiler/GSFA_norman_pert_emb_gprofiler.pickle\n", + "download: s3://pert-spectra/figures/factor_enrichments/GSFA_inhouse_factor_enrichment.pickle to factor_enrichments/GSFA_inhouse_factor_enrichment.pickle\n", + "download: 
s3://pert-spectra/figures/pert_embedding_cluster_gprofiler/GSFA_inhouse_pert_emb_gprofiler.pickle to pert_embedding_cluster_gprofiler/GSFA_inhouse_pert_emb_gprofiler.pickle\n", + "download: s3://pert-spectra/figures/pert_embedding_recall/GSFA_inhouse_aucpr.csv to pert_embedding_recall/GSFA_inhouse_aucpr.csv\n", + "download: s3://pert-spectra/figures/pert_embedding_recall/GSFA_inhouse_f1.csv to pert_embedding_recall/GSFA_inhouse_f1.csv\n", + "download: s3://pert-spectra/figures/pert_embedding_recall/GSFA_norman_f1.csv to pert_embedding_recall/GSFA_norman_f1.csv\n", + "download: s3://pert-spectra/figures/pert_embedding_cluster_gprofiler/scETM_inhouse_pert_emb_gprofiler.pickle to pert_embedding_cluster_gprofiler/scETM_inhouse_pert_emb_gprofiler.pickle\n", + "download: s3://pert-spectra/figures/pert_embedding_recall/GSFA_norman_aucpr.csv to pert_embedding_recall/GSFA_norman_aucpr.csv\n", + "download: s3://pert-spectra/figures/pert_embedding_recall/pertspectra_inhouse_aucpr.csv to pert_embedding_recall/pertspectra_inhouse_aucpr.csv\n", + "download: s3://pert-spectra/figures/pert_embedding_recall/pertspectra_inhouse_f1.csv to pert_embedding_recall/pertspectra_inhouse_f1.csv\n", + "download: s3://pert-spectra/figures/pert_embedding_recall/pertspectra_norman_f1.csv to pert_embedding_recall/pertspectra_norman_f1.csv\n", + "download: s3://pert-spectra/figures/pert_embedding_recall/pertspectra_replogle_aucpr.csv to pert_embedding_recall/pertspectra_replogle_aucpr.csv\n", + "download: s3://pert-spectra/figures/pert_embedding_recall/pertspectra_replogle_f1.csv to pert_embedding_recall/pertspectra_replogle_f1.csv\n", + "download: s3://pert-spectra/figures/pert_embedding_cluster_gprofiler/pertspectra_replogle_pert_emb_gprofiler.pickle to pert_embedding_cluster_gprofiler/pertspectra_replogle_pert_emb_gprofiler.pickle\n", + "download: s3://pert-spectra/figures/pert_embedding_recall/pertspectra_norman_aucpr.csv to pert_embedding_recall/pertspectra_norman_aucpr.csv\n", + "download: 
s3://pert-spectra/figures/pert_embedding_recall/scETM_inhouse_aucpr.csv to pert_embedding_recall/scETM_inhouse_aucpr.csv\n", + "download: s3://pert-spectra/figures/pert_embedding_recall/scETM_inhouse_f1.csv to pert_embedding_recall/scETM_inhouse_f1.csv\n", + "download: s3://pert-spectra/figures/pert_embedding_recall/scETM_norman_f1.csv to pert_embedding_recall/scETM_norman_f1.csv\n", + "download: s3://pert-spectra/figures/pert_embedding_recall/scETM_norman_aucpr.csv to pert_embedding_recall/scETM_norman_aucpr.csv\n", + "download: s3://pert-spectra/figures/pert_embedding_recall/scETM_replogle_aucpr.csv to pert_embedding_recall/scETM_replogle_aucpr.csv\n", + "download: s3://pert-spectra/figures/pert_embedding_recall/scETM_replogle_f1.csv to pert_embedding_recall/scETM_replogle_f1.csv\n", + "download: s3://pert-spectra/figures/pert_embedding_cluster_gprofiler/scETM_norman_pert_emb_gprofiler.pickle to pert_embedding_cluster_gprofiler/scETM_norman_pert_emb_gprofiler.pickle\n", + "download: s3://pert-spectra/figures/process_recovery_hypergeo_pvals/GSFA_norman_hypergeo_neighbors_recovery_pvalues.csv to process_recovery_hypergeo_pvals/GSFA_norman_hypergeo_neighbors_recovery_pvalues.csv\n", + "download: s3://pert-spectra/figures/process_recovery_hypergeo_pvals/pertspectra_inhouse_hypergeo_neighbors_recovery_pvalues.csv to process_recovery_hypergeo_pvals/pertspectra_inhouse_hypergeo_neighbors_recovery_pvalues.csv\n", + "download: s3://pert-spectra/figures/process_recovery_hypergeo_pvals/pertspectra_norman_hypergeo_neighbors_recovery_pvalues.csv to process_recovery_hypergeo_pvals/pertspectra_norman_hypergeo_neighbors_recovery_pvalues.csv\n", + "download: s3://pert-spectra/figures/process_recovery_hypergeo_pvals/GSFA_inhouse_hypergeo_neighbors_recovery_pvalues.csv to process_recovery_hypergeo_pvals/GSFA_inhouse_hypergeo_neighbors_recovery_pvalues.csv\n", + "download: 
s3://pert-spectra/figures/process_recovery_hypergeo_pvals/pertspectra_replogle_hypergeo_neighbors_recovery_pvalues.csv to process_recovery_hypergeo_pvals/pertspectra_replogle_hypergeo_neighbors_recovery_pvalues.csv\n", + "download: s3://pert-spectra/figures/pert_embedding_cluster_gprofiler/scETM_replogle_pert_emb_gprofiler.pickle to pert_embedding_cluster_gprofiler/scETM_replogle_pert_emb_gprofiler.pickle\n", + "download: s3://pert-spectra/figures/reconstruction_spearmans/GSFA_inhouse_spearman_correlations.csv to reconstruction_spearmans/GSFA_inhouse_spearman_correlations.csv\n", + "download: s3://pert-spectra/figures/reconstruction_spearmans/scETM_inhouse_spearman_correlations.csv to reconstruction_spearmans/scETM_inhouse_spearman_correlations.csv\n", + "download: s3://pert-spectra/figures/factor_enrichments/pertspectra_inhouse_factor_enrichment.pickle to factor_enrichments/pertspectra_inhouse_factor_enrichment.pickle\n", + "download: s3://pert-spectra/figures/reconstruction_spearmans/pertspectra_inhouse_spearman_correlations.csv to reconstruction_spearmans/pertspectra_inhouse_spearman_correlations.csv\n", + "download: s3://pert-spectra/figures/reconstruction_spearmans/GSFA_norman_spearman_correlations.csv to reconstruction_spearmans/GSFA_norman_spearman_correlations.csv\n", + "download: s3://pert-spectra/figures/runtime/.ipynb_checkpoints/time_complexity_cells-checkpoint.txt to runtime/.ipynb_checkpoints/time_complexity_cells-checkpoint.txt\n", + "download: s3://pert-spectra/figures/reconstruction_spearmans/scETM_norman_spearman_correlations.csv to reconstruction_spearmans/scETM_norman_spearman_correlations.csv\n", + "download: s3://pert-spectra/figures/reconstruction_spearmans/pertspectra_replogle_spearman_correlations.csv to reconstruction_spearmans/pertspectra_replogle_spearman_correlations.csv\n", + "download: s3://pert-spectra/figures/runtime/.ipynb_checkpoints/time_complexity_perts-checkpoint.txt to 
runtime/.ipynb_checkpoints/time_complexity_perts-checkpoint.txt\n", + "download: s3://pert-spectra/figures/factor_enrichments/pertspectra_norman_factor_enrichment.pickle to factor_enrichments/pertspectra_norman_factor_enrichment.pickle\n", + "download: s3://pert-spectra/figures/factor_enrichments/pertspectra_replogle_factor_enrichment.pickle to factor_enrichments/pertspectra_replogle_factor_enrichment.pickle\n", + "download: s3://pert-spectra/figures/reconstruction_spearmans/scETM_replogle_spearman_correlations.csv to reconstruction_spearmans/scETM_replogle_spearman_correlations.csv\n", + "download: s3://pert-spectra/figures/runtime/time_complexity_cells.txt to runtime/time_complexity_cells.txt\n", + "download: s3://pert-spectra/figures/reconstruction_spearmans/pertspectra_norman_spearman_correlations.csv to reconstruction_spearmans/pertspectra_norman_spearman_correlations.csv\n" + ] + } + ], + "source": [ + "!aws s3 cp --recursive s3://pert-spectra/figures/ . " + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "88d496c9", + "metadata": {}, + "outputs": [], + "source": [ + "reconstruction_spearman_files = listdir(\"reconstruction_spearmans/\")\n", + "reconstruction_spearman_dict = {}\n", + "for fname in reconstruction_spearman_files:\n", + " method = fname.split(\"_\")[0]\n", + " if method not in reconstruction_spearman_dict:\n", + " reconstruction_spearman_dict[method] = {}\n", + " dataset = fname.split(\"_\")[1]\n", + " perf = np.mean(\n", + " [\n", + " float(i)\n", + " for i in open(f\"reconstruction_spearmans/{fname}\", \"r\")\n", + " .read()\n", + " .strip()\n", + " .split(\"\\n\")\n", + " ]\n", + " )\n", + " reconstruction_spearman_dict[method][dataset] = perf\n", + "spearman_df = pd.DataFrame.from_dict(reconstruction_spearman_dict).transpose()\n", + "spearman_df.reset_index(inplace=True)\n", + "spearman_df.rename(columns={\"index\": \"model\"}, inplace=True)\n", + "spearman_df = spearman_df.melt(\"model\", var_name=\"Dataset\", 
value_name=\"Spearman\")\n", + "spearman_df.loc[spearman_df[\"model\"] == \"pertspectra\", \"model\"] = \"PertSpectra\"" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "59db6493", + "metadata": {}, + "outputs": [], + "source": [ + "runtimes = pd.read_csv(\"runtime/time_complexity_cells.txt\", header=0, sep=\",\")\n", + "runtimes = runtimes.loc[runtimes[\"cells\"] == 10000]\n", + "runtimes[\"time\"] = -1 * np.log10(runtimes[\"time\"])\n", + "runtimes[\"Dataset\"] = \"norman\"\n", + "runtimes2 = runtimes.copy()\n", + "runtimes2[\"Dataset\"] = \"replogle\"\n", + "runtimes3 = runtimes.copy()\n", + "runtimes3[\"Dataset\"] = \"inhouse\"\n", + "runtimes = pd.concat([runtimes, runtimes2, runtimes3])\n", + "runtimes.drop(columns=[\"cells\", \"perturbations\"], inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "2e29c05a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
modeltimeDataset
3GSFA-4.137166norman
7scETM-2.995324norman
11PertSpectra-2.718502norman
3GSFA-4.137166replogle
7scETM-2.995324replogle
11PertSpectra-2.718502replogle
3GSFA-4.137166inhouse
7scETM-2.995324inhouse
11PertSpectra-2.718502inhouse
\n", + "
" + ], + "text/plain": [ + " model time Dataset\n", + "3 GSFA -4.137166 norman\n", + "7 scETM -2.995324 norman\n", + "11 PertSpectra -2.718502 norman\n", + "3 GSFA -4.137166 replogle\n", + "7 scETM -2.995324 replogle\n", + "11 PertSpectra -2.718502 replogle\n", + "3 GSFA -4.137166 inhouse\n", + "7 scETM -2.995324 inhouse\n", + "11 PertSpectra -2.718502 inhouse" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "runtimes" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "3deb102a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 0 1\n", + "0 ATP6AP1 1.000000\n", + "1 RPL30 1.000000\n", + "2 DDX21 0.000379\n", + "3 PHB 0.025197\n", + "4 DMAP1 0.100606\n", + " 0 1\n", + "0 AHR 2.638282e-11\n", + "1 ARID1A 1.923624e-01\n", + "2 BAK1 1.453943e-04\n", + "3 BCL2L11 6.373444e-05\n", + "4 CBL 2.470848e-04\n", + " 0 1\n", + "0 AHR 7.461950e-07\n", + "1 ARID1A 6.365297e-06\n", + "2 BAK1 8.785246e-04\n", + "3 BCL2L11 1.197726e-02\n", + "4 CBL 2.916654e-04\n", + " 0 1\n", + "0 RIPK1 0.000001\n", + "1 MAP3K7 1.000000\n", + "2 IKBKB 0.000002\n", + "3 SKP1 1.000000\n", + "4 BIRC3 0.014625\n", + " 0 1\n", + "0 SKP2 5.679505e-05\n", + "1 NFKBIA 1.931082e-08\n", + "2 TRAF2 1.415568e-15\n", + "3 BIRC2 1.293259e-16\n", + "4 TAB1 7.861270e-06\n" + ] + } + ], + "source": [ + "reconstruction_hypergeo_files = listdir(\"process_recovery_hypergeo_pvals/\")\n", + "reconstruction_hypergeo_dict = {}\n", + "for fname in reconstruction_hypergeo_files:\n", + " method = fname.split(\"_\")[0]\n", + " if method not in reconstruction_hypergeo_dict:\n", + " reconstruction_hypergeo_dict[method] = {}\n", + " dataset = fname.split(\"_\")[1]\n", + " perf = pd.read_csv(f\"process_recovery_hypergeo_pvals/{fname}\", header=None, sep=\",\")\n", + " print(perf.head())\n", + " perf[1] = -1 * np.log10(perf[1])\n", + " perf = np.mean(perf[1])\n", + " 
reconstruction_hypergeo_dict[method][dataset] = perf\n", + "hypergeo_df = pd.DataFrame.from_dict(reconstruction_hypergeo_dict).transpose()\n", + "hypergeo_df.reset_index(inplace=True)\n", + "hypergeo_df.rename(columns={\"index\": \"model\"}, inplace=True)\n", + "hypergeo_df = hypergeo_df.melt(\"model\", var_name=\"Dataset\", value_name=\"hypergeo\")\n", + "hypergeo_df.loc[hypergeo_df[\"model\"] == \"pertspectra\", \"model\"] = \"PertSpectra\"" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "49c8b694", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.2785192646688491\n", + "0.8294573643410853\n", + "0.8203125\n", + "0.6979655712050078\n", + "0.2461233916199274\n", + "0.8203125\n", + "0.6796407185628742\n", + "0.6895475819032763\n" + ] + } + ], + "source": [ + "reconstruction_f1_files = listdir(\"pert_embedding_recall_f1\")\n", + "reconstruction_f1_dict = {}\n", + "for fname in reconstruction_f1_files:\n", + " method = fname.split(\"_\")[0]\n", + " if method not in reconstruction_f1_dict:\n", + " reconstruction_f1_dict[method] = {}\n", + " dataset = fname.split(\"_\")[1]\n", + " perf = pd.read_csv(\n", + " f\"pert_embedding_recall_f1/{fname}\", header=0, index_col=0, sep=\",\"\n", + " )\n", + " perf = perf.loc[\"StringDB\", \"F1\"]\n", + " print(perf)\n", + " reconstruction_f1_dict[method][dataset] = perf\n", + "f1_df = pd.DataFrame.from_dict(reconstruction_f1_dict).transpose()\n", + "f1_df.reset_index(inplace=True)\n", + "f1_df.rename(columns={\"index\": \"model\"}, inplace=True)\n", + "f1_df = f1_df.melt(\"model\", var_name=\"Dataset\", value_name=\"F1\")\n", + "f1_df.loc[f1_df[\"model\"] == \"pertspectra\", \"model\"] = \"PertSpectra\"" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "2b55a0b4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.7530475544691059\n", + "0.685748863225066\n", + "0.215161882377014\n", + 
"0.2525738309904994\n", + "0.7260584258594132\n", + "0.7327761570507714\n", + "0.675809046108318\n", + "0.5736430637940849\n" + ] + } + ], + "source": [ + "reconstruction_aucpr_files = listdir(\"pert_embedding_recall_aucpr\")\n", + "reconstruction_aucpr_dict = {}\n", + "for fname in reconstruction_aucpr_files:\n", + " method = fname.split(\"_\")[0]\n", + " if method not in reconstruction_aucpr_dict:\n", + " reconstruction_aucpr_dict[method] = {}\n", + " dataset = fname.split(\"_\")[1]\n", + " perf = pd.read_csv(\n", + " f\"pert_embedding_recall_aucpr/{fname}\", header=0, index_col=0, sep=\",\"\n", + " )\n", + " perf = perf.loc[\"StringDB\", \"AUC\"]\n", + " print(perf)\n", + " reconstruction_aucpr_dict[method][dataset] = perf\n", + "aucpr_df = pd.DataFrame.from_dict(reconstruction_aucpr_dict).transpose()\n", + "aucpr_df.reset_index(inplace=True)\n", + "aucpr_df.rename(columns={\"index\": \"model\"}, inplace=True)\n", + "aucpr_df = aucpr_df.melt(\"model\", var_name=\"Dataset\", value_name=\"auPRC\")\n", + "aucpr_df.loc[aucpr_df[\"model\"] == \"pertspectra\", \"model\"] = \"PertSpectra\"" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "59c55378", + "metadata": {}, + "outputs": [], + "source": [ + "# combine the metrics into 1 df\n", + "all_perf = (\n", + " spearman_df.merge(runtimes, on=[\"model\", \"Dataset\"], how=\"outer\")\n", + " .merge(hypergeo_df, on=[\"model\", \"Dataset\"], how=\"outer\")\n", + " .merge(f1_df, on=[\"model\", \"Dataset\"], how=\"outer\")\n", + " .merge(aucpr_df, on=[\"model\", \"Dataset\"], how=\"outer\")\n", + ")\n", + "all_perf.rename(\n", + " columns={\n", + " \"model\": \"Model\",\n", + " \"Spearman\": \"Spearman R\",\n", + " \"time\": \"Runtime -log10(sec)
21 perts,10K cells\",\n", + " \"hypergeo\": \"Recovery Known Biological Functions
Hypergeometric Test -log10(FDR)\",\n", + " \"F1\": \"Recall:F1\",\n", + " \"auPRC\": \"Recall:auPRC\",\n", + " },\n", + " inplace=True,\n", + ")\n", + "all_perf.loc[all_perf[\"Dataset\"] == \"norman\", \"Dataset\"] = \"Norman\"\n", + "all_perf.loc[all_perf[\"Dataset\"] == \"inhouse\", \"Dataset\"] = \"Inhouse\"\n", + "all_perf.loc[all_perf[\"Dataset\"] == \"replogle\", \"Dataset\"] = \"Replogle\"\n", + "\n", + "for column in all_perf.columns[2::]:\n", + " all_perf[column] = (all_perf[column] - all_perf[column].min()) / (\n", + " all_perf[column].max() - all_perf[column].min()\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "a705ce3a", + "metadata": {}, + "outputs": [], + "source": [ + "all_perf.fillna(0, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "9a8f2cf6", + "metadata": {}, + "outputs": [], + "source": [ + "all_perf_melted = all_perf.melt(id_vars=[\"Model\", \"Dataset\"], var_name=\"Metric\")" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "d964f716", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.plotly.v1+json": { + "config": { + "plotlyServerURL": "https://plot.ly" + }, + "data": [ + { + "hovertemplate": "Model=GSFA
Dataset=Inhouse
value=%{r}
Metric=%{theta}", + "legendgroup": "GSFA, Inhouse", + "line": { + "color": "#ff0000", + "dash": "solid" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines", + "name": "GSFA, Inhouse", + "r": [ + 0.011130752463737357, + 0, + 0.0990448741980258, + 0.7601549215706523, + 0.8748829077705612, + 0.011130752463737357 + ], + "showlegend": true, + "subplot": "polar", + "theta": [ + "Spearman R", + "Runtime -log10(sec)
21 perts,10K cells", + "Recovery Known Biological Functions
Hypergeometric Test -log10(FDR)", + "Recall:F1", + "Recall:auPRC", + "Spearman R" + ], + "type": "scatterpolar" + }, + { + "hovertemplate": "Model=GSFA
Dataset=Norman
value=%{r}
Metric=%{theta}", + "legendgroup": "GSFA, Norman", + "line": { + "color": "#ff0000", + "dash": "dot" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines", + "name": "GSFA, Norman", + "r": [ + 0, + 0, + 0.24331602055827575, + 0.9843231068843358, + 0.9623128139117564, + 0 + ], + "showlegend": true, + "subplot": "polar", + "theta": [ + "Spearman R", + "Runtime -log10(sec)
21 perts,10K cells", + "Recovery Known Biological Functions
Hypergeometric Test -log10(FDR)", + "Recall:F1", + "Recall:auPRC", + "Spearman R" + ], + "type": "scatterpolar" + }, + { + "hovertemplate": "Model=GSFA
Dataset=Replogle
value=%{r}
Metric=%{theta}", + "legendgroup": "GSFA, Replogle", + "line": { + "color": "#ff0000", + "dash": "dash" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines", + "name": "GSFA, Replogle", + "r": [ + 0, + 0, + 0, + 0, + 0, + 0 + ], + "showlegend": true, + "subplot": "polar", + "theta": [ + "Spearman R", + "Runtime -log10(sec)
21 perts,10K cells", + "Recovery Known Biological Functions
Hypergeometric Test -log10(FDR)", + "Recall:F1", + "Recall:auPRC", + "Spearman R" + ], + "type": "scatterpolar" + }, + { + "hovertemplate": "Model=PertSpectra
Dataset=Inhouse
value=%{r}
Metric=%{theta}", + "legendgroup": "PertSpectra, Inhouse", + "line": { + "color": "#00ff00", + "dash": "solid" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines", + "name": "PertSpectra, Inhouse", + "r": [ + 1, + 1, + 1, + 0.7745857445560906, + 0.856403484293659, + 1 + ], + "showlegend": true, + "subplot": "polar", + "theta": [ + "Spearman R", + "Runtime -log10(sec)
21 perts,10K cells", + "Recovery Known Biological Functions
Hypergeometric Test -log10(FDR)", + "Recall:F1", + "Recall:auPRC", + "Spearman R" + ], + "type": "scatterpolar" + }, + { + "hovertemplate": "Model=PertSpectra
Dataset=Norman
value=%{r}
Metric=%{theta}", + "legendgroup": "PertSpectra, Norman", + "line": { + "color": "#00ff00", + "dash": "dot" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines", + "name": "PertSpectra, Norman", + "r": [ + 0.9934537881913026, + 1, + 0.5005301087955898, + 1, + 1, + 0.9934537881913026 + ], + "showlegend": true, + "subplot": "polar", + "theta": [ + "Spearman R", + "Runtime -log10(sec)
21 perts,10K cells", + "Recovery Known Biological Functions
Hypergeometric Test -log10(FDR)", + "Recall:F1", + "Recall:auPRC", + "Spearman R" + ], + "type": "scatterpolar" + }, + { + "hovertemplate": "Model=PertSpectra
Dataset=Replogle
value=%{r}
Metric=%{theta}", + "legendgroup": "PertSpectra, Replogle", + "line": { + "color": "#00ff00", + "dash": "dash" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines", + "name": "PertSpectra, Replogle", + "r": [ + 0.9714246276290128, + 1, + 0, + 0, + 0, + 0.9714246276290128 + ], + "showlegend": true, + "subplot": "polar", + "theta": [ + "Spearman R", + "Runtime -log10(sec)
21 perts,10K cells", + "Recovery Known Biological Functions
Hypergeometric Test -log10(FDR)", + "Recall:F1", + "Recall:auPRC", + "Spearman R" + ], + "type": "scatterpolar" + }, + { + "hovertemplate": "Model=scETM
Dataset=Inhouse
value=%{r}
Metric=%{theta}", + "legendgroup": "scETM, Inhouse", + "line": { + "color": "#0000ff", + "dash": "solid" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines", + "name": "scETM, Inhouse", + "r": [ + 0.9929171151899834, + 0.8048713983489332, + 0, + 0.7431717458879673, + 0.666463525646199, + 0.9929171151899834 + ], + "showlegend": true, + "subplot": "polar", + "theta": [ + "Spearman R", + "Runtime -log10(sec)
21 perts,10K cells", + "Recovery Known Biological Functions
Hypergeometric Test -log10(FDR)", + "Recall:F1", + "Recall:auPRC", + "Spearman R" + ], + "type": "scatterpolar" + }, + { + "hovertemplate": "Model=scETM
Dataset=Norman
value=%{r}
Metric=%{theta}", + "legendgroup": "scETM, Norman", + "line": { + "color": "#0000ff", + "dash": "dot" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines", + "name": "scETM, Norman", + "r": [ + 0.9925487919516428, + 0.8048713983489332, + 0, + 0.9843231068843358, + 0.9498236706980512, + 0.9925487919516428 + ], + "showlegend": true, + "subplot": "polar", + "theta": [ + "Spearman R", + "Runtime -log10(sec)
21 perts,10K cells", + "Recovery Known Biological Functions
Hypergeometric Test -log10(FDR)", + "Recall:F1", + "Recall:auPRC", + "Spearman R" + ], + "type": "scatterpolar" + }, + { + "hovertemplate": "Model=scETM
Dataset=Replogle
value=%{r}
Metric=%{theta}", + "legendgroup": "scETM, Replogle", + "line": { + "color": "#0000ff", + "dash": "dash" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines", + "name": "scETM, Replogle", + "r": [ + 0.8292982343569246, + 0.8048713983489332, + 0, + 0.05553572149724144, + 0.06955371848439953, + 0.8292982343569246 + ], + "showlegend": true, + "subplot": "polar", + "theta": [ + "Spearman R", + "Runtime -log10(sec)
21 perts,10K cells", + "Recovery Known Biological Functions
Hypergeometric Test -log10(FDR)", + "Recall:F1", + "Recall:auPRC", + "Spearman R" + ], + "type": "scatterpolar" + } + ], + "layout": { + "legend": { + "title": { + "text": "Model, Dataset" + }, + "tracegroupgap": 0 + }, + "margin": { + "t": 60 + }, + "polar": { + "angularaxis": { + "direction": "counterclockwise", + "rotation": 45 + }, + "domain": { + "x": [ + 0, + 1 + ], + "y": [ + 0, + 1 + ] + } + }, + "template": { + "data": { + "bar": [ + { + "error_x": { + "color": "#2a3f5f" + }, + "error_y": { + "color": "#2a3f5f" + }, + "marker": { + "line": { + "color": "white", + "width": 0.5 + }, + "pattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + } + }, + "type": "bar" + } + ], + "barpolar": [ + { + "marker": { + "line": { + "color": "white", + "width": 0.5 + }, + "pattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + } + }, + "type": "barpolar" + } + ], + "carpet": [ + { + "aaxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "#C8D4E3", + "linecolor": "#C8D4E3", + "minorgridcolor": "#C8D4E3", + "startlinecolor": "#2a3f5f" + }, + "baxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "#C8D4E3", + "linecolor": "#C8D4E3", + "minorgridcolor": "#C8D4E3", + "startlinecolor": "#2a3f5f" + }, + "type": "carpet" + } + ], + "choropleth": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "choropleth" + } + ], + "contour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "contour" + } + ], + "contourcarpet": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": 
"contourcarpet" + } + ], + "heatmap": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmap" + } + ], + "heatmapgl": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmapgl" + } + ], + "histogram": [ + { + "marker": { + "pattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + } + }, + "type": "histogram" + } + ], + "histogram2d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2d" + } + ], + "histogram2dcontour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + 
"#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2dcontour" + } + ], + "mesh3d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "mesh3d" + } + ], + "parcoords": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "parcoords" + } + ], + "pie": [ + { + "automargin": true, + "type": "pie" + } + ], + "scatter": [ + { + "fillpattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + }, + "type": "scatter" + } + ], + "scatter3d": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatter3d" + } + ], + "scattercarpet": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattercarpet" + } + ], + "scattergeo": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergeo" + } + ], + "scattergl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergl" + } + ], + "scattermapbox": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattermapbox" + } + ], + "scatterpolar": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolar" + } + ], + "scatterpolargl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolargl" + } + ], + "scatterternary": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterternary" + } + ], + "surface": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + 
], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "surface" + } + ], + "table": [ + { + "cells": { + "fill": { + "color": "#EBF0F8" + }, + "line": { + "color": "white" + } + }, + "header": { + "fill": { + "color": "#C8D4E3" + }, + "line": { + "color": "white" + } + }, + "type": "table" + } + ] + }, + "layout": { + "annotationdefaults": { + "arrowcolor": "#2a3f5f", + "arrowhead": 0, + "arrowwidth": 1 + }, + "autotypenumbers": "strict", + "coloraxis": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "colorscale": { + "diverging": [ + [ + 0, + "#8e0152" + ], + [ + 0.1, + "#c51b7d" + ], + [ + 0.2, + "#de77ae" + ], + [ + 0.3, + "#f1b6da" + ], + [ + 0.4, + "#fde0ef" + ], + [ + 0.5, + "#f7f7f7" + ], + [ + 0.6, + "#e6f5d0" + ], + [ + 0.7, + "#b8e186" + ], + [ + 0.8, + "#7fbc41" + ], + [ + 0.9, + "#4d9221" + ], + [ + 1, + "#276419" + ] + ], + "sequential": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "sequentialminus": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ] + }, + 
"colorway": [ + "#636efa", + "#EF553B", + "#00cc96", + "#ab63fa", + "#FFA15A", + "#19d3f3", + "#FF6692", + "#B6E880", + "#FF97FF", + "#FECB52" + ], + "font": { + "color": "#2a3f5f" + }, + "geo": { + "bgcolor": "white", + "lakecolor": "white", + "landcolor": "white", + "showlakes": true, + "showland": true, + "subunitcolor": "#C8D4E3" + }, + "hoverlabel": { + "align": "left" + }, + "hovermode": "closest", + "mapbox": { + "style": "light" + }, + "paper_bgcolor": "white", + "plot_bgcolor": "white", + "polar": { + "angularaxis": { + "gridcolor": "#EBF0F8", + "linecolor": "#EBF0F8", + "ticks": "" + }, + "bgcolor": "white", + "radialaxis": { + "gridcolor": "#EBF0F8", + "linecolor": "#EBF0F8", + "ticks": "" + } + }, + "scene": { + "xaxis": { + "backgroundcolor": "white", + "gridcolor": "#DFE8F3", + "gridwidth": 2, + "linecolor": "#EBF0F8", + "showbackground": true, + "ticks": "", + "zerolinecolor": "#EBF0F8" + }, + "yaxis": { + "backgroundcolor": "white", + "gridcolor": "#DFE8F3", + "gridwidth": 2, + "linecolor": "#EBF0F8", + "showbackground": true, + "ticks": "", + "zerolinecolor": "#EBF0F8" + }, + "zaxis": { + "backgroundcolor": "white", + "gridcolor": "#DFE8F3", + "gridwidth": 2, + "linecolor": "#EBF0F8", + "showbackground": true, + "ticks": "", + "zerolinecolor": "#EBF0F8" + } + }, + "shapedefaults": { + "line": { + "color": "#2a3f5f" + } + }, + "ternary": { + "aaxis": { + "gridcolor": "#DFE8F3", + "linecolor": "#A2B1C6", + "ticks": "" + }, + "baxis": { + "gridcolor": "#DFE8F3", + "linecolor": "#A2B1C6", + "ticks": "" + }, + "bgcolor": "white", + "caxis": { + "gridcolor": "#DFE8F3", + "linecolor": "#A2B1C6", + "ticks": "" + } + }, + "title": { + "x": 0.05 + }, + "xaxis": { + "automargin": true, + "gridcolor": "#EBF0F8", + "linecolor": "#EBF0F8", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "#EBF0F8", + "zerolinewidth": 2 + }, + "yaxis": { + "automargin": true, + "gridcolor": "#EBF0F8", + "linecolor": "#EBF0F8", + "ticks": "", + "title": { + 
"standoff": 15 + }, + "zerolinecolor": "#EBF0F8", + "zerolinewidth": 2 + } + } + } + } + }, + "text/html": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df = all_perf_melted\n", + "fig = px.line_polar(\n", + " df,\n", + " r=\"value\",\n", + " theta=\"Metric\",\n", + " color=\"Model\",\n", + " line_dash=\"Dataset\",\n", + " line_close=True,\n", + " color_discrete_sequence=[\"#ff0000\", \"#00ff00\", \"#0000ff\"],\n", + " direction=\"counterclockwise\",\n", + " start_angle=45,\n", + " template=\"plotly_white\",\n", + ")\n", + "fig.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "c00ef03f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Help on function line_polar in module plotly.express._chart_types:\n", + "\n", + "line_polar(data_frame=None, r=None, theta=None, color=None, line_dash=None, hover_name=None, hover_data=None, custom_data=None, line_group=None, text=None, symbol=None, animation_frame=None, animation_group=None, category_orders=None, labels=None, color_discrete_sequence=None, color_discrete_map=None, line_dash_sequence=None, line_dash_map=None, symbol_sequence=None, symbol_map=None, markers=False, direction='clockwise', start_angle=90, line_close=False, line_shape=None, render_mode='auto', range_r=None, range_theta=None, log_r=False, title=None, template=None, width=None, height=None) -> plotly.graph_objs._figure.Figure\n", + " In a polar line plot, each row of `data_frame` is represented as vertex\n", + " of a polyline mark in polar coordinates.\n", + " \n", + " Parameters\n", + " ----------\n", + " data_frame: DataFrame or array-like or dict\n", + " This argument needs to be passed for column names (and not keyword\n", + " names) to be used. Array-like and dict are transformed internally to a\n", + " pandas DataFrame. 
Optional: if missing, a DataFrame gets constructed\n", + " under the hood using the other arguments.\n", + " r: str or int or Series or array-like\n", + " Either a name of a column in `data_frame`, or a pandas Series or\n", + " array_like object. Values from this column or array_like are used to\n", + " position marks along the radial axis in polar coordinates.\n", + " theta: str or int or Series or array-like\n", + " Either a name of a column in `data_frame`, or a pandas Series or\n", + " array_like object. Values from this column or array_like are used to\n", + " position marks along the angular axis in polar coordinates.\n", + " color: str or int or Series or array-like\n", + " Either a name of a column in `data_frame`, or a pandas Series or\n", + " array_like object. Values from this column or array_like are used to\n", + " assign color to marks.\n", + " line_dash: str or int or Series or array-like\n", + " Either a name of a column in `data_frame`, or a pandas Series or\n", + " array_like object. Values from this column or array_like are used to\n", + " assign dash-patterns to lines.\n", + " hover_name: str or int or Series or array-like\n", + " Either a name of a column in `data_frame`, or a pandas Series or\n", + " array_like object. 
Values from this column or array_like appear in bold\n", + " in the hover tooltip.\n", + " hover_data: str, or list of str or int, or Series or array-like, or dict\n", + " Either a name or list of names of columns in `data_frame`, or pandas\n", + " Series, or array_like objects or a dict with column names as keys, with\n", + " values True (for default formatting) False (in order to remove this\n", + " column from hover information), or a formatting string, for example\n", + " ':.3f' or '|%a' or list-like data to appear in the hover tooltip or\n", + " tuples with a bool or formatting string as first element, and list-like\n", + " data to appear in hover as second element Values from these columns\n", + " appear as extra data in the hover tooltip.\n", + " custom_data: str, or list of str or int, or Series or array-like\n", + " Either name or list of names of columns in `data_frame`, or pandas\n", + " Series, or array_like objects Values from these columns are extra data,\n", + " to be used in widgets or Dash callbacks for example. This data is not\n", + " user-visible but is included in events emitted by the figure (lasso\n", + " selection etc.)\n", + " line_group: str or int or Series or array-like\n", + " Either a name of a column in `data_frame`, or a pandas Series or\n", + " array_like object. Values from this column or array_like are used to\n", + " group rows of `data_frame` into lines.\n", + " text: str or int or Series or array-like\n", + " Either a name of a column in `data_frame`, or a pandas Series or\n", + " array_like object. Values from this column or array_like appear in the\n", + " figure as text labels.\n", + " symbol: str or int or Series or array-like\n", + " Either a name of a column in `data_frame`, or a pandas Series or\n", + " array_like object. 
Values from this column or array_like are used to\n", + " assign symbols to marks.\n", + " animation_frame: str or int or Series or array-like\n", + " Either a name of a column in `data_frame`, or a pandas Series or\n", + " array_like object. Values from this column or array_like are used to\n", + " assign marks to animation frames.\n", + " animation_group: str or int or Series or array-like\n", + " Either a name of a column in `data_frame`, or a pandas Series or\n", + " array_like object. Values from this column or array_like are used to\n", + " provide object-constancy across animation frames: rows with matching\n", + " `animation_group`s will be treated as if they describe the same object\n", + " in each frame.\n", + " category_orders: dict with str keys and list of str values (default `{}`)\n", + " By default, in Python 3.6+, the order of categorical values in axes,\n", + " legends and facets depends on the order in which these values are first\n", + " encountered in `data_frame` (and no order is guaranteed by default in\n", + " Python below 3.6). This parameter is used to force a specific ordering\n", + " of values per column. The keys of this dict should correspond to column\n", + " names, and the values should be lists of strings corresponding to the\n", + " specific display order desired.\n", + " labels: dict with str keys and str values (default `{}`)\n", + " By default, column names are used in the figure for axis titles, legend\n", + " entries and hovers. This parameter allows this to be overridden. The\n", + " keys of this dict should correspond to column names, and the values\n", + " should correspond to the desired label to be displayed.\n", + " color_discrete_sequence: list of str\n", + " Strings should define valid CSS-colors. 
When `color` is set and the\n", + " values in the corresponding column are not numeric, values in that\n", + " column are assigned colors by cycling through `color_discrete_sequence`\n", + " in the order described in `category_orders`, unless the value of\n", + " `color` is a key in `color_discrete_map`. Various useful color\n", + " sequences are available in the `plotly.express.colors` submodules,\n", + " specifically `plotly.express.colors.qualitative`.\n", + " color_discrete_map: dict with str keys and str values (default `{}`)\n", + " String values should define valid CSS-colors Used to override\n", + " `color_discrete_sequence` to assign a specific colors to marks\n", + " corresponding with specific values. Keys in `color_discrete_map` should\n", + " be values in the column denoted by `color`. Alternatively, if the\n", + " values of `color` are valid colors, the string `'identity'` may be\n", + " passed to cause them to be used directly.\n", + " line_dash_sequence: list of str\n", + " Strings should define valid plotly.js dash-patterns. When `line_dash`\n", + " is set, values in that column are assigned dash-patterns by cycling\n", + " through `line_dash_sequence` in the order described in\n", + " `category_orders`, unless the value of `line_dash` is a key in\n", + " `line_dash_map`.\n", + " line_dash_map: dict with str keys and str values (default `{}`)\n", + " Strings values define plotly.js dash-patterns. Used to override\n", + " `line_dash_sequences` to assign a specific dash-patterns to lines\n", + " corresponding with specific values. Keys in `line_dash_map` should be\n", + " values in the column denoted by `line_dash`. Alternatively, if the\n", + " values of `line_dash` are valid line-dash names, the string\n", + " `'identity'` may be passed to cause them to be used directly.\n", + " symbol_sequence: list of str\n", + " Strings should define valid plotly.js symbols. 
When `symbol` is set,\n", + " values in that column are assigned symbols by cycling through\n", + " `symbol_sequence` in the order described in `category_orders`, unless\n", + " the value of `symbol` is a key in `symbol_map`.\n", + " symbol_map: dict with str keys and str values (default `{}`)\n", + " String values should define plotly.js symbols Used to override\n", + " `symbol_sequence` to assign a specific symbols to marks corresponding\n", + " with specific values. Keys in `symbol_map` should be values in the\n", + " column denoted by `symbol`. Alternatively, if the values of `symbol`\n", + " are valid symbol names, the string `'identity'` may be passed to cause\n", + " them to be used directly.\n", + " markers: boolean (default `False`)\n", + " If `True`, markers are shown on lines.\n", + " direction: str\n", + " One of '`counterclockwise'` or `'clockwise'`. Default is `'clockwise'`\n", + " Sets the direction in which increasing values of the angular axis are\n", + " drawn.\n", + " start_angle: int (default `90`)\n", + " Sets start angle for the angular axis, with 0 being due east and 90\n", + " being due north.\n", + " line_close: boolean (default `False`)\n", + " If `True`, an extra line segment is drawn between the first and last\n", + " point.\n", + " line_shape: str (default `'linear'`)\n", + " One of `'linear'`, `'spline'`, `'hv'`, `'vh'`, `'hvh'`, or `'vhv'`\n", + " render_mode: str\n", + " One of `'auto'`, `'svg'` or `'webgl'`, default `'auto'` Controls the\n", + " browser API used to draw marks. `'svg'` is appropriate for figures of\n", + " less than 1000 data points, and will allow for fully-vectorized output.\n", + " `'webgl'` is likely necessary for acceptable performance above 1000\n", + " points but rasterizes part of the output. 
`'auto'` uses heuristics to\n", + " choose the mode.\n", + " range_r: list of two numbers\n", + " If provided, overrides auto-scaling on the radial axis in polar\n", + " coordinates.\n", + " range_theta: list of two numbers\n", + " If provided, overrides auto-scaling on the angular axis in polar\n", + " coordinates.\n", + " log_r: boolean (default `False`)\n", + " If `True`, the radial axis is log-scaled in polar coordinates.\n", + " title: str\n", + " The figure title.\n", + " template: str or dict or plotly.graph_objects.layout.Template instance\n", + " The figure template name (must be a key in plotly.io.templates) or\n", + " definition.\n", + " width: int (default `None`)\n", + " The figure width in pixels.\n", + " height: int (default `None`)\n", + " The figure height in pixels.\n", + " \n", + " Returns\n", + " -------\n", + " plotly.graph_objects.Figure\n", + "\n" + ] + } + ], + "source": [ + "help(px.line_polar)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c75a64d4", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pertspectra", + "language": "python", + "name": "pertspectra" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/inhouse_GSFA_metrics.ipynb b/inhouse_GSFA_metrics.ipynb new file mode 100644 index 0000000..21afd07 --- /dev/null +++ b/inhouse_GSFA_metrics.ipynb @@ -0,0 +1,632 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "b8270f66-7a27-4263-aa97-25abeb2cf34d", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2 " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "529f64f3-c83e-481d-872c-df89b9dd063b", + 
"metadata": {}, + "outputs": [], + "source": [ + "import pickle\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from utils import (\n", + " auprc,\n", + " factor_enrichment_gsea,\n", + " get_gprofiler,\n", + " inhouse_preprocess,\n", + " perturbation_signal_recovery,\n", + " read_aws_csv,\n", + " read_aws_h5ad,\n", + " read_aws_npz,\n", + " read_aws_pickle,\n", + " retrieve_stringdb_neighbors,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "39762015-fa94-4ae8-9038-405f7c56a03b", + "metadata": {}, + "outputs": [], + "source": [ + "# read learned parameters\n", + "Z = read_aws_csv(\n", + " \"s3://pert-spectra/gsfa_checkpoints/inhouse_gsfa_outputs/Z.csv\"\n", + ").to_numpy()\n", + "W = read_aws_csv(\n", + " \"s3://pert-spectra/gsfa_checkpoints/inhouse_gsfa_outputs/W.csv\"\n", + ").to_numpy()\n", + "F = read_aws_csv(\n", + " \"s3://pert-spectra/gsfa_checkpoints/inhouse_gsfa_outputs/F.csv\"\n", + ").to_numpy()\n", + "beta = read_aws_csv(\n", + " \"s3://pert-spectra/gsfa_checkpoints/inhouse_gsfa_outputs/beta.csv\"\n", + ").to_numpy()\n", + "lsfr = read_aws_csv(\n", + " \"s3://pert-spectra/gsfa_checkpoints/inhouse_gsfa_outputs/lsfr.csv\"\n", + ").to_numpy()\n", + "# read gene and perturbation information\n", + "gene_labels = read_aws_csv(\"s3://pert-spectra/gsfa_checkpoints/inhouse_top_genes.csv\")[\n", + " \"x\"\n", + "].to_numpy()\n", + "pert_labels = read_aws_npz(\"s3://pert-spectra/gsfa_checkpoints/inhouse_G_labels.npz\")[\n", + " \"arr_0\"\n", + "]\n", + "pert_labels = [x.split(\"_\")[1] for x in pert_labels[:-1]]\n", + "pert_labels = pert_labels + [\"ctrl\"]\n", + "# read inputs and preprocessed input\n", + "preprocessed_Y = read_aws_npz(\n", + " \"s3://pert-spectra/gsfa_checkpoints/inhouse_GSFA_preprocessed.npz\"\n", + ")[\"array1\"]\n", + "G = read_aws_npz(\"s3://pert-spectra/gsfa_checkpoints/inhouse_GSFA_inputs.npz\")[\"array2\"]" + ] + }, + { + "cell_type": "code", 
+ "execution_count": null, + "id": "601bbfb0-17ed-4694-850c-fcbf1fef961b", + "metadata": {}, + "outputs": [], + "source": [ + "# read in adata of raw data for reference\n", + "adata = read_aws_h5ad(\"path to raw inhouse h5ad\")\n", + "adata = adata[:, gene_labels - 1]\n", + "adata = inhouse_preprocess(adata)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a6ea3e4a-7c2f-41ba-b136-9ef9a6ab3029", + "metadata": {}, + "outputs": [], + "source": [ + "# filter adata to perturbations with at least 50 samples for each treatment\n", + "adata.obs[\"condition\"] = adata.obs[\"condition\"].astype(str)\n", + "adata.obs[\"Treatment\"] = adata.obs[\"Treatment\"].astype(str)\n", + "adata.obs[\"pert_treat\"] = adata.obs[\"condition\"] + \"+\" + adata.obs[\"Treatment\"]\n", + "obs_df = pd.DataFrame(adata.obs[\"pert_treat\"])\n", + "category_counts = obs_df[\"pert_treat\"].value_counts()\n", + "filtered_categories = category_counts[category_counts >= 50].index\n", + "adata = adata[adata.obs[\"pert_treat\"].isin(filtered_categories)]" + ] + }, + { + "cell_type": "markdown", + "id": "5b7bf773-5e0c-43b8-8df2-4ce3376be959", + "metadata": {}, + "source": [ + "# Reconstruction" + ] + }, + { + "cell_type": "markdown", + "id": "3b4b092b-2120-48c6-a492-84656e8b7169", + "metadata": {}, + "source": [ + "## Spearman Coefficient\n", + "- Spearman correlation between predicted and observed expression (on DE genes if available)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "e2885bd3-5433-47d6-88d5-47be83df5b5a", + "metadata": {}, + "outputs": [], + "source": [ + "# for GSFA, measure reconstruction on training\n", + "from scipy.stats import spearmanr\n", + "\n", + "reconstruction = Z @ W.T\n", + "# rebuild pert labels\n", + "G_labels = []\n", + "for row in G:\n", + " pert_idx = np.where(row == 1)[0].tolist()\n", + " perts = [pert_labels[i] for i in pert_idx]\n", + " G_labels.append(\"+\".join(perts))\n", + "test_corr_singles = []\n", + "test_corr_combos = 
[]\n", + "\n", + "for pert in set(G_labels):\n", + " hold_idx = [i for i, x in enumerate(G_labels) if x == pert]\n", + " if not hold_idx:\n", + " continue\n", + " recon = reconstruction[hold_idx]\n", + " # correlation\n", + " mean_reconstruction = recon.mean(axis=0)\n", + " mean_observed = preprocessed_Y[hold_idx].mean(axis=0)\n", + " if (\"+\" in pert) and (\"ctrl\" not in pert):\n", + " test_corr_combos.append(\n", + " [pert, spearmanr(mean_reconstruction, mean_observed)[0]]\n", + " )\n", + " else:\n", + " test_corr_singles.append(\n", + " [pert, spearmanr(mean_reconstruction, mean_observed)[0]]\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "971dead3-70d4-41a5-98c0-ab485b32da4f", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# plot the distribution of per-perturbation Spearman correlations\n", + "import seaborn as sns\n", + "\n", + "test_corr = np.array(test_corr_singles + test_corr_combos)[:, 1].astype(float)\n", + "sns.histplot(test_corr, label=\"test_corr\")\n", + "plt.title(\"Correlation between predicted and observed expression per perturbation\")\n", + "plt.xlabel(\"Spearman Correlation\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "id": "cfa5cc5e-a7c8-44b2-92de-2c5d083721a1", + "metadata": {}, + "outputs": [], + "source": [ + "np.savetxt(\n", + " \"figures/reconstruction_spearmans/GSFA_inhouse_spearman_correlations.csv\",\n", + " test_corr,\n", + " delimiter=\",\",\n", + " fmt=\"%.2f\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "64fc0b35-f87e-4d8a-87f1-e6080a3ee83f", + "metadata": {}, + "source": [ + "# Perturbation Embedding Heatmap" + ] + }, + { + "cell_type": "markdown", + "id": "c36ebe2b-2835-44ed-9f71-a52b522125b1", + "metadata": {}, + "source": [ + "## Hierarchical Clustering + Enrichment\n", + "- Perform enrichment tests (gprofiler) on hierarchical clustering of perturbation embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0aaf8dc3-7e5b-4748-8685-fc8560bf7fc7", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import seaborn as sns\n", + "from scipy.spatial.distance import pdist, squareform\n", + "\n", + "# Put into df\n", + "pert_emb_df = pd.DataFrame(beta[:-1], index=pert_labels)\n", + "pert_emb_df = pert_emb_df.drop(columns=[0]).astype(\"float\")\n", + "\n", + "# Compute the pairwise distances\n", + "df = pert_emb_df.drop(index=[\"ctrl\", \"intergenic\"])\n", + "\n", + "distance = \"euclidean\"\n", + "\n", + "distances = pdist(df.values, metric=distance)\n", + "\n", + "# Convert the distances into a square distance matrix\n", + "distance_matrix = 
pd.DataFrame(squareform(distances), index=df.index, columns=df.index)\n", + "clustermap = sns.clustermap(distance_matrix, cmap=\"viridis_r\")\n", + "clustermap.fig.suptitle(f\"Pairwise {distance} distance of perturbation latent vectors\")\n", + "# clustermap.ax_row_dendrogram.set_visible(False)\n", + "clustermap.ax_col_dendrogram.set_visible(False)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "5c7d0199-28ac-4e7c-b4b5-180844815fe6", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from scipy.cluster.hierarchy import dendrogram\n", + "\n", + "den = dendrogram(\n", + " clustermap.dendrogram_col.linkage, labels=distance_matrix.index, color_threshold=2\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "0a73d648-3099-47e8-b7d2-6c9e54b4b8c5", + "metadata": {}, + "outputs": [], + "source": [ + "# extract clusters and perform gprofiler\n", + "from collections import defaultdict\n", + "\n", + "\n", + "def get_cluster_classes(den, label=\"ivl\"):\n", + " cluster_idxs = defaultdict(list)\n", + " for c, pi in zip(den[\"color_list\"], den[\"icoord\"]):\n", + " for leg in pi[1:3]:\n", + " i = (leg - 5.0) / 10.0\n", + " if abs(i - int(i)) < 1e-5:\n", + " cluster_idxs[c].append(int(i))\n", + " cluster_classes = {}\n", + " for c, l in cluster_idxs.items(): # noqa\n", + " i_l = [den[label][i] for i in l]\n", + " cluster_classes[c] = i_l\n", + " return cluster_classes\n", + "\n", + "\n", + "clusters = get_cluster_classes(den)\n", + "# extract functions for clusters\n", + "cluster_process = {}\n", + "for c in clusters:\n", + " cluster_df = pd.DataFrame(clusters[c], columns=[\"gene_symbol\"])\n", + " res = get_gprofiler(cluster_df)\n", + " cluster_process[c] = res[res[\"p_value\"] <= 0.05]" + ] + }, + { + "cell_type": "code", + "execution_count": 266, + "id": "fa1a6087-7597-4ac7-8c6b-ebc945044ef8", + "metadata": {}, + "outputs": [], + "source": [ + "# save gprofiler results\n", + "with open(\n", + " \"figures/pert_embedding_cluster_gprofiler/GSFA_inhouse_pert_emb_gprofiler.pickle\",\n", + " \"wb\",\n", + ") as handle:\n", + " pickle.dump(cluster_process, handle, protocol=pickle.HIGHEST_PROTOCOL)" + ] + }, + { + "cell_type": "markdown", + "id": "2b11d8f8-00ca-47cf-acde-53d5d1c88bef", + "metadata": {}, + "source": [ + "## PR Curve+AUC\n", + "- AUCPR using prior graph as binary label" + ] + }, + { + "cell_type": "code", + "execution_count": 268, + 
"id": "16fceeb8-b9a3-495f-a799-f733d21cba7b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " AUC\n", + "StringDB 0.685749\n", + "CORUM 0.536772\n", + " F1 Threshold\n", + "StringDB 0.689548 0.011192\n", + "CORUM 0.632953 0.016780\n" + ] + } + ], + "source": [ + "aucpr_df, f1_df, pr_dict = auprc(distance_matrix)\n", + "print(aucpr_df)\n", + "print(f1_df)\n", + "aucpr_df.to_csv(\"figures/pert_embedding_recall/GSFA_inhouse_aucpr.csv\")\n", + "f1_df.to_csv(\"figures/pert_embedding_recall/GSFA_inhouse_f1.csv\")" + ] + }, + { + "cell_type": "markdown", + "id": "8fdff66a-8f20-4268-8520-d33a0fa4969a", + "metadata": {}, + "source": [ + "# Interpretability of our learned latent space" + ] + }, + { + "cell_type": "markdown", + "id": "1ec54f80-8a0f-41f0-8c55-edd2b1fbfb3c", + "metadata": {}, + "source": [ + "## Factor Enrichment\n", + "- GSEA on log transformed latent-by-gene factors\n", + "- Then associate each perturbation to its top latent factors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35eb8169-15b1-4b5b-b926-d5b7238f372a", + "metadata": {}, + "outputs": [], + "source": [ + "factor_to_go = factor_enrichment_gsea(adata, W.T, fdr=5e-2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "814189ce-53f3-4716-834a-a4f0ea6efcf9", + "metadata": {}, + "outputs": [], + "source": [ + "# filter and add description to processes\n", + "go_df = read_aws_csv(\"s3://pert-spectra/references/GO_to_Description.txt\")\n", + "go_df.set_index(\"Term\", inplace=True)\n", + "go_dict = go_df.to_dict()[\"Description\"]\n", + "\n", + "filtered_factor_to_go = {}\n", + "for i in factor_to_go:\n", + " proc = factor_to_go[i]\n", + " proc[\"descr\"] = [go_dict[x] for x in proc[\"GO_ID\"]]\n", + " filtered_factor_to_go[i] = proc" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "id": "a9e4b2f0-41d1-434d-b67e-94c6bef8ddd4", + "metadata": {}, + "outputs": [], + "source": [ + "with open(\n", 
+ " \"figures/factor_enrichments/GSFA_inhouse_factor_enrichment.pickle\", \"wb\"\n", + ") as handle:\n", + " pickle.dump(filtered_factor_to_go, handle, protocol=pickle.HIGHEST_PROTOCOL)" + ] + }, + { + "cell_type": "markdown", + "id": "c48ac5d8-e953-4b82-b6ca-9e59bc8e49bf", + "metadata": {}, + "source": [ + "## Overlap with prior knowledge/ground truth (stringdb)\n", + "- Group A: a set of GO terms associated with a perturbation (either drivers from msigdb, or from literature) and its neighbors in stringdb\n", + "- Group B: a set of GO terms from the interpretability analysis\n", + "- Hypergeometric test on the overlap of the two groups" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "6867de68-b31b-452d-8f3f-167e5af8d2c9", + "metadata": {}, + "outputs": [], + "source": [ + "# load precomputed factor_to_go dict if available\n", + "filtered_factor_to_go = read_aws_pickle(\n", + " \"s3://pert-spectra/figures/factor_enrichments/GSFA_inhouse_factor_enrichment.pickle\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "bf3b1040-d325-4154-9f9e-92d4ec8d9ab7", + "metadata": {}, + "outputs": [], + "source": [ + "# get neighbors for each perturbation\n", + "pert_neighbors = retrieve_stringdb_neighbors(pert_labels)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "781275db-dc25-4733-84e6-010ebbc17cdd", + "metadata": {}, + "outputs": [], + "source": [ + "# construct group A - known processes for each perturbation\n", + "gene_sets = read_aws_pickle(\"s3://pert-spectra/references/GO_to_Gene.pickle\")\n", + "# subset to biological processes\n", + "go_reference = read_aws_csv(\"s3://pert-spectra/references/GO_terms.txt.gz\", zipped=True)\n", + "go_bp = go_reference[go_reference[\"go_category\"] == \"biological_process\"]\n", + "go_bp_ids = set(go_bp[\"go_id\"].values)\n", + "filtered_go_terms = {key: gene_sets[key] for key in go_bp_ids if key in gene_sets}\n", + "\n", + "# GO terms per perturbation AND its 
neighbors in stringdb\n", + "pert_to_go = {key: set() for key in df.index}\n", + "for goterm in filtered_go_terms:\n", + " for pert in df.index:\n", + " if pert in filtered_go_terms[goterm] and set(\n", + " filtered_go_terms[goterm]\n", + " ).intersection(pert_neighbors[pert]):\n", + " pert_to_go[pert].add(goterm)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "acdb19ec-92d3-4d85-9a0b-23c3711b879b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_32012/994851414.py:11: RuntimeWarning: divide by zero encountered in log\n", + " delta_loading = np.abs(np.log(np.abs(pert_loading))-np.log(np.abs(ctrl_loading)))\n" + ] + } + ], + "source": [ + "# construct set B - model identified processes for each perturbation\n", + "n = 2 # number of top factors to get processes from\n", + "model_pert_to_go = {}\n", + "for pert in pert_emb_df.index:\n", + " if pert in [\"ctrl\", \"intergenic\", \"basal\"]:\n", + " continue\n", + " # get top factors\n", + " pert_emb_df = pd.DataFrame(beta[:-1], index=pert_labels)\n", + " pert_loading = pert_emb_df.loc[pert].to_numpy()[1:].astype(float)\n", + " ctrl_loading = pert_emb_df.loc[\"intergenic\"].to_numpy()[1:].astype(float)\n", + " delta_loading = np.abs(np.log(np.abs(pert_loading)) - np.log(np.abs(ctrl_loading)))\n", + " top_n_factors = np.argpartition(np.array(delta_loading), -n)[-n:]\n", + " # get processes\n", + " model_processes = set()\n", + " for f in top_n_factors:\n", + " proc = filtered_factor_to_go[f]\n", + " model_processes = model_processes.union(set(proc[\"Term\"]))\n", + " model_pert_to_go[pert] = model_processes" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "26781b0d-61c6-45ca-9b42-ee77c73ceed2", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overlap for RIPK1: 9 out of 38 in researchDB\n", + "P-value for RIPK1: 1.1308619075120921e-07\n", + 
"Overlap for MAP3K7: 0 out of 23 in researchDB\n", + "P-value for MAP3K7: 1.0\n", + "Overlap for IKBKB: 7 out of 24 in researchDB\n", + "P-value for IKBKB: 3.830013304044622e-07\n", + "Overlap for SKP1: 0 out of 7 in researchDB\n", + "P-value for SKP1: 1.0\n", + "Overlap for BIRC3: 3 out of 20 in researchDB\n", + "P-value for BIRC3: 0.007686716976516679\n", + "Overlap for TAB2: 0 out of 5 in researchDB\n", + "P-value for TAB2: 1.0\n", + "Overlap for NFKBIA: 6 out of 28 in researchDB\n", + "P-value for NFKBIA: 1.8192895607157648e-05\n", + "Overlap for SKP2: 1 out of 6 in researchDB\n", + "P-value for SKP2: 0.11702982854059442\n", + "Overlap for TRAF6: 4 out of 46 in researchDB\n", + "P-value for TRAF6: 0.018440536255468803\n", + "Overlap for FBXW11: 5 out of 13 in researchDB\n", + "P-value for FBXW11: 3.4478778936666224e-06\n", + "Overlap for TRAF5: 4 out of 14 in researchDB\n", + "P-value for TRAF5: 0.00020482544648502187\n", + "Overlap for CHUK: 7 out of 21 in researchDB\n", + "P-value for CHUK: 1.84359744108617e-07\n", + "Overlap for RBCK1: 0 out of 11 in researchDB\n", + "P-value for RBCK1: 1.0\n", + "Overlap for TAB1: 0 out of 9 in researchDB\n", + "P-value for TAB1: 1.0\n", + "Overlap for IKBKG: 5 out of 17 in researchDB\n", + "P-value for IKBKG: 2.6711340009329823e-05\n", + "Overlap for CUL1: 2 out of 7 in researchDB\n", + "P-value for CUL1: 0.008357243957233893\n", + "Overlap for TNFRSF1A: 7 out of 21 in researchDB\n", + "P-value for TNFRSF1A: 2.298789369209975e-07\n", + "Overlap for TRAF2: 1 out of 29 in researchDB\n", + "P-value for TRAF2: 0.2850611728304155\n", + "Overlap for TRADD: 6 out of 13 in researchDB\n", + "P-value for TRADD: 1.0736863763836888e-07\n", + "Overlap for BIRC2: 4 out of 30 in researchDB\n", + "P-value for BIRC2: 0.0024805517666413985\n", + "Overlap for UBE2N: 0 out of 16 in researchDB\n", + "P-value for UBE2N: 1.0\n" + ] + } + ], + "source": [ + "pvals = perturbation_signal_recovery(\n", + " pert_to_go,\n", + " model_pert_to_go,\n", + 
" list(filtered_go_terms.keys()),\n", + " list(pert_emb_df.index),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 262, + "id": "c3e93509-ea30-44d2-b978-5627a449c255", + "metadata": {}, + "outputs": [], + "source": [ + "# save as csv for visualization\n", + "pd.DataFrame.from_dict(data=pvals, orient=\"index\").to_csv(\n", + " \"figures/process_recovery_hypergeo_pvals/GSFA_inhouse_hypergeo_neighbors_recovery_pvalues.csv\",\n", + " header=False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c7cbc9b-d955-4778-82ea-9e31b1934f59", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pertspectra", + "language": "python", + "name": "pertspectra" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/inhouse_pertspectra_metrics.ipynb b/inhouse_pertspectra_metrics.ipynb new file mode 100644 index 0000000..0e7e833 --- /dev/null +++ b/inhouse_pertspectra_metrics.ipynb @@ -0,0 +1,638 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "b8270f66-7a27-4263-aa97-25abeb2cf34d", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2 " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "529f64f3-c83e-481d-872c-df89b9dd063b", + "metadata": {}, + "outputs": [], + "source": [ + "import pickle\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "\n", + "from utils import (\n", + " auprc,\n", + " factor_enrichment_gsea,\n", + " generate_k_fold,\n", + " get_gprofiler,\n", + " perturbation_signal_recovery,\n", + " read_aws_csv,\n", + " read_aws_h5ad,\n", 
+ " read_aws_pickle,\n", + " retrieve_stringdb_neighbors,\n", + " set_seed,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8adbe3c2-a83b-4a8a-95ec-5f4c103f316c", + "metadata": {}, + "outputs": [], + "source": [ + "# read in trained model outputs generated from ./PertSpectra_load_checkpoints/pertspectra_inhouse.ipynb\n", + "adata = read_aws_h5ad(\n", + " \"s3://pert-spectra/PertSpectra_checkpoints/pertspectra_inhouse/fold_0.h5ad\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "5b7bf773-5e0c-43b8-8df2-4ce3376be959", + "metadata": {}, + "source": [ + "# Reconstruction" + ] + }, + { + "cell_type": "markdown", + "id": "3b4b092b-2120-48c6-a492-84656e8b7169", + "metadata": {}, + "source": [ + "## Spearman Coefficient\n", + "- Spearman correlation between predicted and observed expression of test set\n", + "- Compute correlation for the mean expression aggregated on perturbation, across all kfolds" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "d62bb784-fd4f-4a86-acd4-fd54e784aa73", + "metadata": {}, + "outputs": [], + "source": [ + "n_folds = 5\n", + "model_adatas = []\n", + "for n in range(0, n_folds):\n", + " # new adata\n", + " adata_n = adata.copy()\n", + " # load model from checkpoint\n", + " s3_dir = \"s3://pert-spectra/PertSpectra_checkpoints/\"\n", + " experiment_name = \"pertspectra_inhouse/\"\n", + " model_name = f\"fold_{n}.h5ad\"\n", + " m_adata = read_aws_h5ad(s3_dir + experiment_name + model_name)\n", + " model_adatas.append(m_adata)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "ac5cff31-fd20-4c61-8d0b-7eb7d25a213b", + "metadata": {}, + "outputs": [], + "source": [ + "# iterate through all models and get the losses and correlations\n", + "# take the mean loss and correlation for the test set\n", + "from scipy.stats import spearmanr\n", + "\n", + "test_corr_singles = []\n", + "test_corr_combos = []\n", + "\n", + "set_seed(0)\n", + "\n", + "for n in 
range(n_folds):\n", + " adata_n = model_adatas[n]\n", + " train_idx, val_idx, test_idx = generate_k_fold(\n", + " adata_n, adata_n.X, adata_n.obs[\"condition\"], fold_idx=n\n", + " )\n", + " loss_weights = np.ones(adata_n.shape[0])\n", + " adata_test = adata_n[test_idx]\n", + "\n", + " for pert in adata_test.obs[\"condition\"].unique():\n", + " hold_idx = [\n", + " i\n", + " for i, x in enumerate(\n", + " adata_test[adata_test.obs[\"Treatment\"] == \"TNFA+\"].obs[\"condition\"]\n", + " )\n", + " if x == pert\n", + " ]\n", + " recon = adata_test.uns[\"recon\"]\n", + " # correlation\n", + " mean_reconstruction = recon.mean(axis=0)\n", + " mean_observed = np.squeeze(\n", + " np.array(\n", + " adata_test[adata_test.obs[\"Treatment\"] == \"TNFA+\"][hold_idx].X.mean(\n", + " axis=0\n", + " )\n", + " )\n", + " )\n", + " if (\"+\" in pert) and (\"ctrl\" not in pert):\n", + " test_corr_combos.append(\n", + " [pert, spearmanr(mean_reconstruction, mean_observed)[0], n]\n", + " )\n", + " else:\n", + " test_corr_singles.append(\n", + " [pert, spearmanr(mean_reconstruction, mean_observed)[0], n]\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "62812bb6-f82a-4c0b-ad92-18ec1bb9084d", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# plot the distribution of per-perturbation test-set Spearman correlations\n", + "test_corr = np.array(test_corr_singles + test_corr_combos)[:, 1].astype(float)\n", + "sns.histplot(test_corr, label=\"test_corr\")\n", + "plt.title(\"Correlation between predicted and observed expression per perturbation\")\n", + "plt.xlabel(\"Spearman Correlation\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "2336d296-2911-4607-a902-552a7e7b92fd", + "metadata": {}, + "outputs": [], + "source": [ + "np.savetxt(\n", + " \"figures/reconstruction_spearmans/pertspectra_inhouse_spearman_correlations.csv\",\n", + " test_corr,\n", + " delimiter=\",\",\n", + " fmt=\"%.2f\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "64fc0b35-f87e-4d8a-87f1-e6080a3ee83f", + "metadata": {}, + "source": [ + "# Perturbation Embedding Heatmap" + ] + }, + { + "cell_type": "markdown", + "id": "c36ebe2b-2835-44ed-9f71-a52b522125b1", + "metadata": {}, + "source": [ + "## Hierarchical Clustering + Enrichment\n", + "- Perform enrichment tests (gprofiler) on hierarchical clustering of perturbation embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0aaf8dc3-7e5b-4748-8685-fc8560bf7fc7", + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.spatial.distance import pdist, squareform\n", + "\n", + "# Compute the pairwise distances\n", + "tnfa_pos_pert_embeddings_df = pd.DataFrame(\n", + " adata.uns[\"SPECTRA_pert_scores\"], index=adata.uns[\"Spectra_pert_labels\"]\n", + ")\n", + "df = tnfa_pos_pert_embeddings_df.drop(index=[\"basal\", \"ctrl\", \"intergenic\"])\n", + "\n", + "distance = \"euclidean\"\n", + "distances = pdist(df.values, metric=distance)\n", + "\n", + "# Convert the distances into a square distance matrix\n", + "distance_matrix = pd.DataFrame(squareform(distances), index=df.index, columns=df.index)\n", + "clustermap = 
sns.clustermap(distance_matrix, cmap=\"viridis_r\")\n", + "clustermap.fig.suptitle(f\"Pairwise {distance} distance of perturbation latent vectors\")\n", + "# clustermap.ax_row_dendrogram.set_visible(False)\n", + "clustermap.ax_col_dendrogram.set_visible(False)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "b2181a72-86cf-47a0-aa04-ab154fe0bf95", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from scipy.cluster.hierarchy import dendrogram\n", + "\n", + "den = dendrogram(\n", + " clustermap.dendrogram_col.linkage,\n", + " labels=distance_matrix.index,\n", + " color_threshold=0.01,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "00c46f2e-04d9-4b79-bc0e-b1362d6af844", + "metadata": {}, + "outputs": [], + "source": [ + "# extract clusters and perform gprofiler\n", + "from collections import defaultdict\n", + "\n", + "\n", + "def get_cluster_classes(den, label=\"ivl\"):\n", + " cluster_idxs = defaultdict(list)\n", + " for c, pi in zip(den[\"color_list\"], den[\"icoord\"]):\n", + " for leg in pi[1:3]:\n", + " i = (leg - 5.0) / 10.0\n", + " if abs(i - int(i)) < 1e-5:\n", + " cluster_idxs[c].append(int(i))\n", + " cluster_classes = {}\n", + " for c, l in cluster_idxs.items(): # noqa\n", + " i_l = [den[label][i] for i in l]\n", + " cluster_classes[c] = i_l\n", + " return cluster_classes\n", + "\n", + "\n", + "clusters = get_cluster_classes(den)\n", + "# extract functions for clusters\n", + "cluster_process = {}\n", + "for c in clusters:\n", + " cluster_df = pd.DataFrame(clusters[c], columns=[\"gene_symbol\"])\n", + " res = get_gprofiler(cluster_df)\n", + " cluster_process[c] = res[res[\"p_value\"] <= 0.05]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "ebfece22-89bd-4010-bcb9-e99431c63e94", + "metadata": {}, + "outputs": [], + "source": [ + "# save gprofiler results\n", + "with open(\n", + " \"figures/pert_embedding_cluster_gprofiler/pertspectra_inhouse_pert_emb_gprofiler.pickle\",\n", + " \"wb\",\n", + ") as handle:\n", + " pickle.dump(cluster_process, handle, protocol=pickle.HIGHEST_PROTOCOL)" + ] + }, + { + "cell_type": "markdown", + "id": "55eaf3f0-4b9f-4028-a042-f6f03996350f", + "metadata": {}, + "source": [ + "## PR Curve+AUC\n", + "- AUCPR using prior graph as binary label" + ] + }, + { + "cell_type": "code", + 
"execution_count": 10, + "id": "ffd66c94-5f10-407c-b019-17d09a11a0ca", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " AUC\n", + "StringDB 0.675809\n", + "CORUM 0.705489\n", + " F1 Threshold\n", + "StringDB 0.697966 0.007078\n", + "CORUM 0.639556 0.160069\n" + ] + } + ], + "source": [ + "aucpr_df, f1_df, pr_dict = auprc(distance_matrix)\n", + "print(aucpr_df)\n", + "print(f1_df)\n", + "aucpr_df.to_csv(\"figures/pert_embedding_recall/pertspectra_inhouse_aucpr.csv\")\n", + "f1_df.to_csv(\"figures/pert_embedding_recall/pertspectra_inhouse_f1.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "c3ef0a88-3b42-4d4c-b7f4-79c018ee3f45", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Plot the precision-recall curve\n", + "prior = \"StringDB\"\n", + "plt.figure(figsize=(8, 6))\n", + "plt.plot(\n", + " pr_dict[prior][\"recall\"],\n", + " pr_dict[prior][\"precision\"],\n", + " marker=\".\",\n", + " label=\"Precision-Recall Curve\",\n", + ")\n", + "# Adding labels and title\n", + "plt.xlabel(\"Recall\")\n", + "plt.ylabel(\"Precision\")\n", + "plt.title(\"Precision-Recall Curve\")\n", + "plt.legend()\n", + "# Save the plot\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "8fdff66a-8f20-4268-8520-d33a0fa4969a", + "metadata": {}, + "source": [ + "# Interpretability of our learned latent space" + ] + }, + { + "cell_type": "markdown", + "id": "1ec54f80-8a0f-41f0-8c55-edd2b1fbfb3c", + "metadata": {}, + "source": [ + "## Factor Enrichment\n", + "- GSEA on log transformed latent-by-gene factors\n", + "- Then associate each perturbation to its top latent factors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35eb8169-15b1-4b5b-b926-d5b7238f372a", + "metadata": {}, + "outputs": [], + "source": [ + "factor_to_go = factor_enrichment_gsea(adata, adata.uns[\"SPECTRA_factors\"], fdr=1e-2)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "814189ce-53f3-4716-834a-a4f0ea6efcf9", + "metadata": {}, + "outputs": [], + "source": [ + "# filter and add description to processes\n", + "go_df = read_aws_csv(\"s3://pert-spectra/references/GO_to_Description.txt\")\n", + "go_df.set_index(\"Term\", inplace=True)\n", + "go_dict = go_df.to_dict()[\"Description\"]\n", + "\n", + "filtered_factor_to_go = {}\n", + "for i in factor_to_go:\n", + " proc = factor_to_go[i]\n", + " proc[\"descr\"] = [go_dict[x] for x in proc[\"GO_ID\"]]\n", + " filtered_factor_to_go[i] = proc" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "999ea1a6-b9f9-4cdd-a8cc-15e4db23f7f7", + "metadata": {}, + "outputs": [], + "source": [ + "# 
save latent enrichment results\n", + "with open(\n", + " \"figures/factor_enrichments/pertspectra_inhouse_factor_enrichment.pickle\", \"wb\"\n", + ") as handle:\n", + " pickle.dump(filtered_factor_to_go, handle, protocol=pickle.HIGHEST_PROTOCOL)" + ] + }, + { + "cell_type": "markdown", + "id": "78d01afb-6501-4424-99f5-05839747f118", + "metadata": {}, + "source": [ + "## Overlap with prior knowledge/ground truth (stringdb)\n", + "- Group A: a set of GO terms associated with a perturbation (either drivers from msigdb, or from literature) and its neighbors in stringdb\n", + "- Group B: a set of GO terms from the interpretability analysis\n", + "- Hypergeometric test on the overlap of the two groups" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "d0274191-ea1a-4abf-8704-53f99a08fa3c", + "metadata": {}, + "outputs": [], + "source": [ + "# load precomputed factor_to_go dict if available\n", + "filtered_factor_to_go = read_aws_pickle(\n", + " \"s3://pert-spectra/figures/factor_enrichments/pertspectra_inhouse_factor_enrichment.pickle\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "782c4388-f53b-4963-8ee1-578de81d67de", + "metadata": {}, + "outputs": [], + "source": [ + "# get neighbors for each perturbation\n", + "pert_neighbors = retrieve_stringdb_neighbors(adata.uns[\"Spectra_pert_labels\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "abfc3fd4-806e-459d-af34-0b9f365a8e91", + "metadata": {}, + "outputs": [], + "source": [ + "# construct group A - known processes for each perturbation\n", + "gene_sets = read_aws_pickle(\"s3://pert-spectra/references/GO_to_Gene.pickle\")\n", + "# BP only go terms\n", + "go_reference = read_aws_csv(\"s3://pert-spectra/references/GO_terms.txt.gz\", zipped=True)\n", + "go_bp = go_reference[go_reference[\"go_category\"] == \"biological_process\"]\n", + "go_bp_ids = set(go_bp[\"go_id\"].values)\n", + "filtered_go_terms = {key: gene_sets[key] for key in 
go_bp_ids if key in gene_sets}\n", + "\n", + "# GO terms per perturbation AND its neighbors in stringdb\n", + "pert_to_go = {key: set() for key in adata.uns[\"Spectra_pert_labels\"]}\n", + "for goterm in filtered_go_terms:\n", + " for pert in adata.uns[\"Spectra_pert_labels\"]:\n", + " if pert in filtered_go_terms[goterm] and set(\n", + " filtered_go_terms[goterm]\n", + " ).intersection(pert_neighbors[pert]):\n", + " pert_to_go[pert].add(goterm)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "3245b161-3e6c-4966-99d9-b7bb1d2ba6e6", + "metadata": {}, + "outputs": [], + "source": [ + "# construct set B - model identified processes for each perturbation\n", + "n = 2 # number of top factors to get processes from\n", + "model_pert_to_go = {}\n", + "for pert in adata.uns[\"Spectra_pert_labels\"]:\n", + " if pert in [\"ctrl\", \"intergenic\", \"basal\"]:\n", + " continue\n", + " # get top factors\n", + " tnfa_pos_pert_embeddings_df = pd.DataFrame(\n", + " adata.uns[\"SPECTRA_pert_scores\"], index=adata.uns[\"Spectra_pert_labels\"]\n", + " )\n", + " pert_loading = tnfa_pos_pert_embeddings_df.loc[pert]\n", + " intergenic_loading = tnfa_pos_pert_embeddings_df.loc[\"intergenic\"]\n", + " delta_loading = np.abs(np.log(pert_loading) - np.log(intergenic_loading))\n", + " top_n_factors = np.argpartition(np.array(delta_loading), -n)[-n:]\n", + " # get processes\n", + " model_processes = set()\n", + " for f in top_n_factors:\n", + " proc = filtered_factor_to_go[f]\n", + " model_processes = model_processes.union(set(proc[\"Term\"]))\n", + " model_pert_to_go[pert] = model_processes" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "e6a99270-618f-467b-8c0c-67610171ddbd", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overlap for SKP2: 2 out of 6 in researchDB\n", + "P-value for SKP2: 0.005521122949289679\n", + "Overlap for NFKBIA: 4 out of 28 in researchDB\n", + "P-value 
for NFKBIA: 0.0018530509193777707\n", + "Overlap for TRAF2: 12 out of 29 in researchDB\n", + "P-value for TRAF2: 9.372890092145309e-14\n", + "Overlap for BIRC2: 12 out of 30 in researchDB\n", + "P-value for BIRC2: 1.5352819800190863e-13\n", + "Overlap for TAB1: 6 out of 9 in researchDB\n", + "P-value for TAB1: 9.880142853042106e-07\n", + "Overlap for UBE2N: 6 out of 16 in researchDB\n", + "P-value for UBE2N: 3.5981263241025083e-07\n", + "Overlap for CHUK: 12 out of 21 in researchDB\n", + "P-value for CHUK: 8.970401417521286e-16\n", + "Overlap for RBCK1: 8 out of 11 in researchDB\n", + "P-value for RBCK1: 4.469976857613242e-09\n", + "Overlap for IKBKB: 15 out of 24 in researchDB\n", + "P-value for IKBKB: 1.262837991043831e-14\n", + "Overlap for TNFRSF1A: 12 out of 21 in researchDB\n", + "P-value for TNFRSF1A: 3.557360199375414e-11\n", + "Overlap for TRAF6: 14 out of 46 in researchDB\n", + "P-value for TRAF6: 8.701845914835026e-14\n", + "Overlap for BIRC3: 8 out of 20 in researchDB\n", + "P-value for BIRC3: 2.0442561238209195e-09\n", + "Overlap for TRADD: 11 out of 13 in researchDB\n", + "P-value for TRADD: 2.20628781790212e-13\n", + "Overlap for TRAF5: 9 out of 14 in researchDB\n", + "P-value for TRAF5: 8.644484310231755e-13\n", + "Overlap for IKBKG: 12 out of 17 in researchDB\n", + "P-value for IKBKG: 7.268024700298374e-13\n", + "Overlap for TAB2: 5 out of 5 in researchDB\n", + "P-value for TAB2: 2.730409006194343e-07\n", + "Overlap for RIPK1: 28 out of 38 in researchDB\n", + "P-value for RIPK1: 3.1419351093251465e-29\n", + "Overlap for SKP1: 3 out of 7 in researchDB\n", + "P-value for SKP1: 0.00023036479244599009\n", + "Overlap for MAP3K7: 10 out of 23 in researchDB\n", + "P-value for MAP3K7: 4.7379439102193765e-08\n", + "Overlap for FBXW11: 11 out of 13 in researchDB\n", + "P-value for FBXW11: 2.0898341048664293e-13\n", + "Overlap for CUL1: 2 out of 7 in researchDB\n", + "P-value for CUL1: 0.007218985999647311\n" + ] + } + ], + "source": [ + "pvals = 
perturbation_signal_recovery(\n", + " pert_to_go,\n", + " model_pert_to_go,\n", + " list(filtered_go_terms.keys()),\n", + " list(adata.uns[\"Spectra_pert_labels\"]),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "b057ceb0-49b0-4c4d-870d-d9a3fe7b1428", + "metadata": {}, + "outputs": [], + "source": [ + "# save as csv for visualization\n", + "pd.DataFrame.from_dict(data=pvals, orient=\"index\").to_csv(\n", + " \"figures/process_recovery_hypergeo_pvals/MODEL_inhouse_hypergeo_neighbors_recovery_pvalues.csv\",\n", + " header=False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "99859d27-3a23-42f0-9d01-1087d698d118", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pertspectra", + "language": "python", + "name": "pertspectra" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/inhouse_scETM_metrics.ipynb b/inhouse_scETM_metrics.ipynb new file mode 100644 index 0000000..331d955 --- /dev/null +++ b/inhouse_scETM_metrics.ipynb @@ -0,0 +1,435 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "b8270f66-7a27-4263-aa97-25abeb2cf34d", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2 " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "529f64f3-c83e-481d-872c-df89b9dd063b", + "metadata": {}, + "outputs": [], + "source": [ + "import pickle\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "\n", + "from utils import (\n", + " auprc,\n", + " factor_enrichment_gsea,\n", + " get_gprofiler,\n", + " read_aws_csv,\n", + " 
read_aws_h5ad,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d15b3b79-f031-4029-b4cc-3085534cf1ed", + "metadata": {}, + "outputs": [], + "source": [ + "# read in scETM results\n", + "adata = read_aws_h5ad(\"s3://pert-spectra/scETM_checkpoints/scetm_inhouse/fold_0.h5ad\")\n", + "# read in pertspectra results to retrieve gene labels\n", + "ref_adata = read_aws_h5ad(\n", + " \"s3://pert-spectra/PertSpectra_checkpoints/pertspectra_inhouse/fold_0.h5ad\"\n", + ")\n", + "adata.var_names = ref_adata.var_names" + ] + }, + { + "cell_type": "markdown", + "id": "5b7bf773-5e0c-43b8-8df2-4ce3376be959", + "metadata": {}, + "source": [ + "# Reconstruction" + ] + }, + { + "cell_type": "markdown", + "id": "3b4b092b-2120-48c6-a492-84656e8b7169", + "metadata": {}, + "source": [ + "## Spearman Coefficient\n", + "- Spearman correlation between predicted and observed expression of test set\n", + "- Compute correlation for the mean expression aggregated on perturbation, across all kfolds" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "e2885bd3-5433-47d6-88d5-47be83df5b5a", + "metadata": {}, + "outputs": [], + "source": [ + "n_folds = 5\n", + "model_adatas = []\n", + "for n in range(0, n_folds):\n", + " # new adata\n", + " adata_n = adata.copy()\n", + " # load model from checkpoint\n", + " s3_dir = \"s3://pert-spectra/scETM_checkpoints/\"\n", + " experiment_name = \"scetm_inhouse/\"\n", + " model_name = f\"fold_{n}.h5ad\"\n", + " m_adata = read_aws_h5ad(s3_dir + experiment_name + model_name)\n", + " model_adatas.append(m_adata)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e0c30413-c2c1-4058-b3bf-4dc74437c0dd", + "metadata": {}, + "outputs": [], + "source": [ + "# iterate through all models and get the losses and correlations\n", + "# take the mean loss and correlation for the test set\n", + "from scipy.stats import spearmanr\n", + "\n", + "test_corr_singles = []\n", + "test_corr_combos = []\n", + "\n", + "for n 
in range(n_folds):\n", + " adata_n = model_adatas[n]\n", + " loss_weights = np.ones(adata_n.shape[0])\n", + "\n", + " for pert in adata_n.obs[\"condition\"].unique():\n", + " hold_idx = [\n", + " i\n", + " for i, x in enumerate(\n", + " adata_n[adata_n.obs[\"Treatment\"] == \"TNFA+\"].obs[\"condition\"]\n", + " )\n", + " if x == pert\n", + " ]\n", + " recon = adata_n[adata_n.obs[\"Treatment\"] == \"TNFA+\"][hold_idx].uns[\"recon\"]\n", + " # correlation\n", + " mean_reconstruction = recon.mean(axis=0)\n", + " mean_observed = np.squeeze(\n", + " np.array(\n", + " adata_n[adata_n.obs[\"Treatment\"] == \"TNFA+\"][hold_idx].X.mean(axis=0)\n", + " )\n", + " )\n", + " if (\"+\" in pert) and (\"ctrl\" not in pert):\n", + " test_corr_combos.append(\n", + " [pert, spearmanr(mean_reconstruction, mean_observed)[0], n]\n", + " )\n", + " else:\n", + " test_corr_singles.append(\n", + " [pert, spearmanr(mean_reconstruction, mean_observed)[0], n]\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "e61e2799-ed83-421e-86d4-565da4876c10", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# plot train correlation distribution vs test correlation distribution\n", + "test_corr = np.array(test_corr_singles + test_corr_combos)[:, 1].astype(float)\n", + "sns.histplot(test_corr, label=\"test_corr\")\n", + "plt.title(\"Correlation between predicted and observed expression per perturbation\")\n", + "plt.xlabel(\"Spearman Correlation\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "4b36f1b2-1b9c-4686-b9fb-43c1d9f64b59", + "metadata": {}, + "outputs": [], + "source": [ + "np.savetxt(\n", + " \"figures/reconstruction_spearmans/scETM_inhouse_spearman_correlations.csv\",\n", + " test_corr,\n", + " delimiter=\",\",\n", + " fmt=\"%.2f\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "64fc0b35-f87e-4d8a-87f1-e6080a3ee83f", + "metadata": {}, + "source": [ + "# Perturbation Embedding Heatmap" + ] + }, + { + "cell_type": "markdown", + "id": "c36ebe2b-2835-44ed-9f71-a52b522125b1", + "metadata": {}, + "source": [ + "## Hierarchical Clustering + Enrichment\n", + "- Perform enrichment tests (gprofiler) on hierarchical clustering of perturbation embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0aaf8dc3-7e5b-4748-8685-fc8560bf7fc7", + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.spatial.distance import pdist, squareform\n", + "\n", + "# aggregate cell embeddings to perturbation embeddings\n", + "cell_emb = adata.uns[\"cell_emb\"] @ adata.uns[\"topics\"]\n", + "perts = []\n", + "pert_emb = []\n", + "for i in adata.obs[\"condition\"].unique():\n", + " if \"+\" not in i and i != \"nan\":\n", + " perts.append(i)\n", + " pert_emb.append(cell_emb[adata.obs[\"condition\"] == i].mean(axis=0))\n", + "pert_emb = np.array(pert_emb)\n", + "pert_emb_df = pd.DataFrame(pert_emb, index=perts)\n", + "\n", + "# Compute the pairwise distances\n", + "df = pert_emb_df.drop(index=[\"ctrl\", 
\"intergenic\"])\n", + "\n", + "distance = \"euclidean\"\n", + "distances = pdist(df.values, metric=distance)\n", + "\n", + "# Convert the distances into a square distance matrix\n", + "distance_matrix = pd.DataFrame(squareform(distances), index=df.index, columns=df.index)\n", + "clustermap = sns.clustermap(distance_matrix, cmap=\"viridis_r\")\n", + "clustermap.fig.suptitle(f\"Pairwise {distance} distance of perturbation latent vectors\")\n", + "# clustermap.ax_row_dendrogram.set_visible(False)\n", + "clustermap.ax_col_dendrogram.set_visible(False)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "271cd67a-9441-471b-b125-819e8b7b738a", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from scipy.cluster.hierarchy import dendrogram\n", + "\n", + "den = dendrogram(\n", + " clustermap.dendrogram_col.linkage,\n", + " labels=distance_matrix.index,\n", + " color_threshold=0.25,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "89af6f64-194f-4f1c-a5ed-dafd7243d371", + "metadata": {}, + "outputs": [], + "source": [ + "# extract clusters and perform gprofiler\n", + "from collections import defaultdict\n", + "\n", + "\n", + "def get_cluster_classes(den, label=\"ivl\"):\n", + " cluster_idxs = defaultdict(list)\n", + " for c, pi in zip(den[\"color_list\"], den[\"icoord\"]):\n", + " for leg in pi[1:3]:\n", + " i = (leg - 5.0) / 10.0\n", + " if abs(i - int(i)) < 1e-5:\n", + " cluster_idxs[c].append(int(i))\n", + " cluster_classes = {}\n", + " for c, l in cluster_idxs.items(): # noqa\n", + " i_l = [den[label][i] for i in l]\n", + " cluster_classes[c] = i_l\n", + " return cluster_classes\n", + "\n", + "\n", + "clusters = get_cluster_classes(den)\n", + "# extract functions for clusters\n", + "cluster_process = {}\n", + "for c in clusters:\n", + " cluster_df = pd.DataFrame(clusters[c], columns=[\"gene_symbol\"])\n", + " res = get_gprofiler(cluster_df)\n", + " cluster_process[c] = res[res[\"p_value\"] <= 0.05]" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "018bb814-e2b1-47a1-8331-22ec810dfeb3", + "metadata": {}, + "outputs": [], + "source": [ + "# save gprofiler results\n", + "with open(\n", + " \"figures/pert_embedding_cluster_gprofiler/scETM_inhouse_pert_emb_gprofiler.pickle\",\n", + " \"wb\",\n", + ") as handle:\n", + " pickle.dump(cluster_process, handle, protocol=pickle.HIGHEST_PROTOCOL)" + ] + }, + { + "cell_type": "markdown", + "id": "126853e8-6814-45b0-9420-6b822cb8f42c", + "metadata": {}, + "source": [ + "## PR Curve+AUC\n", + "- AUCPR using prior graph as binary label" + ] + }, + { + "cell_type": "code", +
"execution_count": 11, + "id": "b02b6a47-b384-457b-b9ed-7840807a880e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " AUC\n", + "StringDB 0.573643\n", + "CORUM 0.520186\n", + " F1 Threshold\n", + "StringDB 0.679641 0.0\n", + "CORUM 0.630435 0.0\n" + ] + } + ], + "source": [ + "aucpr_df, f1_df, pr_dict = auprc(distance_matrix)\n", + "print(aucpr_df)\n", + "print(f1_df)\n", + "aucpr_df.to_csv(\"figures/pert_embedding_recall/scETM_inhouse_aucpr.csv\")\n", + "f1_df.to_csv(\"figures/pert_embedding_recall/scETM_inhouse_f1.csv\")" + ] + }, + { + "cell_type": "markdown", + "id": "8fdff66a-8f20-4268-8520-d33a0fa4969a", + "metadata": {}, + "source": [ + "# Interpretability of our learned latent space" + ] + }, + { + "cell_type": "markdown", + "id": "1ec54f80-8a0f-41f0-8c55-edd2b1fbfb3c", + "metadata": {}, + "source": [ + "## Factor Enrichment\n", + "- GSEA on log transformed latent-by-gene factors\n", + "- Then associate each perturbation to its top latent factors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35eb8169-15b1-4b5b-b926-d5b7238f372a", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-12-30 15:49:36,590 [INFO] Parsing data files for GSEA.............................\n", + "2024-12-30 15:49:36,656 [INFO] 17782 gene_sets have been filtered out when max_size=300 and min_size=10\n", + "2024-12-30 15:49:36,657 [INFO] 1203 gene_sets used for further statistical testing.....\n" + ] + } + ], + "source": [ + "factor_to_go = factor_enrichment_gsea(adata, np.abs(adata.uns[\"gene_emb\"]), fdr=5e-2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "814189ce-53f3-4716-834a-a4f0ea6efcf9", + "metadata": {}, + "outputs": [], + "source": [ + "# filter and add description to processes\n", + "go_df = read_aws_csv(\"s3://pert-spectra/references/GO_to_Description.txt\")\n", + "go_df.set_index(\"Term\", 
inplace=True)\n", + "go_dict = go_df.to_dict()[\"Description\"]\n", + "\n", + "filtered_factor_to_go = {}\n", + "for i in factor_to_go:\n", + " proc = factor_to_go[i]\n", + " proc[\"descr\"] = [go_dict[x] for x in proc[\"GO_ID\"]]\n", + " filtered_factor_to_go[i] = proc" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b785ba3-f9c1-4207-b784-1ffe2ba74c30", + "metadata": {}, + "outputs": [], + "source": [ + "# save latent enrichment results\n", + "with open(\n", + " \"figures/factor_enrichments/scETM_inhouse_factor_enrichment.pickle\", \"wb\"\n", + ") as handle:\n", + " pickle.dump(filtered_factor_to_go, handle, protocol=pickle.HIGHEST_PROTOCOL)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pertspectra", + "language": "python", + "name": "pertspectra" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/norman_GSFA_metrics.ipynb b/norman_GSFA_metrics.ipynb new file mode 100644 index 0000000..25f8cc8 --- /dev/null +++ b/norman_GSFA_metrics.ipynb @@ -0,0 +1,735 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "b8270f66-7a27-4263-aa97-25abeb2cf34d", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2 " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "529f64f3-c83e-481d-872c-df89b9dd063b", + "metadata": {}, + "outputs": [], + "source": [ + "import pickle\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from utils import (\n", + " auprc,\n", + " factor_enrichment_gsea,\n", + " get_gprofiler,\n", + " perturbation_signal_recovery,\n", + " read_aws_csv,\n", + " read_aws_h5ad,\n", + " read_aws_npz,\n", + " 
read_aws_pickle,\n", + " retrieve_stringdb_neighbors,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "39762015-fa94-4ae8-9038-405f7c56a03b", + "metadata": {}, + "outputs": [], + "source": [ + "# read learned parameters\n", + "Z = read_aws_csv(\n", + " \"s3://pert-spectra/gsfa_checkpoints/norman_gsfa_outputs/Z.csv\"\n", + ").to_numpy()\n", + "W = read_aws_csv(\n", + " \"s3://pert-spectra/gsfa_checkpoints/norman_gsfa_outputs/W.csv\"\n", + ").to_numpy()\n", + "F = read_aws_csv(\n", + " \"s3://pert-spectra/gsfa_checkpoints/norman_gsfa_outputs/F.csv\"\n", + ").to_numpy()\n", + "beta = read_aws_csv(\n", + " \"s3://pert-spectra/gsfa_checkpoints/norman_gsfa_outputs/beta.csv\"\n", + ").to_numpy()\n", + "lsfr = read_aws_csv(\n", + " \"s3://pert-spectra/gsfa_checkpoints/norman_gsfa_outputs/lsfr.csv\"\n", + ").to_numpy()\n", + "# read gene and perturbation information\n", + "gene_labels = read_aws_csv(\"s3://pert-spectra/gsfa_checkpoints/norman_top_genes.csv\")[\n", + " \"x\"\n", + "].to_numpy()\n", + "pert_labels = read_aws_npz(\"s3://pert-spectra/gsfa_checkpoints/norman_G_labels.npz\")[\n", + " \"arr_0\"\n", + "]\n", + "pert_labels = [x.split(\"_\")[1] for x in pert_labels[:-1]]\n", + "pert_labels = pert_labels + [\"ctrl\"]\n", + "# read inputs and preprocessed input\n", + "preprocessed_Y = read_aws_npz(\n", + " \"s3://pert-spectra/gsfa_checkpoints/norman_GSFA_preprocessed.npz\"\n", + ")[\"array1\"]\n", + "G = read_aws_npz(\"s3://pert-spectra/gsfa_checkpoints/norman_GSFA_inputs.npz\")[\"array2\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "601bbfb0-17ed-4694-850c-fcbf1fef961b", + "metadata": {}, + "outputs": [], + "source": [ + "# read in adata of raw data for reference\n", + "adata = read_aws_h5ad(\"path to raw Norman adata here\")\n", + "adata = adata[:, gene_labels - 1]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "7888facf-9e1c-4b41-9f48-2e0d194a36af", + "metadata": {}, + "outputs": [], + 
"source": [ + "# subset to powered perturbations\n", + "obs_df = pd.DataFrame(adata.obs[\"perturbation_name\"])\n", + "category_counts = obs_df[\"perturbation_name\"].value_counts()\n", + "filtered_categories = category_counts[category_counts >= 50].index\n", + "adata = adata[adata.obs[\"perturbation_name\"].isin(filtered_categories)]" + ] + }, + { + "cell_type": "markdown", + "id": "5b7bf773-5e0c-43b8-8df2-4ce3376be959", + "metadata": {}, + "source": [ + "# Reconstruction" + ] + }, + { + "cell_type": "markdown", + "id": "3b4b092b-2120-48c6-a492-84656e8b7169", + "metadata": {}, + "source": [ + "## Spearman Coefficient\n", + "- Spearman correlation between predicted and observed expression (on DE genes if available)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "e2885bd3-5433-47d6-88d5-47be83df5b5a", + "metadata": {}, + "outputs": [], + "source": [ + "# for GSFA, measure reconstruction on training\n", + "from scipy.stats import spearmanr\n", + "\n", + "reconstruction = Z @ W.T\n", + "# rebuild pert labels\n", + "G_labels = []\n", + "for row in G:\n", + " pert_idx = np.where(row == 1)[0].tolist()\n", + " perts = [pert_labels[i] for i in pert_idx]\n", + " G_labels.append(\"+\".join(perts))\n", + "test_corr_singles = []\n", + "test_corr_combos = []\n", + "\n", + "for pert in set(G_labels):\n", + " hold_idx = [i for i, x in enumerate(G_labels) if x == pert]\n", + " if not hold_idx:\n", + " continue\n", + " recon = reconstruction[hold_idx]\n", + " # correlation\n", + " mean_reconstruction = recon.mean(axis=0)\n", + " mean_observed = preprocessed_Y[hold_idx].mean(axis=0)\n", + " if (\"+\" in pert) and (\"ctrl\" not in pert):\n", + " test_corr_combos.append(\n", + " [pert, spearmanr(mean_reconstruction, mean_observed)[0]]\n", + " )\n", + " else:\n", + " test_corr_singles.append(\n", + " [pert, spearmanr(mean_reconstruction, mean_observed)[0]]\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": 
"bc94b654-4864-485b-94e7-c1cb1f6f8baf", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# plot train correlation distribution vs test correlation distribution\n", + "import seaborn as sns\n", + "\n", + "test_corr = np.array(test_corr_singles + test_corr_combos)[:, 1].astype(float)\n", + "sns.histplot(test_corr, label=\"test_corr\")\n", + "plt.title(\"Correlation between predicted and observed expression per perturbation\")\n", + "plt.xlabel(\"Spearman Correlation\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "id": "e7033192-d47d-4568-aa9a-63a7aac918c0", + "metadata": {}, + "outputs": [], + "source": [ + "np.savetxt(\n", + " \"figures/reconstruction_spearmans/GSFA_norman_spearman_correlations.csv\",\n", + " test_corr,\n", + " delimiter=\",\",\n", + " fmt=\"%.2f\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "64fc0b35-f87e-4d8a-87f1-e6080a3ee83f", + "metadata": {}, + "source": [ + "# Perturbation Embedding Heatmap" + ] + }, + { + "cell_type": "markdown", + "id": "c36ebe2b-2835-44ed-9f71-a52b522125b1", + "metadata": {}, + "source": [ + "## Hierarchical Clustering + Enrichment\n", + "- Perform enrichment tests (gprofiler) on hierarchical clustering of perturbation embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0aaf8dc3-7e5b-4748-8685-fc8560bf7fc7", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import seaborn as sns\n", + "from scipy.cluster.hierarchy import dendrogram, fcluster, linkage\n", + "from scipy.spatial.distance import pdist, squareform\n", + "\n", + "# Put into df\n", + "pert_emb_df = pd.DataFrame(beta[:-1], index=pert_labels)\n", + "pert_emb_df = pert_emb_df.drop(columns=[0]).astype(\"float\")\n", + "\n", + "# Compute the pairwise distances\n", + "df = pert_emb_df.drop(index=[\"ctrl\"])\n", + "\n", + "distance = \"euclidean\"\n", + "distances = pdist(df.values, metric=distance)\n", + "\n", + "# Convert the distances into a square distance 
matrix\n", + "distance_matrix = pd.DataFrame(squareform(distances), index=df.index, columns=df.index)\n", + "\n", + "# Compute the Ward linkage (NOTE: the square distance_matrix is passed here, so SciPy treats its rows as observation vectors; pass the condensed `distances` to cluster on the pairwise distances directly)\n", + "linkage_matrix = linkage(distance_matrix, method=\"ward\")\n", + "\n", + "# Compute the dendrogram layout without drawing it (no_plot=True)\n", + "dendro = dendrogram(linkage_matrix, no_plot=True)\n", + "\n", + "# Assign clusters\n", + "max_d = 10 # Max distance for flat clusters\n", + "clusters = fcluster(linkage_matrix, max_d, criterion=\"distance\")\n", + "\n", + "# Create a color palette for clusters\n", + "palette = sns.color_palette(\"husl\", len(np.unique(clusters)))\n", + "\n", + "# Map each cluster id to a color\n", + "cluster_colors = [palette[i - 1] for i in clusters]\n", + "\n", + "# Show the plot\n", + "plt.show()\n", + "\n", + "clustermap = sns.clustermap(\n", + " distance_matrix, cmap=\"viridis_r\", row_colors=cluster_colors\n", + ")\n", + "clustermap.fig.suptitle(f\"Pairwise {distance} distance of perturbation latent vectors\")\n", + "# clustermap.ax_row_dendrogram.set_visible(False)\n", + "clustermap.ax_col_dendrogram.set_visible(False)\n", + "# clustermap.cax.set_visible(False)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "d187a4f5-b9d8-4dfd-a98b-c759446de3e6", + "metadata": {}, + "outputs": [], + "source": [ + "# extract clusters and perform gprofiler\n", + "index_names = list(distance_matrix.index)\n", + "cluster_df = pd.DataFrame({\"Pert\": index_names, \"Cluster\": clusters})\n", + "cluster_process = {}\n", + "for cluster_id in np.unique(clusters):\n", + " points_in_cluster = cluster_df[cluster_df[\"Cluster\"] == cluster_id][\"Pert\"].tolist()\n", + " gprofiler_in = pd.DataFrame(points_in_cluster, columns=[\"gene_symbol\"])\n", + " res = get_gprofiler(gprofiler_in)\n", + " cluster_process[cluster_id] = res[res[\"p_value\"] <= 0.05]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7dedf5b-de4c-4142-bbd4-1a2e4a4a4ec0", +
"metadata": {}, + "outputs": [], + "source": [ + "# Create descriptions for each cluster (for demonstration purpose)\n", + "cluster_descriptions = {\n", + " 1: \"\",\n", + " 2: \"Embryonic development\",\n", + " 3: \"Transcription cis-regulatory region binding, metabolic process\",\n", + " 4: \"protein serine/threonine kinase activity\",\n", + " 5: \"\",\n", + " 6: \"DNA-binding transcription factor activity, RNA polymerase II transcription regulatory region\",\n", + " 7: \"\",\n", + " 8: \"DNA binding, negative regulation of biosynthetic process\",\n", + " 9: \"\",\n", + " 10: \"chromatin\",\n", + " 11: \"epidermal growth factor\",\n", + " 12: \"collagen type II trimer\",\n", + " 13: \"\",\n", + " 14: \"\",\n", + " 15: \"C/EBP complex\",\n", + " 16: \"\",\n", + "}\n", + "\n", + "# Create a consistent color palette\n", + "unique_clusters = sorted(np.unique(clusters))\n", + "palette = sns.color_palette(\"husl\", len(unique_clusters))\n", + "cluster_colors = [palette[i - 1] for i in clusters]\n", + "cluster_colors = {cid: palette[i] for i, cid in enumerate(unique_clusters)}\n", + "\n", + "# Create clustermap using consistent colors\n", + "row_colors = [cluster_colors[cid] for cid in clusters]\n", + "clustermap = sns.clustermap(\n", + " distance_matrix, cmap=\"viridis_r\", row_colors=[palette[i - 1] for i in clusters]\n", + ")\n", + "clustermap.savefig(\n", + " \"figures/figure_pngs/GSFA_norman_clustermap.png\", dpi=600, bbox_inches=\"tight\"\n", + ")\n", + "\n", + "# Prepare data for the table\n", + "cluster_data = {\n", + " \"Cluster ID\": unique_clusters,\n", + " \"Description\": [cluster_descriptions[cid] for cid in unique_clusters],\n", + "}\n", + "\n", + "cluster_df = pd.DataFrame(cluster_data)\n", + "\n", + "# Plot the table with colors matching the clustermap\n", + "fig, ax = plt.subplots(figsize=(18, 4))\n", + "ax.axis(\"off\")\n", + "ax.axis(\"tight\")\n", + "\n", + "# Create table while applying the color to the row background\n", + "table = ax.table(\n", + 
" cellText=cluster_df.values,\n", + " colLabels=[\"Cluster ID\", \"Description\"],\n", + " cellColours=[[cluster_colors[cid]] * 2 for cid in cluster_df[\"Cluster ID\"]],\n", + " cellLoc=\"center\",\n", + " loc=\"center\",\n", + ")\n", + "\n", + "# Customize table appearance\n", + "table.auto_set_font_size(False)\n", + "table.set_fontsize(10)\n", + "table.scale(1, 1.5)\n", + "\n", + "plt.savefig(\n", + " \"figures/figure_pngs/GSFA_norman_clustermap_descriptions.png\",\n", + " dpi=600,\n", + " bbox_inches=\"tight\",\n", + ")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 250, + "id": "23b3a2e6-f20f-4626-bcc9-32e9e4ef0d54", + "metadata": {}, + "outputs": [], + "source": [ + "# save gprofiler results\n", + "with open(\n", + " \"figures/pert_embedding_cluster_gprofiler/GSFA_norman_pert_emb_gprofiler.pickle\",\n", + " \"wb\",\n", + ") as handle:\n", + " pickle.dump(cluster_process, handle, protocol=pickle.HIGHEST_PROTOCOL)" + ] + }, + { + "cell_type": "markdown", + "id": "dc06a81d-1fbe-4b46-82f2-0464e55727e7", + "metadata": {}, + "source": [ + "## PR Curve+AUC\n", + "- AUCPR using prior graph as binary label" + ] + }, + { + "cell_type": "code", + "execution_count": 237, + "id": "73b5c140-05f1-486b-9d28-7592ed1556bb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " AUC\n", + "StringDB 0.732776\n", + "CORUM 0.946837\n", + " F1 Threshold\n", + "StringDB 0.820312 1.0\n", + "CORUM 0.972222 1.0\n" + ] + } + ], + "source": [ + "aucpr_df, f1_df, pr_dict = auprc(distance_matrix)\n", + "print(aucpr_df)\n", + "print(f1_df)\n", + "aucpr_df.to_csv(\"figures/pert_embedding_recall/GSFA_norman_aucpr.csv\")\n", + "f1_df.to_csv(\"figures/pert_embedding_recall/GSFA_norman_f1.csv\")" + ] + }, + { + "cell_type": "markdown", + "id": "8fdff66a-8f20-4268-8520-d33a0fa4969a", + "metadata": {}, + "source": [ + "# Interpretability of our learned latent space" + ] + }, + { + "cell_type": "markdown", + "id": 
"1ec54f80-8a0f-41f0-8c55-edd2b1fbfb3c", + "metadata": {}, + "source": [ + "## Factor Enrichment\n", + "- GSEA on log transformed latent-by-gene factors\n", + "- Then associate each perturbation to its top latent factors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35eb8169-15b1-4b5b-b926-d5b7238f372a", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "factor_to_go = factor_enrichment_gsea(adata, W.T, fdr=5e-2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "814189ce-53f3-4716-834a-a4f0ea6efcf9", + "metadata": {}, + "outputs": [], + "source": [ + "# filter and add description to processes\n", + "go_df = read_aws_csv(\"s3://pert-spectra/references/GO_to_Description.txt\")\n", + "go_df.set_index(\"Term\", inplace=True)\n", + "go_dict = go_df.to_dict()[\"Description\"]\n", + "\n", + "filtered_factor_to_go = {}\n", + "for i in factor_to_go:\n", + " proc = factor_to_go[i]\n", + " proc[\"descr\"] = [go_dict[x] for x in proc[\"GO_ID\"]]\n", + " filtered_factor_to_go[i] = proc" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "id": "ff945301-46b0-4beb-9f4e-9bc17ede92d0", + "metadata": {}, + "outputs": [], + "source": [ + "with open(\n", + " \"figures/factor_enrichments/GSFA_norman_factor_enrichment.pickle\", \"wb\"\n", + ") as handle:\n", + " pickle.dump(filtered_factor_to_go, handle, protocol=pickle.HIGHEST_PROTOCOL)" + ] + }, + { + "cell_type": "markdown", + "id": "6e5f583b-5c2e-45ea-a210-33b0f6ab2b3e", + "metadata": {}, + "source": [ + "## Overlap with prior knowledge/ground truth (stringdb)\n", + "- Group A: a set of GO terms associated with a perturbation (either drivers from msigdb, or from literature) and its neighbors in stringdb\n", + "- Group B: a set of GO terms from the interpretability analysis\n", + "- Hypergeometric test on the overlap of the two groups" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "3f70b845-17f9-429e-9bdf-6d36d6fb0a44", + 
"metadata": {}, + "outputs": [], + "source": [ + "# load precomputed factor_to_go dict if available\n", + "filtered_factor_to_go = read_aws_pickle(\n", + " \"s3://pert-spectra/figures/factor_enrichments/GSFA_norman_factor_enrichment.pickle\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "0a0ca673-3177-412f-937d-f0ec50c3824f", + "metadata": {}, + "outputs": [], + "source": [ + "# get neighbors for each perturbation\n", + "pert_neighbors = retrieve_stringdb_neighbors(pert_labels)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "28d8b858-9326-489e-be3f-00a756eb3fb0", + "metadata": {}, + "outputs": [], + "source": [ + "# construct group A - known processes for each perturbation\n", + "gene_sets = read_aws_pickle(\"s3://pert-spectra/references/GO_to_Gene.pickle\")\n", + "# BP only go terms\n", + "go_reference = read_aws_csv(\"s3://pert-spectra/references/GO_terms.txt.gz\", zipped=True)\n", + "go_bp = go_reference[go_reference[\"go_category\"] == \"biological_process\"]\n", + "go_bp_ids = set(go_bp[\"go_id\"].values)\n", + "filtered_go_terms = {key: gene_sets[key] for key in go_bp_ids if key in gene_sets}\n", + "\n", + "# GO terms per perturbation AND its neighbors in stringdb\n", + "pert_to_go = {key: set() for key in df.index}\n", + "for goterm in filtered_go_terms:\n", + " for pert in df.index:\n", + " if pert in filtered_go_terms[goterm] and set(\n", + " filtered_go_terms[goterm]\n", + " ).intersection(pert_neighbors[pert]):\n", + " pert_to_go[pert].add(goterm)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "b3b81d9d-7264-4dac-b14b-1f35c84479c2", + "metadata": {}, + "outputs": [], + "source": [ + "# construct set B - model identified processes for each perturbation\n", + "n = 10 # number of top factors to get processes from\n", + "model_pert_to_go = {}\n", + "for pert in pert_emb_df.index:\n", + " if pert in [\"ctrl\", \"intergenic\", \"basal\"]:\n", + " continue\n", + " # get top factors\n", 
+ " pert_emb_df = pd.DataFrame(beta[:-1], index=pert_labels)\n", + " pert_loading = pert_emb_df.loc[pert].to_numpy()[1:].astype(float)\n", + " ctrl_loading = pert_emb_df.loc[\"ctrl\"].to_numpy()[1:].astype(float)\n", + " delta_loading = np.abs(np.log(np.abs(pert_loading)) - np.log(np.abs(ctrl_loading)))\n", + " top_n_factors = np.argpartition(np.array(delta_loading), -n)[-n:]\n", + " # get processes\n", + " model_processes = set()\n", + " for f in top_n_factors:\n", + " proc = filtered_factor_to_go[f]\n", + " model_processes = model_processes.union(set(proc[\"Term\"]))\n", + " model_pert_to_go[pert] = model_processes" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "f3a8ecfa-1184-4a30-b787-1ce5e56a5562", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overlap for AHR: 5 out of 12 in researchDB\n", + "P-value for AHR: 2.1319858454369287e-07\n", + "Overlap for ARID1A: 5 out of 16 in researchDB\n", + "P-value for ARID1A: 2.2083684093482343e-06\n", + "Overlap for BAK1: 5 out of 43 in researchDB\n", + "P-value for BAK1: 0.00046615592180678417\n", + "Overlap for BCL2L11: 3 out of 29 in researchDB\n", + "P-value for BCL2L11: 0.008310750164771933\n", + "Overlap for CBL: 5 out of 28 in researchDB\n", + "P-value for CBL: 0.00014285653454198518\n", + "Overlap for CDKN1A: 3 out of 37 in researchDB\n", + "P-value for CDKN1A: 0.0090102759706699\n", + "Overlap for CDKN1B: 2 out of 38 in researchDB\n", + "P-value for CDKN1B: 0.08621078058192475\n", + "Overlap for CDKN1C: 3 out of 15 in researchDB\n", + "P-value for CDKN1C: 0.0008590529302742615\n", + "Overlap for CEBPA: 10 out of 34 in researchDB\n", + "P-value for CEBPA: 9.605689713987963e-11\n", + "Overlap for CEBPB: 8 out of 33 in researchDB\n", + "P-value for CEBPB: 3.5589724354327065e-08\n", + "Overlap for CEBPE: 5 out of 9 in researchDB\n", + "P-value for CEBPE: 9.287161274422125e-08\n", + "Overlap for CITED1: 6 out of 14 in 
researchDB\n", + "P-value for CITED1: 1.0848735073457748e-08\n", + "Overlap for COL1A1: 3 out of 29 in researchDB\n", + "P-value for COL1A1: 0.005941572402352278\n", + "Overlap for COL2A1: 0 out of 12 in researchDB\n", + "P-value for COL2A1: 1.0\n", + "Overlap for DUSP9: 0 out of 5 in researchDB\n", + "P-value for DUSP9: 1.0\n", + "Overlap for EGR1: 9 out of 18 in researchDB\n", + "P-value for EGR1: 6.076427546891436e-13\n", + "Overlap for ETS2: 4 out of 5 in researchDB\n", + "P-value for ETS2: 1.2643534179980056e-07\n", + "Overlap for FOSB: 3 out of 11 in researchDB\n", + "P-value for FOSB: 0.0005638568648125962\n", + "Overlap for FOXA1: 7 out of 15 in researchDB\n", + "P-value for FOXA1: 1.0193815225213786e-09\n", + "Overlap for FOXL2: 4 out of 7 in researchDB\n", + "P-value for FOXL2: 1.5792583093509072e-06\n", + "Overlap for FOXO4: 4 out of 8 in researchDB\n", + "P-value for FOXO4: 3.8416764921953e-06\n", + "Overlap for HK2: 1 out of 13 in researchDB\n", + "P-value for HK2: 0.15293459959857644\n", + "Overlap for HNF4A: 6 out of 18 in researchDB\n", + "P-value for HNF4A: 2.0064461479781388e-07\n", + "Overlap for IKZF3: 2 out of 6 in researchDB\n", + "P-value for IKZF3: 0.0033188033960969143\n", + "Overlap for IRF1: 4 out of 19 in researchDB\n", + "P-value for IRF1: 0.00011774780147504422\n", + "Overlap for JUN: 9 out of 38 in researchDB\n", + "P-value for JUN: 6.800257732829314e-10\n", + "Overlap for KIF18B: 4 out of 5 in researchDB\n", + "P-value for KIF18B: 1.02966073190364e-07\n", + "Overlap for KIF2C: 1 out of 8 in researchDB\n", + "P-value for KIF2C: 0.11717918583009142\n", + "Overlap for KMT2A: 4 out of 14 in researchDB\n", + "P-value for KMT2A: 5.737318595565771e-05\n", + "Overlap for LHX1: 3 out of 15 in researchDB\n", + "P-value for LHX1: 0.001550750199766002\n", + "Overlap for MAP2K3: 5 out of 18 in researchDB\n", + "P-value for MAP2K3: 6.727417522597989e-06\n", + "Overlap for MAP2K6: 2 out of 21 in researchDB\n", + "P-value for MAP2K6: 
0.027671721378544775\n", + "Overlap for MAPK1: 9 out of 55 in researchDB\n", + "P-value for MAPK1: 1.2457826721768169e-08\n", + "Overlap for MEIS1: 2 out of 11 in researchDB\n", + "P-value for MEIS1: 0.011117126714865002\n", + "Overlap for NCL: 3 out of 8 in researchDB\n", + "P-value for NCL: 0.0002326125897755479\n", + "Overlap for PLK4: 1 out of 5 in researchDB\n", + "P-value for PLK4: 0.059561646093364874\n", + "Overlap for POU3F2: 2 out of 9 in researchDB\n", + "P-value for POU3F2: 0.008122844993866516\n", + "Overlap for PTPN1: 1 out of 14 in researchDB\n", + "P-value for PTPN1: 0.1896342601320953\n", + "Overlap for PTPN12: 0 out of 5 in researchDB\n", + "P-value for PTPN12: 1.0\n", + "Overlap for S1PR2: 0 out of 5 in researchDB\n", + "P-value for S1PR2: 1.0\n", + "Overlap for SGK1: 7 out of 17 in researchDB\n", + "P-value for SGK1: 4.12440746718372e-09\n", + "Overlap for SLC38A2: 1 out of 11 in researchDB\n", + "P-value for SLC38A2: 0.12020964276782159\n", + "Overlap for SNAI1: 3 out of 13 in researchDB\n", + "P-value for SNAI1: 0.0005804369395604471\n", + "Overlap for SPI1: 12 out of 25 in researchDB\n", + "P-value for SPI1: 8.848408979799951e-16\n", + "Overlap for STIL: 1 out of 13 in researchDB\n", + "P-value for STIL: 0.15204982937479228\n", + "Overlap for TBX2: 4 out of 8 in researchDB\n", + "P-value for TBX2: 1.9410436163852128e-06\n", + "Overlap for TBX3: 4 out of 14 in researchDB\n", + "P-value for TBX3: 2.0954324972493857e-05\n", + "Overlap for TGFBR2: 3 out of 55 in researchDB\n", + "P-value for TGFBR2: 0.020165641355063494\n", + "Overlap for TP73: 4 out of 17 in researchDB\n", + "P-value for TP73: 5.875369519782811e-05\n" + ] + } + ], + "source": [ + "pvals = perturbation_signal_recovery(\n", + " pert_to_go,\n", + " model_pert_to_go,\n", + " list(filtered_go_terms.keys()),\n", + " list(pert_emb_df.index),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "712834c7-e823-4117-a2f6-dc704c391d7c", + "metadata": {}, + 
"outputs": [], + "source": [ + "# save as csv for visualization\n", + "pd.DataFrame.from_dict(data=pvals, orient=\"index\").to_csv(\n", + " \"figures/process_recovery_hypergeo_pvals/GSFA_norman_hypergeo_neighbors_recovery_pvalues.csv\",\n", + " header=False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cde32c38-f16b-4f39-9dcd-5e5f480b3720", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pertspectra", + "language": "python", + "name": "pertspectra" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/norman_pertspectra_metrics.ipynb b/norman_pertspectra_metrics.ipynb new file mode 100644 index 0000000..317909c --- /dev/null +++ b/norman_pertspectra_metrics.ipynb @@ -0,0 +1,796 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "b8270f66-7a27-4263-aa97-25abeb2cf34d", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2 " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "529f64f3-c83e-481d-872c-df89b9dd063b", + "metadata": {}, + "outputs": [], + "source": [ + "import pickle\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "\n", + "from utils import (\n", + " auprc,\n", + " factor_enrichment_gsea,\n", + " generate_k_fold,\n", + " get_gprofiler,\n", + " perturbation_signal_recovery,\n", + " read_aws_csv,\n", + " read_aws_h5ad,\n", + " read_aws_pickle,\n", + " retrieve_stringdb_neighbors,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e6ad83c5-40dc-4f4e-8412-3e8fcdf847fa", + "metadata": {}, + "outputs": [], 
+ "source": [ + "# read in trained model outputs generated from ./PertSpectra_load_checkpoints/pertspectra_norman.ipynb\n", + "adata = read_aws_h5ad(\n", + " \"s3://pert-spectra/PertSpectra_checkpoints/pertspectra_norman/fold_0.h5ad\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "5b7bf773-5e0c-43b8-8df2-4ce3376be959", + "metadata": {}, + "source": [ + "# Reconstruction" + ] + }, + { + "cell_type": "markdown", + "id": "3b4b092b-2120-48c6-a492-84656e8b7169", + "metadata": {}, + "source": [ + "## Spearman Coefficient\n", + "- Spearman correlation between predicted and observed expression of test set\n", + "- Compute correlation for the mean expression aggregated on perturbation, across all kfolds" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "e2885bd3-5433-47d6-88d5-47be83df5b5a", + "metadata": {}, + "outputs": [], + "source": [ + "n_folds = 5\n", + "model_adatas = []\n", + "for n in range(0, n_folds):\n", + " # new adata\n", + " adata_n = adata.copy()\n", + " # load model from checkpoint\n", + " s3_dir = \"s3://pert-spectra/PertSpectra_checkpoints/\"\n", + " experiment_name = \"pertspectra_norman/\"\n", + " model_name = f\"fold_{n}.h5ad\"\n", + " m_adata = read_aws_h5ad(s3_dir + experiment_name + model_name)\n", + " model_adatas.append(m_adata)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "c6d0cd6c-c29b-4fa1-98f2-e31e7f5eefa1", + "metadata": {}, + "outputs": [], + "source": [ + "# iterate through all models and get the losses and correlations\n", + "# take the mean loss and correlation for the test set\n", + "from scipy.stats import spearmanr\n", + "\n", + "test_corr_singles = []\n", + "test_corr_combos = []\n", + "\n", + "\n", + "for n in range(n_folds):\n", + " adata_n = model_adatas[n]\n", + " train_idx, val_idx, test_idx = generate_k_fold(\n", + " adata_n,\n", + " adata_n.X,\n", + " adata_n.obs[\"perturbation_name\"],\n", + " fold_idx=n,\n", + " perturbation_key=\"perturbation_name\",\n", + " )\n", + " 
loss_weights = np.ones(adata_n.shape[0])\n", + " adata_test = adata_n[test_idx]\n", + "\n", + " for pert in adata_test.obs[\"perturbation_name\"].unique():\n", + " hold_idx = [\n", + " i for i, x in enumerate(adata_test.obs[\"perturbation_name\"]) if x == pert\n", + " ]\n", + " recon = adata_test[hold_idx].uns[\"recon\"]\n", + " # correlation\n", + " mean_reconstruction = recon.mean(axis=0)\n", + " mean_observed = np.squeeze(np.array(adata_test[hold_idx].X.mean(axis=0)))\n", + " if (\"+\" in pert) and (\"ctrl\" not in pert):\n", + " test_corr_combos.append(\n", + " [pert, spearmanr(mean_reconstruction, mean_observed)[0], n]\n", + " )\n", + " else:\n", + " test_corr_singles.append(\n", + " [pert, spearmanr(mean_reconstruction, mean_observed)[0], n]\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "c46c85fa-1d93-4baf-9226-6b4423a8c054", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# plot the distribution of test-set Spearman correlations (singles and combos pooled)\n", + "test_corr = np.array(test_corr_singles + test_corr_combos)[:, 1].astype(float)\n", + "sns.histplot(test_corr, label=\"test_corr\")\n", + "plt.title(\"Correlation between predicted and observed expression per perturbation\")\n", + "plt.xlabel(\"Spearman Correlation\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "f5ff3e3c-db63-41e3-be86-e2b45ef46b1e", + "metadata": {}, + "outputs": [], + "source": [ + "np.savetxt(\n", + " \"figures/reconstruction_spearmans/pertspectra_norman_spearman_correlations.csv\",\n", + " test_corr,\n", + " delimiter=\",\",\n", + " fmt=\"%.2f\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "64fc0b35-f87e-4d8a-87f1-e6080a3ee83f", + "metadata": {}, + "source": [ + "# Perturbation Embedding Heatmap" + ] + }, + { + "cell_type": "markdown", + "id": "c36ebe2b-2835-44ed-9f71-a52b522125b1", + "metadata": {}, + "source": [ + "## Hierarchical Clustering + Enrichment\n", + "- Perform enrichment tests (gprofiler) on hierarchical clustering of perturbation embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebf9b816-2a19-44f0-ae84-ab1e041edc65", + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.cluster.hierarchy import dendrogram, fcluster, linkage\n", + "from scipy.spatial.distance import pdist, squareform\n", + "\n", + "# Compute the pairwise distances\n", + "pert_embeddings_df = pd.DataFrame(\n", + " adata.uns[\"SPECTRA_pert_scores\"], index=adata.uns[\"Spectra_pert_labels\"]\n", + ")\n", + "df = pert_embeddings_df.drop(index=[\"basal\", \"ctrl\"])\n", + "\n", + "distance = \"euclidean\"\n", + "distances = pdist(df.values, metric=distance)\n", + "\n", + "# Convert the distances into a square distance matrix\n", + "distance_matrix = pd.DataFrame(squareform(distances), index=df.index,
columns=df.index)\n", + "# distance_matrix = (distance_matrix - distance_matrix.min()) / (distance_matrix.max() - distance_matrix.min())\n", + "\n", + "# Compute the Ward linkage matrix. NOTE(review): this passes the square distance_matrix, not the condensed `distances` from pdist; SciPy treats a 2-D input as observation vectors - confirm this is intended\n", + "linkage_matrix = linkage(distance_matrix, method=\"ward\")\n", + "\n", + "# Plot the dendrogram (optional)\n", + "dendro = dendrogram(linkage_matrix, no_plot=True)\n", + "\n", + "# Assign clusters\n", + "max_d = 0.09\n", + "clusters = fcluster(linkage_matrix, max_d, criterion=\"distance\")\n", + "\n", + "# Create a color palette for clusters\n", + "palette = sns.color_palette(\"husl\", len(np.unique(clusters)))\n", + "\n", + "# Map each cluster id to a color\n", + "cluster_colors = [palette[i - 1] for i in clusters]\n", + "\n", + "# Show the plot\n", + "plt.show()\n", + "\n", + "clustermap = sns.clustermap(\n", + " distance_matrix, cmap=\"viridis_r\", row_colors=cluster_colors\n", + ")\n", + "clustermap.fig.suptitle(f\"Pairwise {distance} distance of perturbation latent vectors\")\n", + "# clustermap.ax_row_dendrogram.set_visible(False)\n", + "clustermap.ax_col_dendrogram.set_visible(False)\n", + "clustermap.cax.set_visible(False)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "2fabc196-a8ff-4660-af5a-0d2b8ebb1804", + "metadata": {}, + "outputs": [], + "source": [ + "# extract clusters and perform gprofiler\n", + "index_names = list(distance_matrix.index)\n", + "cluster_df = pd.DataFrame({\"Pert\": index_names, \"Cluster\": clusters})\n", + "cluster_process = {}\n", + "for cluster_id in np.unique(clusters):\n", + " points_in_cluster = cluster_df[cluster_df[\"Cluster\"] == cluster_id][\"Pert\"].tolist()\n", + " gprofiler_in = pd.DataFrame(points_in_cluster, columns=[\"gene_symbol\"])\n", + " res = get_gprofiler(gprofiler_in)\n", + " cluster_process[cluster_id] = res[res[\"p_value\"] <= 0.05]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1d6d8eae-7390-49c1-bc1c-62b35882406e", +
"metadata": {}, + "outputs": [], + "source": [ + "# Create descriptions for each cluster (for demonstration purpose)\n", + "cluster_descriptions = {\n", + " 1: \"Developmental processes, kinase activity\",\n", + " 2: \"Tyrosine phosphatase activity\",\n", + " 3: \"Transcription factor activity, DNA binding\",\n", + " 4: \"Embryonic development\",\n", + " 5: \"No signficant gprofiler hits\",\n", + " 6: \"Kinase inhibitor activity\",\n", + " 7: \"No signficant gprofiler hits\",\n", + " 8: \"Embryonic morphogenesis\",\n", + " 9: \"DNA-binding transcription activator\",\n", + " 10: \"RNA polymerase II transcription regulation\",\n", + " 11: \"C/EBP complex\",\n", + "}\n", + "\n", + "# Create a consistent color palette\n", + "unique_clusters = sorted(np.unique(clusters))\n", + "palette = sns.color_palette(\"husl\", len(unique_clusters))\n", + "cluster_colors = [palette[i - 1] for i in clusters]\n", + "cluster_colors = {cid: palette[i] for i, cid in enumerate(unique_clusters)}\n", + "\n", + "# Create clustermap using consistent colors\n", + "row_colors = [cluster_colors[cid] for cid in clusters]\n", + "clustermap = sns.clustermap(\n", + " distance_matrix, cmap=\"viridis_r\", row_colors=[palette[i - 1] for i in clusters]\n", + ")\n", + "clustermap.ax_row_dendrogram.set_visible(False)\n", + "# clustermap.ax_col_dendrogram.set_visible(False)\n", + "clustermap.savefig(\n", + " \"figures/figure_pngs/pertspectra_norman_clustermap.png\",\n", + " dpi=600,\n", + " bbox_inches=\"tight\",\n", + ")\n", + "\n", + "# Prepare data for the table\n", + "cluster_data = {\n", + " \"Cluster ID\": unique_clusters,\n", + " \"Description\": [cluster_descriptions[cid] for cid in unique_clusters],\n", + "}\n", + "\n", + "cluster_df = pd.DataFrame(cluster_data)\n", + "\n", + "# Plot the table with colors matching the clustermap\n", + "fig, ax = plt.subplots(figsize=(12, 4))\n", + "ax.axis(\"off\")\n", + "ax.axis(\"tight\")\n", + "\n", + "# Create table while applying the color to the row 
background\n", + "table = ax.table(\n", + " cellText=cluster_df.values,\n", + " colLabels=[\"Cluster ID\", \"Description\"],\n", + " cellColours=[[cluster_colors[cid]] * 2 for cid in cluster_df[\"Cluster ID\"]],\n", + " cellLoc=\"center\",\n", + " loc=\"center\",\n", + ")\n", + "\n", + "# Customize table appearance\n", + "table.auto_set_font_size(False)\n", + "table.set_fontsize(10)\n", + "table.scale(1, 1.5)\n", + "\n", + "plt.savefig(\n", + " \"figures/figure_pngs/pertspectra_norman_clustermap_descriptions.png\",\n", + " dpi=600,\n", + " bbox_inches=\"tight\",\n", + ")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 181, + "id": "ee08b960-8bb8-4013-ac51-15196d579af7", + "metadata": {}, + "outputs": [], + "source": [ + "# save\n", + "with open(\n", + " \"figures/pert_embedding_cluster_gprofiler/MODEL_norman_pert_emb_gprofiler.pickle\",\n", + " \"wb\",\n", + ") as handle:\n", + " pickle.dump(cluster_process, handle, protocol=pickle.HIGHEST_PROTOCOL)" + ] + }, + { + "cell_type": "markdown", + "id": "efbb23aa-4769-446b-a189-1106752453b3", + "metadata": {}, + "source": [ + "## PR Curve+AUC\n", + "- AUCPR using prior graph as binary label" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "33be8fb8-a88c-4718-a38e-7569aa58740d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " AUC\n", + "StringDB 0.753048\n", + "CORUM 0.946676\n", + " F1 Threshold\n", + "StringDB 0.829457 0.973018\n", + "CORUM 0.972222 1.000000\n" + ] + } + ], + "source": [ + "aucpr_df, f1_df, pr_dict = auprc(distance_matrix)\n", + "print(aucpr_df)\n", + "print(f1_df)\n", + "aucpr_df.to_csv(\"figures/pert_embedding_recall/pertspectra_norman_aucpr.csv\")\n", + "f1_df.to_csv(\"figures/pert_embedding_recall/pertspectra_norman_f1.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "7cd2b514-5abb-4060-b0e0-a7a854ccf942", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", 
+ "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "prior = \"StringDB\"\n", + "sns.set(style=\"whitegrid\")\n", + "\n", + "plt.figure(figsize=(8, 6))\n", + "sns.lineplot(\n", + " x=pr_dict[prior][\"recall\"],\n", + " y=pr_dict[prior][\"precision\"],\n", + " marker=\"o\",\n", + " label=\"Precision-Recall Curve\",\n", + ")\n", + "\n", + "# Add labels and title\n", + "plt.xlabel(\"Recall\")\n", + "plt.ylabel(\"Precision\")\n", + "plt.title(\"Norman: StringDB Precision-Recall Curve for PertSpectra\")\n", + "plt.legend().set_visible(False)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "8fdff66a-8f20-4268-8520-d33a0fa4969a", + "metadata": {}, + "source": [ + "# Interpretability of our learned latent space" + ] + }, + { + "cell_type": "markdown", + "id": "1ec54f80-8a0f-41f0-8c55-edd2b1fbfb3c", + "metadata": {}, + "source": [ + "## Factor Enrichment\n", + "- GSEA on log transformed latent-by-gene factors\n", + "- Then associate each perturbation to its top latent factors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35eb8169-15b1-4b5b-b926-d5b7238f372a", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-12-13 18:26:07,684 [INFO] Parsing data files for GSEA.............................\n", + "2024-12-13 18:26:07,780 [INFO] 17889 gene_sets have been filtered out when max_size=300 and min_size=10\n", + "2024-12-13 18:26:07,781 [INFO] 1096 gene_sets used for further statistical testing.....\n", + "2024-12-13 18:26:07,781 [INFO] Start to run GSEA...Might take a while..................\n", + "2024-12-13 18:26:35,994 [INFO] Congratulations. 
GSEApy runs successfully................\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "163\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-12-13 18:26:38,066 [INFO] Parsing data files for GSEA.............................\n", + "2024-12-13 18:26:38,167 [INFO] 17889 gene_sets have been filtered out when max_size=300 and min_size=10\n", + "2024-12-13 18:26:38,168 [INFO] 1096 gene_sets used for further statistical testing.....\n", + "2024-12-13 18:26:38,169 [INFO] Start to run GSEA...Might take a while..................\n", + "2024-12-13 18:27:06,036 [INFO] Congratulations. GSEApy runs successfully................\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "298\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-12-13 18:27:08,089 [WARNING] Duplicated values found in preranked stats: 0.02% of genes\n", + "The order of those genes will be arbitrary, which may produce unexpected results.\n", + "2024-12-13 18:27:08,090 [INFO] Parsing data files for GSEA.............................\n", + "2024-12-13 18:27:08,186 [INFO] 17889 gene_sets have been filtered out when max_size=300 and min_size=10\n", + "2024-12-13 18:27:08,187 [INFO] 1096 gene_sets used for further statistical testing.....\n" + ] + } + ], + "source": [ + "factor_to_go = factor_enrichment_gsea(adata, adata.uns[\"SPECTRA_factors\"], fdr=5e-2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "814189ce-53f3-4716-834a-a4f0ea6efcf9", + "metadata": {}, + "outputs": [], + "source": [ + "# filter and add description to processes\n", + "go_df = read_aws_csv(\"s3://pert-spectra/references/GO_to_Description.txt\")\n", + "go_df.set_index(\"Term\", inplace=True)\n", + "go_dict = go_df.to_dict()[\"Description\"]\n", + "\n", + "filtered_factor_to_go = {}\n", + "for i in factor_to_go:\n", + " proc = factor_to_go[i]\n", + " proc[\"descr\"] = [go_dict[x] for x 
in proc[\"GO_ID\"]]\n", + " filtered_factor_to_go[i] = proc" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc5264d2-0934-40e3-a209-7e318a46a583", + "metadata": {}, + "outputs": [], + "source": [ + "with open(\n", + " \"figures/factor_enrichments/pertspectra_norman_factor_enrichment.pickle\", \"wb\"\n", + ") as handle:\n", + " pickle.dump(filtered_factor_to_go, handle, protocol=pickle.HIGHEST_PROTOCOL)" + ] + }, + { + "cell_type": "markdown", + "id": "b6af72e5-1c50-48d1-8e59-9702b8b3a85d", + "metadata": {}, + "source": [ + "## Proposed Metric 2 continued: Overlap with prior knowledge/ground truth (stringdb)\n", + "- Group A: a set of GO terms associated with a perturbation (either drivers from msigdb, or from literature) and its neighbors in stringdb\n", + "- Group B: a set of GO terms from the interpretability analysis\n", + "- Hypergeometric test on the overlap of the two groups" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "f5b8dcc5-32bf-42d3-9ce9-3f1bbe5bf530", + "metadata": {}, + "outputs": [], + "source": [ + "# load precomputed factor_to_go dict if available\n", + "filtered_factor_to_go = read_aws_pickle(\n", + " \"s3://pert-spectra/figures/factor_enrichments/pertspectra_norman_factor_enrichment.pickle\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "31343240-e846-45a8-89f9-0734617a8b09", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# get neighbors for each perturbation\n", + "pert_neighbors = retrieve_stringdb_neighbors(adata.uns[\"Spectra_pert_labels\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "8bfb16c9-bc3d-4942-a10a-e889ee39f2a4", + "metadata": {}, + "outputs": [], + "source": [ + "# construct group A - known processes for each perturbation\n", + "gene_sets = read_aws_pickle(\"s3://pert-spectra/references/GO_to_Gene.pickle\")\n", + "# BP only go terms\n", + "go_reference = 
read_aws_csv(\"s3://pert-spectra/references/GO_terms.txt.gz\", zipped=True)\n", + "go_bp = go_reference[go_reference[\"go_category\"] == \"biological_process\"]\n", + "go_bp_ids = set(go_bp[\"go_id\"].values)\n", + "filtered_go_terms = {key: gene_sets[key] for key in go_bp_ids if key in gene_sets}\n", + "\n", + "# GO terms per perturbation AND its neighbors in stringdb\n", + "pert_to_go = {key: set() for key in adata.uns[\"Spectra_pert_labels\"]}\n", + "for goterm in filtered_go_terms:\n", + " for pert in adata.uns[\"Spectra_pert_labels\"]:\n", + " if pert in filtered_go_terms[goterm] and set(\n", + " filtered_go_terms[goterm]\n", + " ).intersection(pert_neighbors[pert]):\n", + " pert_to_go[pert].add(goterm)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "b2190d7b-8c1d-4fdb-9ed5-221e715ae099", + "metadata": {}, + "outputs": [], + "source": [ + "# construct set B - model identified processes for each perturbation\n", + "n = 10 # number of top factors to get processes from\n", + "model_pert_to_go = {}\n", + "for pert in adata.uns[\"Spectra_pert_labels\"]:\n", + " if pert in [\"ctrl\", \"intergenic\", \"basal\"]:\n", + " continue\n", + " # get top factors\n", + " pert_embeddings_df = pd.DataFrame(\n", + " adata.uns[\"SPECTRA_pert_scores\"], index=adata.uns[\"Spectra_pert_labels\"]\n", + " )\n", + " pert_loading = pert_embeddings_df.loc[pert]\n", + " ctrl_loading = pert_embeddings_df.loc[\"ctrl\"]\n", + " delta_loading = np.abs(np.log(pert_loading) - np.log(ctrl_loading))\n", + " top_n_factors = np.argpartition(np.array(delta_loading), -n)[-n:]\n", + " # get processes\n", + " model_processes = set()\n", + " for f in top_n_factors:\n", + " proc = filtered_factor_to_go[f]\n", + " model_processes = model_processes.union(set(proc[\"Term\"]))\n", + " model_pert_to_go[pert] = model_processes" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "daa45a87-632e-46f7-8caf-c1de5463c5dc", + "metadata": { + "scrolled": true + }, + "outputs": 
[ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overlap for AHR: 8 out of 12 in researchDB\n", + "P-value for AHR: 4.845823421588032e-12\n", + "Overlap for ARID1A: 1 out of 16 in researchDB\n", + "P-value for ARID1A: 0.18058513844900828\n", + "Overlap for BAK1: 4 out of 43 in researchDB\n", + "P-value for BAK1: 8.604968054508552e-05\n", + "Overlap for BCL2L11: 5 out of 29 in researchDB\n", + "P-value for BCL2L11: 3.511897516201057e-05\n", + "Overlap for CBL: 4 out of 28 in researchDB\n", + "P-value for CBL: 0.00015631895405348394\n", + "Overlap for CDKN1A: 15 out of 37 in researchDB\n", + "P-value for CDKN1A: 9.94025808969477e-20\n", + "Overlap for CDKN1B: 13 out of 38 in researchDB\n", + "P-value for CDKN1B: 4.2909462790682503e-16\n", + "Overlap for CDKN1C: 4 out of 15 in researchDB\n", + "P-value for CDKN1C: 2.365186115944503e-05\n", + "Overlap for CEBPA: 7 out of 34 in researchDB\n", + "P-value for CEBPA: 2.2822948081394184e-08\n", + "Overlap for CEBPB: 10 out of 33 in researchDB\n", + "P-value for CEBPB: 5.670390667165458e-13\n", + "Overlap for CEBPE: 2 out of 9 in researchDB\n", + "P-value for CEBPE: 0.0017532384752871838\n", + "Overlap for CITED1: 7 out of 14 in researchDB\n", + "P-value for CITED1: 1.7522486320600642e-10\n", + "Overlap for COL1A1: 10 out of 29 in researchDB\n", + "P-value for COL1A1: 1.171842110722407e-11\n", + "Overlap for COL2A1: 1 out of 12 in researchDB\n", + "P-value for COL2A1: 0.06012023574110627\n", + "Overlap for DUSP9: 2 out of 5 in researchDB\n", + "P-value for DUSP9: 0.00153909025679683\n", + "Overlap for EGR1: 7 out of 18 in researchDB\n", + "P-value for EGR1: 4.6232048631669204e-10\n", + "Overlap for ETS2: 0 out of 5 in researchDB\n", + "P-value for ETS2: 1.0\n", + "Overlap for FOSB: 8 out of 11 in researchDB\n", + "P-value for FOSB: 2.821729541176879e-10\n", + "Overlap for FOXA1: 7 out of 15 in researchDB\n", + "P-value for FOXA1: 2.2846765707660486e-10\n", + "Overlap for FOXL2: 4 out of 7 in researchDB\n", 
+ "P-value for FOXL2: 1.9453990596871714e-07\n", + "Overlap for FOXO4: 3 out of 8 in researchDB\n", + "P-value for FOXO4: 8.312021414586989e-05\n", + "Overlap for HK2: 0 out of 13 in researchDB\n", + "P-value for HK2: 1.0\n", + "Overlap for HNF4A: 8 out of 18 in researchDB\n", + "P-value for HNF4A: 2.964510745573481e-11\n", + "Overlap for IKZF3: 2 out of 6 in researchDB\n", + "P-value for IKZF3: 0.0011041577637246342\n", + "Overlap for IRF1: 7 out of 19 in researchDB\n", + "P-value for IRF1: 3.2140371659103896e-11\n", + "Overlap for JUN: 14 out of 38 in researchDB\n", + "P-value for JUN: 3.3731394971336345e-20\n", + "Overlap for KIF18B: 2 out of 5 in researchDB\n", + "P-value for KIF18B: 0.0009097386111233253\n", + "Overlap for KIF2C: 2 out of 8 in researchDB\n", + "P-value for KIF2C: 0.006701346521695229\n", + "Overlap for KMT2A: 4 out of 14 in researchDB\n", + "P-value for KMT2A: 1.0435444603803768e-06\n", + "Overlap for LHX1: 4 out of 15 in researchDB\n", + "P-value for LHX1: 6.9312991269755605e-06\n", + "Overlap for MAP2K3: 14 out of 18 in researchDB\n", + "P-value for MAP2K3: 1.6969977201863256e-18\n", + "Overlap for MAP2K6: 13 out of 21 in researchDB\n", + "P-value for MAP2K6: 3.9511035294897324e-15\n", + "Overlap for MAPK1: 6 out of 55 in researchDB\n", + "P-value for MAPK1: 9.632130141049859e-06\n", + "Overlap for MEIS1: 3 out of 11 in researchDB\n", + "P-value for MEIS1: 0.00011537613165619079\n", + "Overlap for NCL: 2 out of 8 in researchDB\n", + "P-value for NCL: 0.004359967761603796\n", + "Overlap for PLK4: 1 out of 5 in researchDB\n", + "P-value for PLK4: 0.05842913374150724\n", + "Overlap for POU3F2: 2 out of 9 in researchDB\n", + "P-value for POU3F2: 0.0021589123872628973\n", + "Overlap for PTPN1: 1 out of 14 in researchDB\n", + "P-value for PTPN1: 0.13016740932186296\n", + "Overlap for PTPN12: 1 out of 5 in researchDB\n", + "P-value for PTPN12: 0.03477850399423442\n", + "Overlap for S1PR2: 1 out of 5 in researchDB\n", + "P-value for S1PR2: 
0.05691742035245258\n", + "Overlap for SGK1: 6 out of 17 in researchDB\n", + "P-value for SGK1: 3.3205018690768594e-08\n", + "Overlap for SLC38A2: 0 out of 11 in researchDB\n", + "P-value for SLC38A2: 1.0\n", + "Overlap for SNAI1: 3 out of 13 in researchDB\n", + "P-value for SNAI1: 0.0001730097629803037\n", + "Overlap for SPI1: 5 out of 25 in researchDB\n", + "P-value for SPI1: 3.2646388203028726e-06\n", + "Overlap for STIL: 3 out of 13 in researchDB\n", + "P-value for STIL: 0.001438174753528276\n", + "Overlap for TBX2: 4 out of 8 in researchDB\n", + "P-value for TBX2: 4.942710375809906e-07\n", + "Overlap for TBX3: 6 out of 14 in researchDB\n", + "P-value for TBX3: 1.0268128265777117e-09\n", + "Overlap for TGFBR2: 13 out of 55 in researchDB\n", + "P-value for TGFBR2: 2.400695076793383e-16\n", + "Overlap for TP73: 10 out of 17 in researchDB\n", + "P-value for TP73: 3.454158558799769e-16\n" + ] + } + ], + "source": [ + "pvals = perturbation_signal_recovery(\n", + " pert_to_go,\n", + " model_pert_to_go,\n", + " list(filtered_go_terms.keys()),\n", + " list(adata.uns[\"Spectra_pert_labels\"]),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "85bc6476-2818-40b0-b187-02671693cb8f", + "metadata": {}, + "outputs": [], + "source": [ + "# save as csv for visualization\n", + "pd.DataFrame.from_dict(data=pvals, orient=\"index\").to_csv(\n", + " \"figures/process_recovery_hypergeo_pvals/pertspectra_norman_hypergeo_neighbors_recovery_pvalues.csv\",\n", + " header=False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78d8aaed-10e6-49fd-a838-70839e8b2561", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pertspectra", + "language": "python", + "name": "pertspectra" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": 
"python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/norman_scETM_metrics.ipynb b/norman_scETM_metrics.ipynb new file mode 100644 index 0000000..8cc942b --- /dev/null +++ b/norman_scETM_metrics.ipynb @@ -0,0 +1,506 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 12, + "id": "b8270f66-7a27-4263-aa97-25abeb2cf34d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2 " + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "529f64f3-c83e-481d-872c-df89b9dd063b", + "metadata": {}, + "outputs": [], + "source": [ + "import pickle\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from utils import (\n", + " auprc,\n", + " factor_enrichment_gsea,\n", + " get_gprofiler,\n", + " read_aws_csv,\n", + " read_aws_h5ad,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "00d0db8a-5c3d-4e9f-bce0-46c90135cab4", + "metadata": {}, + "outputs": [], + "source": [ + "# read in scETM results\n", + "adata = read_aws_h5ad(\"s3://pert-spectra/scETM_checkpoints/scetm_norman/fold_0.h5ad\")\n", + "# read in pertspectra results to retrieve gene labels\n", + "ref_adata = read_aws_h5ad(\n", + " \"s3://pert-spectra/PertSpectra_checkpoints/pertspectra_norman/fold_0.h5ad\"\n", + ")\n", + "adata.var_names = ref_adata.var_names" + ] + }, + { + "cell_type": "markdown", + "id": "5b7bf773-5e0c-43b8-8df2-4ce3376be959", + "metadata": {}, + "source": [ + "# Reconstruction" + ] + }, + { + "cell_type": "markdown", + "id": "3b4b092b-2120-48c6-a492-84656e8b7169", + "metadata": {}, + "source": [ + "## Spearman Coefficient\n", + "- Spearman correlation between predicted and observed expression (on DE 
genes if available)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e2885bd3-5433-47d6-88d5-47be83df5b5a", + "metadata": {}, + "outputs": [], + "source": [ + "n_folds = 5\n", + "model_adatas = []\n", + "for n in range(0, n_folds):\n", + " # new adata\n", + " adata_n = adata.copy()\n", + " # load model from checkpoint\n", + " s3_dir = \"s3://pert-spectra/scETM_checkpoints/\"\n", + " experiment_name = \"scetm_norman/\"\n", + " model_name = f\"fold_{n}.h5ad\"\n", + " m_adata = read_aws_h5ad(s3_dir + experiment_name + model_name)\n", + " model_adatas.append(m_adata)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "cfe07156-f95b-481e-a1ac-495e69f7e954", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0\n", + "(18547, 4990)\n", + "test\n", + "1\n", + "(20156, 4990)\n", + "test\n", + "2\n", + "(21410, 4990)\n", + "test\n", + "3\n", + "(20621, 4990)\n", + "test\n", + "4\n", + "(20752, 4990)\n", + "test\n" + ] + } + ], + "source": [ + "# iterate through all models and get the losses and correlations\n", + "# take the mean loss and correlation for the test set\n", + "from scipy.stats import spearmanr\n", + "\n", + "test_corr_singles = []\n", + "test_corr_combos = []\n", + "\n", + "\n", + "for n in range(n_folds):\n", + " adata_n = model_adatas[n]\n", + " loss_weights = np.ones(adata_n.shape[0])\n", + "\n", + " for pert in adata_n.obs[\"perturbation_name\"].unique():\n", + " hold_idx = [\n", + " i for i, x in enumerate(adata_n.obs[\"perturbation_name\"]) if x == pert\n", + " ]\n", + " recon = adata_n[hold_idx].uns[\"recon\"]\n", + " # correlation\n", + " mean_reconstruction = recon.mean(axis=0)\n", + " mean_observed = np.squeeze(np.array(adata_n[hold_idx].X.mean(axis=0)))\n", + " if (\"+\" in pert) and (\"ctrl\" not in pert):\n", + " test_corr_combos.append(\n", + " [pert, spearmanr(mean_reconstruction, mean_observed)[0], n]\n", + " )\n", + " else:\n", + " 
test_corr_singles.append(\n", + " [pert, spearmanr(mean_reconstruction, mean_observed)[0], n]\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "72f27f1d-a5a5-46f8-a9a4-efcd5f3119fb", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# plot the distribution of per-perturbation Spearman correlations (singles and combos pooled)\n", + "import seaborn as sns\n", + "\n", + "test_corr = np.array(test_corr_singles + test_corr_combos)[:, 1].astype(float)\n", + "sns.histplot(test_corr, label=\"test_corr\")\n", + "plt.title(\"Correlation between predicted and observed expression per perturbation\")\n", + "plt.xlabel(\"Spearman Correlation\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e4dcca93-bc7e-4bae-b434-50f2a2fb52ab", + "metadata": {}, + "outputs": [], + "source": [ + "np.savetxt(\n", + " \"figures/reconstruction_spearmans/scETM_norman_spearman_correlations.csv\",\n", + " test_corr,\n", + " delimiter=\",\",\n", + " fmt=\"%.2f\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "64fc0b35-f87e-4d8a-87f1-e6080a3ee83f", + "metadata": {}, + "source": [ + "# Perturbation Embedding Heatmap" + ] + }, + { + "cell_type": "markdown", + "id": "c36ebe2b-2835-44ed-9f71-a52b522125b1", + "metadata": {}, + "source": [ + "## Hierarchical Clustering + Enrichment\n", + "- Perform enrichment tests (gprofiler) on hierarchical clustering of perturbation embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0aaf8dc3-7e5b-4748-8685-fc8560bf7fc7", + "metadata": {}, + "outputs": [], + "source": [ + "import seaborn as sns\n", + "from scipy.cluster.hierarchy import dendrogram, fcluster, linkage\n", + "from scipy.spatial.distance import pdist, squareform\n", + "\n", + "# aggregate cell embeddings to perturbation embeddings\n", + "cell_emb = adata.uns[\"cell_emb\"] @ adata.uns[\"topics\"]\n", + "perts = []\n", + "pert_emb = []\n", + "for i in adata.obs[\"perturbation_name\"].unique():\n", + " if \"+\" not in i and i != \"nan\":\n", + " perts.append(i)\n", + " pert_emb.append(cell_emb[adata.obs[\"perturbation_name\"] == i].mean(axis=0))\n", + "pert_emb = np.array(pert_emb)\n", +
"pert_emb_df = pd.DataFrame(pert_emb, index=perts)\n", + "\n", + "# Compute the pairwise distances\n", + "df = pert_emb_df.drop(index=[\"control\"])\n", + "\n", + "distance = \"euclidean\"\n", + "distances = pdist(df.values, metric=distance)\n", + "\n", + "# Convert the distances into a square distance matrix\n", + "distance_matrix = pd.DataFrame(squareform(distances), index=df.index, columns=df.index)\n", + "\n", + "# Compute the linkage matrix using the condensed distance matrix\n", + "linkage_matrix = linkage(distance_matrix, method=\"ward\")\n", + "\n", + "# Plot the dendrogram (optional)\n", + "dendro = dendrogram(linkage_matrix, no_plot=True)\n", + "\n", + "# Assign clusters\n", + "max_d = 1.2 # Max distance for flat clusters\n", + "clusters = fcluster(linkage_matrix, max_d, criterion=\"distance\")\n", + "\n", + "# Create a color palette for clusters\n", + "palette = sns.color_palette(\"husl\", len(np.unique(clusters)))\n", + "\n", + "# Map each cluster id to a color\n", + "cluster_colors = [palette[i - 1] for i in clusters]\n", + "\n", + "# Show the plot\n", + "plt.show()\n", + "\n", + "clustermap = sns.clustermap(\n", + " distance_matrix, cmap=\"viridis_r\", row_colors=cluster_colors\n", + ")\n", + "clustermap.fig.suptitle(f\"Pairwise {distance} distance of perturbation latent vectors\")\n", + "# clustermap.ax_row_dendrogram.set_visible(False)\n", + "clustermap.ax_col_dendrogram.set_visible(False)\n", + "# clustermap.cax.set_visible(False)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "c655f7dc-7d39-43dd-9af5-6ca0faf6d8eb", + "metadata": {}, + "outputs": [], + "source": [ + "# extract clusters and perform gprofiler\n", + "index_names = list(distance_matrix.index)\n", + "cluster_df = pd.DataFrame({\"Pert\": index_names, \"Cluster\": clusters})\n", + "cluster_process = {}\n", + "for cluster_id in np.unique(clusters):\n", + " points_in_cluster = cluster_df[cluster_df[\"Cluster\"] == cluster_id][\"Pert\"].tolist()\n", + 
" gprofiler_in = pd.DataFrame(points_in_cluster, columns=[\"gene_symbol\"])\n", + " res = get_gprofiler(gprofiler_in)\n", + " cluster_process[cluster_id] = res[res[\"p_value\"] <= 0.05]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e41dff85-043e-46d2-8737-4db7d7bf969c", + "metadata": {}, + "outputs": [], + "source": [ + "# Create descriptions for each cluster (for demonstration purpose)\n", + "cluster_descriptions = {\n", + " 1: \"Protein tyrosine phosphatase activity\",\n", + " 2: \"\",\n", + " 3: \"protein serine/threonine kinase activity\",\n", + " 4: \"DNA-binding transcription factor activity, RNA polymerase II transcription\",\n", + " 5: \"MAPK activity, regulation of IRE1-mediated unfolded protein\",\n", + " 6: \"morphogenesis, DNA-binding transcription factor activity, transcription by RNA polymerase II\",\n", + " 7: \"negative regulation of ERBB signaling pathway\",\n", + " 8: \"MAPK activity, procentriole replication\",\n", + " 9: \"vesicle lumen\",\n", + " 10: \"collagen trimer\",\n", + " 11: \"DNA-binding transcription factor activity, RNA polymerase II transcription\",\n", + " 12: \"\",\n", + " 13: \"\",\n", + " 14: \"C/EBP complex\",\n", + "}\n", + "\n", + "# Create a consistent color palette\n", + "unique_clusters = sorted(np.unique(clusters))\n", + "palette = sns.color_palette(\"husl\", len(unique_clusters))\n", + "cluster_colors = [palette[i - 1] for i in clusters]\n", + "cluster_colors = {cid: palette[i] for i, cid in enumerate(unique_clusters)}\n", + "\n", + "# Create clustermap using consistent colors\n", + "row_colors = [cluster_colors[cid] for cid in clusters]\n", + "clustermap = sns.clustermap(\n", + " distance_matrix, cmap=\"viridis_r\", row_colors=[palette[i - 1] for i in clusters]\n", + ")\n", + "clustermap.savefig(\n", + " \"figures/figure_pngs/scETM_norman_clustermap.png\", dpi=600, bbox_inches=\"tight\"\n", + ")\n", + "\n", + "# Prepare data for the table\n", + "cluster_data = {\n", + " \"Cluster ID\": 
unique_clusters,\n", + " \"Description\": [cluster_descriptions[cid] for cid in unique_clusters],\n", + "}\n", + "\n", + "cluster_df = pd.DataFrame(cluster_data)\n", + "\n", + "# Plot the table with colors matching the clustermap\n", + "fig, ax = plt.subplots(figsize=(18, 4))\n", + "ax.axis(\"off\")\n", + "ax.axis(\"tight\")\n", + "\n", + "# Create table while applying the color to the row background\n", + "table = ax.table(\n", + " cellText=cluster_df.values,\n", + " colLabels=[\"Cluster ID\", \"Description\"],\n", + " cellColours=[[cluster_colors[cid]] * 2 for cid in cluster_df[\"Cluster ID\"]],\n", + " cellLoc=\"center\",\n", + " loc=\"center\",\n", + ")\n", + "\n", + "# Customize table appearance\n", + "table.auto_set_font_size(False)\n", + "table.set_fontsize(10)\n", + "table.scale(1, 1.5)\n", + "\n", + "plt.savefig(\n", + " \"figures/figure_pngs/scETM_norman_clustermap_descriptions.png\",\n", + " dpi=600,\n", + " bbox_inches=\"tight\",\n", + ")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "a53c2cec-1c3d-465d-a627-47bcc7aa4da7", + "metadata": {}, + "outputs": [], + "source": [ + "# save gprofiler results\n", + "with open(\n", + " \"figures/pert_embedding_cluster_gprofiler/scETM_norman_pert_emb_gprofiler.pickle\",\n", + " \"wb\",\n", + ") as handle:\n", + " pickle.dump(cluster_process, handle, protocol=pickle.HIGHEST_PROTOCOL)" + ] + }, + { + "cell_type": "markdown", + "id": "5ad3121b-7c81-435e-9d09-ab431200706a", + "metadata": {}, + "source": [ + "## PR Curve+AUC\n", + "- AUCPR using prior graph as binary label" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "4509c613-b436-4143-8246-b9d83125b4bf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " AUC\n", + "StringDB 0.726058\n", + "CORUM 0.946866\n", + " F1 Threshold\n", + "StringDB 0.820312 1.0\n", + "CORUM 0.972222 1.0\n" + ] + } + ], + "source": [ + "aucpr_df, f1_df, pr_dict = 
auprc(distance_matrix)\n", + "print(aucpr_df)\n", + "print(f1_df)\n", + "aucpr_df.to_csv(\"figures/pert_embedding_recall/scETM_norman_aucpr.csv\")\n", + "f1_df.to_csv(\"figures/pert_embedding_recall/scETM_norman_f1.csv\")" + ] + }, + { + "cell_type": "markdown", + "id": "8fdff66a-8f20-4268-8520-d33a0fa4969a", + "metadata": {}, + "source": [ + "# Interpretability of our learned latent space" + ] + }, + { + "cell_type": "markdown", + "id": "1ec54f80-8a0f-41f0-8c55-edd2b1fbfb3c", + "metadata": {}, + "source": [ + "## Factor Enrichment\n", + "- GSEA on log transformed latent-by-gene factors\n", + "- Then associate each perturbation to its top latent factors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35eb8169-15b1-4b5b-b926-d5b7238f372a", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "factor_to_go = factor_enrichment_gsea(adata, np.abs(adata.uns[\"gene_emb\"]), fdr=0.05)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "814189ce-53f3-4716-834a-a4f0ea6efcf9", + "metadata": {}, + "outputs": [], + "source": [ + "# filter and add description to processes\n", + "go_df = read_aws_csv(\"s3://pert-spectra/references/GO_to_Description.txt\")\n", + "go_df.set_index(\"Term\", inplace=True)\n", + "go_dict = go_df.to_dict()[\"Description\"]\n", + "\n", + "filtered_factor_to_go = {}\n", + "for i in factor_to_go:\n", + " proc = factor_to_go[i]\n", + " proc[\"descr\"] = [go_dict[x] for x in proc[\"GO_ID\"]]\n", + " filtered_factor_to_go[i] = proc" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d1e2990-edee-430e-957f-328342271a42", + "metadata": {}, + "outputs": [], + "source": [ + "# save latent enrichment results\n", + "with open(\n", + " \"figures/factor_enrichments/scETM_norman_factor_enrichment.pickle\", \"wb\"\n", + ") as handle:\n", + " pickle.dump(filtered_factor_to_go, handle, protocol=pickle.HIGHEST_PROTOCOL)" + ] + } + ], + "metadata": { + "kernelspec": { + 
"display_name": "pertspectra", + "language": "python", + "name": "pertspectra" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..46cb4d7 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,3 @@ +[tool.ruff.lint] +select = ["E4", "E7", "E9", "F", "I"] + diff --git a/replogle_pertspectra_metrics.ipynb b/replogle_pertspectra_metrics.ipynb new file mode 100644 index 0000000..0ac04bb --- /dev/null +++ b/replogle_pertspectra_metrics.ipynb @@ -0,0 +1,946 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "b8270f66-7a27-4263-aa97-25abeb2cf34d", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2 " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "529f64f3-c83e-481d-872c-df89b9dd063b", + "metadata": {}, + "outputs": [], + "source": [ + "import pickle\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "\n", + "from utils import (\n", + " auprc,\n", + " factor_enrichment_gsea,\n", + " get_gprofiler,\n", + " perturbation_signal_recovery,\n", + " read_aws_csv,\n", + " read_aws_h5ad,\n", + " read_aws_pickle,\n", + " retrieve_stringdb_neighbors,\n", + " split_data_by_cell,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c57d97ad-da1a-4853-b825-a45aae5ca3dc", + "metadata": {}, + "outputs": [], + "source": [ + "# read in trained model outputs generated from ./PertSpectra_load_checkpoints/pertspectra_replogle.ipynb\n", + "adata = read_aws_h5ad(\n", + " \"s3://pert-spectra/PertSpectra_checkpoints/pertspectra_replogle/replogle.h5ad\"\n", + ")" + ] + }, + { + 
"cell_type": "markdown", + "id": "5b7bf773-5e0c-43b8-8df2-4ce3376be959", + "metadata": {}, + "source": [ + "# Reconstruction" + ] + }, + { + "cell_type": "markdown", + "id": "3b4b092b-2120-48c6-a492-84656e8b7169", + "metadata": {}, + "source": [ + "## Spearman Coefficient\n", + "- Spearman correlation between predicted and observed expression (on DE genes if available)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "e2885bd3-5433-47d6-88d5-47be83df5b5a", + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.stats import spearmanr\n", + "\n", + "test_corr_singles = []\n", + "test_corr_combos = []\n", + "train_idx, val_idx, test_idx = split_data_by_cell(\n", + " adata.X, adata.obs[\"gene\"], test_size=0.2, val_size=0.2\n", + ")\n", + "adata_test = adata[test_idx]\n", + "\n", + "for pert in adata_test.obs[\"gene\"].unique():\n", + " hold_idx = [i for i, x in enumerate(adata_test.obs[\"gene\"]) if x == pert]\n", + " recon = adata.uns[\"recon\"][hold_idx]\n", + " # correlation\n", + " mean_reconstruction = recon.mean(axis=0)\n", + " mean_observed = np.squeeze(np.array(adata_test[hold_idx].X.mean(axis=0)))\n", + " if (\"+\" in pert) and (\"ctrl\" not in pert):\n", + " test_corr_combos.append(\n", + " [pert, spearmanr(mean_reconstruction, mean_observed)[0]]\n", + " )\n", + " else:\n", + " test_corr_singles.append(\n", + " [pert, spearmanr(mean_reconstruction, mean_observed)[0]]\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "c75473ab-68bf-4be3-b2bf-a5bc3fe081d8", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# plot train correlation distribution vs test correlation distribution\n", + "test_corr = np.array(test_corr_singles + test_corr_combos)[:, 1].astype(float)\n", + "sns.histplot(test_corr, label=\"test_corr\")\n", + "plt.title(\"Correlation between predicted and observed expression per perturbation\")\n", + "plt.xlabel(\"Spearman Correlation\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "a03e387a-9b61-4b4e-90c2-e1995d46c9e6", + "metadata": {}, + "outputs": [], + "source": [ + "np.savetxt(\n", + " \"figures/reconstruction_spearmans/model_replogle_spearman_correlations.csv\",\n", + " test_corr,\n", + " delimiter=\",\",\n", + " fmt=\"%.2f\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "64fc0b35-f87e-4d8a-87f1-e6080a3ee83f", + "metadata": {}, + "source": [ + "# Perturbation Embedding Heatmap" + ] + }, + { + "cell_type": "markdown", + "id": "c36ebe2b-2835-44ed-9f71-a52b522125b1", + "metadata": {}, + "source": [ + "## Hierarchical Clustering + Enrichment\n", + "- Perform enrichment tests (gprofiler) on hierarchical clustering of perturbation embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "99d3263b-8988-47a6-a57b-0dce4186eb6b", + "metadata": {}, + "outputs": [], + "source": [ + "import seaborn as sns\n", + "from scipy.spatial.distance import pdist, squareform\n", + "\n", + "# Compute the pairwise distances\n", + "pert_embeddings_df = pd.DataFrame(\n", + " adata.uns[\"SPECTRA_pert_scores\"], index=adata.uns[\"Spectra_pert_labels\"]\n", + ")\n", + "df = pert_embeddings_df.drop(index=[\"basal\", \"ctrl\"])\n", + "\n", + "distance = \"euclidean\"\n", + "distances = pdist(df.values, metric=distance)\n", + "\n", + "# Convert the distances into a square distance matrix\n", + "distance_matrix = pd.DataFrame(squareform(distances), index=df.index, columns=df.index)\n", + "clustermap = 
sns.clustermap(distance_matrix, cmap=\"viridis_r\")\n", + "clustermap.fig.suptitle(f\"Pairwise {distance} distance of perturbation latent vectors\")\n", + "# clustermap.ax_row_dendrogram.set_visible(False)\n", + "clustermap.ax_col_dendrogram.set_visible(False)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "7a684ec0-9241-42a4-b65b-b3bdf51b7c7f", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import scipy\n", + "\n", + "den = scipy.cluster.hierarchy.dendrogram(\n", + " clustermap.dendrogram_col.linkage,\n", + " labels=distance_matrix.index,\n", + " color_threshold=0.25,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "c63a3f07-044f-44e7-bbb3-26890d565536", + "metadata": {}, + "outputs": [], + "source": [ + "# extract clusters and perform gprofiler\n", + "from collections import defaultdict\n", + "\n", + "\n", + "def get_cluster_classes(den, label=\"ivl\"):\n", + " cluster_idxs = defaultdict(list)\n", + " for c, pi in zip(den[\"color_list\"], den[\"icoord\"]):\n", + " for leg in pi[1:3]:\n", + " i = (leg - 5.0) / 10.0\n", + " if abs(i - int(i)) < 1e-5:\n", + " cluster_idxs[c].append(int(i))\n", + " cluster_classes = {}\n", + " for c, l in cluster_idxs.items(): # noqa\n", + " i_l = [den[label][i] for i in l]\n", + " cluster_classes[c] = i_l\n", + " return cluster_classes\n", + "\n", + "\n", + "clusters = get_cluster_classes(den)\n", + "# extract functions for clusters\n", + "cluster_process = {}\n", + "for c in clusters:\n", + " cluster_df = pd.DataFrame(clusters[c], columns=[\"gene_symbol\"])\n", + " res = get_gprofiler(cluster_df)\n", + " cluster_process[c] = res[res[\"p_value\"] <= 0.05]" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "c4149852-b136-46a4-bdec-f065425455b6", + "metadata": {}, + "outputs": [], + "source": [ + "# save gprofiler results\n", + "with open(\n", + " \"figures/pert_embedding_cluster_gprofiler/pertspectra_replogle_pert_emb_gprofiler.pickle\",\n", + " \"wb\",\n", + ") as handle:\n", + " pickle.dump(cluster_process, handle, protocol=pickle.HIGHEST_PROTOCOL)" + ] + }, + { + "cell_type": "markdown", + "id": "84864ce6-2724-4d11-a955-1ec4585c7876", + "metadata": {}, + "source": [ + "## PR Curve+AUC\n", + "- AUCPR using prior graph as binary label" + ] + }, + { + "cell_type": "code", + 
"execution_count": 16, + "id": "d281c6a7-767a-44c0-aeb1-3c5bf11e27a5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " AUC\n", + "StringDB 0.215162\n", + "CORUM 0.262926\n", + " F1 Threshold\n", + "StringDB 0.246123 0.869706\n", + "CORUM 0.306326 0.884541\n" + ] + } + ], + "source": [ + "aucpr_df, f1_df, pr_dict = auprc(distance_matrix)\n", + "print(aucpr_df)\n", + "print(f1_df)\n", + "aucpr_df.to_csv(\"figures/pert_embedding_recall/pertspectra_replogle_aucpr.csv\")\n", + "f1_df.to_csv(\"figures/pert_embedding_recall/pertspectra_replogle_f1.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "cd6af182-b82d-4fe1-a5a8-fccb34556130", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Plot the precision-recall curve\n", + "prior = \"StringDB\"\n", + "plt.figure(figsize=(8, 6))\n", + "plt.plot(\n", + " pr_dict[prior][\"recall\"],\n", + " pr_dict[prior][\"precision\"],\n", + " marker=\".\",\n", + " label=\"Precision-Recall Curve\",\n", + ")\n", + "# Adding labels and title\n", + "plt.xlabel(\"Recall\")\n", + "plt.ylabel(\"Precision\")\n", + "plt.title(\"Precision-Recall Curve\")\n", + "plt.legend()\n", + "# Save the plot\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "8fdff66a-8f20-4268-8520-d33a0fa4969a", + "metadata": {}, + "source": [ + "# Interpretability of our learned latent space" + ] + }, + { + "cell_type": "markdown", + "id": "1ec54f80-8a0f-41f0-8c55-edd2b1fbfb3c", + "metadata": {}, + "source": [ + "## Factor Enrichment\n", + "- GSEA on log transformed latent-by-gene factors\n", + "- Then associate each perturbation to its top latent factors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35eb8169-15b1-4b5b-b926-d5b7238f372a", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "factor_to_go = factor_enrichment_gsea(adata, adata.uns[\"SPECTRA_factors\"], fdr=5e-2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "814189ce-53f3-4716-834a-a4f0ea6efcf9", + "metadata": {}, + "outputs": [], + "source": [ + "# filter and add description to processes\n", + "go_df = read_aws_csv(\"s3://pert-spectra/references/GO_to_Description.txt\")\n", + "go_df.set_index(\"Term\", inplace=True)\n", + "go_dict = go_df.to_dict()[\"Description\"]\n", + "\n", + "filtered_factor_to_go = {}\n", + "for i in factor_to_go:\n", + " proc = factor_to_go[i]\n", + " proc[\"descr\"] = [go_dict[x] for x in proc[\"GO_ID\"]]\n", + " filtered_factor_to_go[i] = proc" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e82f91a-70e9-44c2-b802-058348b77052", + "metadata": {}, + "outputs": 
[], + "source": [ + "filtered_factor_to_go" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b91d73c-87de-4197-8f7c-2554ee3d0491", + "metadata": {}, + "outputs": [], + "source": [ + "with open(\n", + " \"figures/factor_enrichments/MODEL_replogle_factor_enrichment.pickle\", \"wb\"\n", + ") as handle:\n", + " pickle.dump(filtered_factor_to_go, handle, protocol=pickle.HIGHEST_PROTOCOL)" + ] + }, + { + "cell_type": "markdown", + "id": "b01b3f40-7d97-481a-8e57-01627cd627ad", + "metadata": {}, + "source": [ + "## Overlap with prior knowledge/ground truth\n", + "- Group A: a set of GO terms associated with a perturbation (either drivers from msigdb, or from literature) and its neighbors in stringdb\n", + "- Group B: a set of GO terms from the interpretability analysis\n", + "- Hypergeometric test on the overlap of the two groups" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "96895bc7-560b-4893-9f3f-40a8f733779f", + "metadata": {}, + "outputs": [], + "source": [ + "# load precomputed factor_to_go dict if available\n", + "filtered_factor_to_go = read_aws_pickle(\n", + " \"s3://pert-spectra/figures/factor_enrichments/pertspectra_replogle_factor_enrichment.pickle\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "e9eca6ce-4fc7-4f6a-b140-0b2276d9abe4", + "metadata": {}, + "outputs": [], + "source": [ + "# get neighbors for each perturbation\n", + "pert_neighbors = retrieve_stringdb_neighbors(adata.uns[\"Spectra_pert_labels\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "ce927aaa-f5b4-4403-ab21-beeb1202a4f7", + "metadata": {}, + "outputs": [], + "source": [ + "# construct group A - known processes for each perturbation\n", + "gene_sets = read_aws_pickle(\"s3://pert-spectra/references/GO_to_Gene.pickle\")\n", + "# BP only go terms\n", + "go_reference = read_aws_csv(\"s3://pert-spectra/references/GO_terms.txt.gz\", zipped=True)\n", + "go_bp = 
go_reference[go_reference[\"go_category\"] == \"biological_process\"]\n", + "go_bp_ids = set(go_bp[\"go_id\"].values)\n", + "filtered_go_terms = {key: gene_sets[key] for key in go_bp_ids if key in gene_sets}\n", + "\n", + "# GO terms per perturbation AND its neighbors in stringdb\n", + "pert_to_go = {key: set() for key in adata.uns[\"Spectra_pert_labels\"]}\n", + "for goterm in filtered_go_terms:\n", + " for pert in adata.uns[\"Spectra_pert_labels\"]:\n", + " if pert in filtered_go_terms[goterm] and set(\n", + " filtered_go_terms[goterm]\n", + " ).intersection(pert_neighbors[pert]):\n", + " pert_to_go[pert].add(goterm)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "e9433684-9108-4d8b-8c0f-f9c3f79155e5", + "metadata": {}, + "outputs": [], + "source": [ + "# construct set B - model identified processes for each perturbation\n", + "n = 10 # number of top factors to get processes from\n", + "model_pert_to_go = {}\n", + "for pert in adata.uns[\"Spectra_pert_labels\"]:\n", + " if pert in [\"ctrl\", \"intergenic\", \"basal\"]:\n", + " continue\n", + " # get top factors\n", + " pert_embeddings_df = pd.DataFrame(\n", + " adata.uns[\"SPECTRA_pert_scores\"], index=adata.uns[\"Spectra_pert_labels\"]\n", + " )\n", + " pert_loading = pert_embeddings_df.loc[pert]\n", + " ctrl_loading = pert_embeddings_df.loc[\"ctrl\"]\n", + " delta_loading = np.abs(np.log(pert_loading) - np.log(ctrl_loading))\n", + " top_n_factors = np.argpartition(np.array(delta_loading), -n)[-n:]\n", + " # get processes\n", + " model_processes = set()\n", + " for f in top_n_factors:\n", + " proc = filtered_factor_to_go[f]\n", + " model_processes = model_processes.union(set(proc[\"Term\"]))\n", + " model_pert_to_go[pert] = model_processes" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "739fb773-f08a-486c-b70b-3317bc749cd4", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overlap for ATP6AP1: 0 
out of 9 in researchDB\n", + "P-value for ATP6AP1: 1.0\n", + "Overlap for RPL30: 0 out of 5 in researchDB\n", + "P-value for RPL30: 1.0\n", + "Overlap for DDX21: 3 out of 13 in researchDB\n", + "P-value for DDX21: 5.980456287677466e-05\n", + "Overlap for PHB: 2 out of 24 in researchDB\n", + "P-value for PHB: 0.010741839670463106\n", + "Overlap for DMAP1: 1 out of 14 in researchDB\n", + "P-value for DMAP1: 0.06354093376899977\n", + "Overlap for MCRS1: 1 out of 19 in researchDB\n", + "P-value for MCRS1: 0.10580925901854948\n", + "Overlap for CCNH: 3 out of 7 in researchDB\n", + "P-value for CCNH: 3.0005424766078534e-05\n", + "Overlap for NUMA1: 1 out of 6 in researchDB\n", + "P-value for NUMA1: 0.03329641937620341\n", + "Overlap for ALDOA: 0 out of 6 in researchDB\n", + "P-value for ALDOA: 1.0\n", + "Overlap for SNAPC3: 0 out of 5 in researchDB\n", + "P-value for SNAPC3: 1.0\n", + "Overlap for SEC61A1: 0 out of 8 in researchDB\n", + "P-value for SEC61A1: 1.0\n", + "Overlap for CCT3: 3 out of 10 in researchDB\n", + "P-value for CCT3: 5.402090913326375e-05\n", + "Overlap for EIF6: 2 out of 6 in researchDB\n", + "P-value for EIF6: 0.000615303446286681\n", + "Overlap for UTP20: 1 out of 5 in researchDB\n", + "P-value for UTP20: 0.04628097253128953\n", + "Overlap for FBL: 1 out of 6 in researchDB\n", + "P-value for FBL: 0.028199168104588085\n", + "Overlap for RPS7: 3 out of 9 in researchDB\n", + "P-value for RPS7: 8.84599979430884e-06\n", + "Overlap for BUB1B: 2 out of 7 in researchDB\n", + "P-value for BUB1B: 0.00040051787251466374\n", + "Overlap for PES1: 0 out of 6 in researchDB\n", + "P-value for PES1: 1.0\n", + "Overlap for ATP6V1A: 2 out of 11 in researchDB\n", + "P-value for ATP6V1A: 0.0007310971425003066\n", + "Overlap for NFRKB: 1 out of 13 in researchDB\n", + "P-value for NFRKB: 0.02948322206040407\n", + "Overlap for MIOS: 0 out of 5 in researchDB\n", + "P-value for MIOS: 1.0\n", + "Overlap for NEDD8: 1 out of 8 in researchDB\n", + "P-value for NEDD8: 
0.05022968114062165\n", + "Overlap for TAF1: 0 out of 23 in researchDB\n", + "P-value for TAF1: 1.0\n", + "Overlap for RAD21: 1 out of 11 in researchDB\n", + "P-value for RAD21: 0.015586554804595597\n", + "Overlap for RPS6: 4 out of 9 in researchDB\n", + "P-value for RPS6: 3.850507725903189e-07\n", + "Overlap for MAX: 0 out of 7 in researchDB\n", + "P-value for MAX: 1.0\n", + "Overlap for VPS72: 1 out of 10 in researchDB\n", + "P-value for VPS72: 0.08166368798369315\n", + "Overlap for GATA1: 2 out of 20 in researchDB\n", + "P-value for GATA1: 0.008055568024662566\n", + "Overlap for SUPT4H1: 1 out of 8 in researchDB\n", + "P-value for SUPT4H1: 0.009473365146555657\n", + "Overlap for C1QBP: 0 out of 12 in researchDB\n", + "P-value for C1QBP: 1.0\n", + "Overlap for RPL26: 4 out of 7 in researchDB\n", + "P-value for RPL26: 4.910516979866623e-08\n", + "Overlap for UBA3: 0 out of 5 in researchDB\n", + "P-value for UBA3: 1.0\n", + "Overlap for MED17: 1 out of 9 in researchDB\n", + "P-value for MED17: 0.058361519318946814\n", + "Overlap for SNRPG: 1 out of 6 in researchDB\n", + "P-value for SNRPG: 0.02866347810549369\n", + "Overlap for PRPF19: 1 out of 6 in researchDB\n", + "P-value for PRPF19: 0.044798272846770126\n", + "Overlap for NAF1: 0 out of 6 in researchDB\n", + "P-value for NAF1: 1.0\n", + "Overlap for TXNL4A: 2 out of 5 in researchDB\n", + "P-value for TXNL4A: 0.000529721111247926\n", + "Overlap for RPS19: 2 out of 11 in researchDB\n", + "P-value for RPS19: 0.0009645903158693221\n", + "Overlap for AATF: 1 out of 6 in researchDB\n", + "P-value for AATF: 0.03791093137572726\n", + "Overlap for PHB2: 2 out of 19 in researchDB\n", + "P-value for PHB2: 0.009185594870537925\n", + "Overlap for VPS28: 0 out of 9 in researchDB\n", + "P-value for VPS28: 1.0\n", + "Overlap for RPS28: 1 out of 7 in researchDB\n", + "P-value for RPS28: 0.03336150600238275\n", + "Overlap for EXOSC7: 2 out of 13 in researchDB\n", + "P-value for EXOSC7: 0.002471059938780703\n", + "Overlap for 
WDR12: 0 out of 5 in researchDB\n", + "P-value for WDR12: 1.0\n", + "Overlap for POLD1: 2 out of 12 in researchDB\n", + "P-value for POLD1: 0.0035245791373806645\n", + "Overlap for COPS3: 0 out of 6 in researchDB\n", + "P-value for COPS3: 1.0\n", + "Overlap for NVL: 3 out of 5 in researchDB\n", + "P-value for NVL: 2.4319712243376875e-06\n", + "Overlap for PAF1: 0 out of 13 in researchDB\n", + "P-value for PAF1: 1.0\n", + "Overlap for RAD51: 5 out of 28 in researchDB\n", + "P-value for RAD51: 1.0101324774451766e-06\n", + "Overlap for CCT5: 3 out of 10 in researchDB\n", + "P-value for CCT5: 4.480528947125347e-05\n", + "Overlap for MED1: 1 out of 35 in researchDB\n", + "P-value for MED1: 0.1771416321151657\n", + "Overlap for EXOC1: 0 out of 9 in researchDB\n", + "P-value for EXOC1: 1.0\n", + "Overlap for CCNK: 1 out of 10 in researchDB\n", + "P-value for CCNK: 0.03587615783321504\n", + "Overlap for RPL10: 2 out of 6 in researchDB\n", + "P-value for RPL10: 0.000615303446286681\n", + "Overlap for ACTR8: 5 out of 16 in researchDB\n", + "P-value for ACTR8: 5.384424999101365e-08\n", + "Overlap for SNRPD3: 2 out of 6 in researchDB\n", + "P-value for SNRPD3: 0.0005149057259652727\n", + "Overlap for LAMTOR4: 0 out of 5 in researchDB\n", + "P-value for LAMTOR4: 1.0\n", + "Overlap for TAF7: 1 out of 17 in researchDB\n", + "P-value for TAF7: 0.10010863013860945\n", + "Overlap for RFC4: 2 out of 5 in researchDB\n", + "P-value for RFC4: 0.00016526499161894154\n", + "Overlap for RPTOR: 1 out of 27 in researchDB\n", + "P-value for RPTOR: 0.27462065680998554\n", + "Overlap for RPL23: 1 out of 10 in researchDB\n", + "P-value for RPL23: 0.041995859043389136\n", + "Overlap for SMC1A: 4 out of 10 in researchDB\n", + "P-value for SMC1A: 2.4186483678243333e-08\n", + "Overlap for SF3A2: 1 out of 5 in researchDB\n", + "P-value for SF3A2: 0.014186403172397456\n", + "Overlap for EIF2B3: 1 out of 6 in researchDB\n", + "P-value for EIF2B3: 0.03652850935432338\n", + "Overlap for POLD3: 1 out of 8 
in researchDB\n", + "P-value for POLD3: 0.05868377093103718\n", + "Overlap for INTS3: 1 out of 7 in researchDB\n", + "P-value for INTS3: 0.04888428544377561\n", + "Overlap for MNAT1: 2 out of 11 in researchDB\n", + "P-value for MNAT1: 0.00139474279814442\n", + "Overlap for GRB2: 2 out of 16 in researchDB\n", + "P-value for GRB2: 0.0011432815400414342\n", + "Overlap for INO80B: 3 out of 13 in researchDB\n", + "P-value for INO80B: 0.00012280139562832808\n", + "Overlap for CCT8: 2 out of 10 in researchDB\n", + "P-value for CCT8: 0.0030106276764372197\n", + "Overlap for BOP1: 1 out of 6 in researchDB\n", + "P-value for BOP1: 0.03329641937620341\n", + "Overlap for COPS8: 0 out of 6 in researchDB\n", + "P-value for COPS8: 1.0\n", + "Overlap for INO80: 1 out of 20 in researchDB\n", + "P-value for INO80: 0.031251091286424816\n", + "Overlap for UBE2N: 2 out of 16 in researchDB\n", + "P-value for UBE2N: 0.007349623192881736\n", + "Overlap for TAF12: 1 out of 17 in researchDB\n", + "P-value for TAF12: 0.12302325506021794\n", + "Overlap for EIF2B2: 2 out of 9 in researchDB\n", + "P-value for EIF2B2: 0.0009190149284216131\n", + "Overlap for EIF2B4: 0 out of 8 in researchDB\n", + "P-value for EIF2B4: 1.0\n", + "Overlap for TELO2: 1 out of 5 in researchDB\n", + "P-value for TELO2: 0.037472286923089536\n", + "Overlap for BCR: 2 out of 14 in researchDB\n", + "P-value for BCR: 0.002795217526457602\n", + "Overlap for RPL11: 4 out of 15 in researchDB\n", + "P-value for RPL11: 4.932969797549702e-06\n", + "Overlap for MED30: 0 out of 6 in researchDB\n", + "P-value for MED30: 1.0\n", + "Overlap for PPP1CA: 1 out of 18 in researchDB\n", + "P-value for PPP1CA: 0.036471485600401074\n", + "Overlap for MED10: 0 out of 5 in researchDB\n", + "P-value for MED10: 1.0\n", + "Overlap for EXOSC4: 2 out of 15 in researchDB\n", + "P-value for EXOSC4: 0.007603333550124456\n", + "Overlap for KANSL2: 0 out of 7 in researchDB\n", + "P-value for KANSL2: 1.0\n", + "Overlap for MED14: 0 out of 7 in 
researchDB\n", + "P-value for MED14: 1.0\n", + "Overlap for NCBP1: 5 out of 24 in researchDB\n", + "P-value for NCBP1: 3.952539644538647e-07\n", + "Overlap for EXOSC3: 1 out of 14 in researchDB\n", + "P-value for EXOSC3: 0.08421492165545252\n", + "Overlap for SUPT6H: 2 out of 9 in researchDB\n", + "P-value for SUPT6H: 0.0011577054718808673\n", + "Overlap for COPS5: 2 out of 12 in researchDB\n", + "P-value for COPS5: 0.0016176646331365141\n", + "Overlap for UPF1: 4 out of 11 in researchDB\n", + "P-value for UPF1: 1.4839056717361259e-06\n", + "Overlap for SDHA: 0 out of 6 in researchDB\n", + "P-value for SDHA: 1.0\n", + "Overlap for DAD1: 1 out of 6 in researchDB\n", + "P-value for DAD1: 0.045713472662023885\n", + "Overlap for TRRAP: 3 out of 18 in researchDB\n", + "P-value for TRRAP: 0.0003614867115463507\n", + "Overlap for CPSF6: 1 out of 8 in researchDB\n", + "P-value for CPSF6: 0.03742397846149575\n", + "Overlap for TAF6: 2 out of 14 in researchDB\n", + "P-value for TAF6: 0.005215080250213804\n", + "Overlap for GINS4: 1 out of 5 in researchDB\n", + "P-value for GINS4: 0.028598634586795634\n", + "Overlap for METTL3: 3 out of 13 in researchDB\n", + "P-value for METTL3: 0.00012656348190672007\n", + "Overlap for MED7: 0 out of 7 in researchDB\n", + "P-value for MED7: 1.0\n", + "Overlap for TTK: 3 out of 11 in researchDB\n", + "P-value for TTK: 2.9651787336133862e-05\n", + "Overlap for GTF2H1: 1 out of 7 in researchDB\n", + "P-value for GTF2H1: 0.014338369809308697\n", + "Overlap for RPS9: 0 out of 5 in researchDB\n", + "P-value for RPS9: 1.0\n", + "Overlap for CHMP6: 1 out of 25 in researchDB\n", + "P-value for CHMP6: 0.08187584716296124\n", + "Overlap for SEH1L: 2 out of 11 in researchDB\n", + "P-value for SEH1L: 0.0006118761218801126\n", + "Overlap for MED8: 0 out of 5 in researchDB\n", + "P-value for MED8: 1.0\n", + "Overlap for MTOR: 5 out of 63 in researchDB\n", + "P-value for MTOR: 0.00022537551709343121\n", + "Overlap for TAF2: 0 out of 12 in researchDB\n", + 
"P-value for TAF2: 1.0\n", + "Overlap for MED6: 0 out of 6 in researchDB\n", + "P-value for MED6: 1.0\n", + "Overlap for ENO1: 0 out of 6 in researchDB\n", + "P-value for ENO1: 1.0\n", + "Overlap for MED20: 1 out of 6 in researchDB\n", + "P-value for MED20: 0.045255964139129114\n", + "Overlap for ATP6V1B2: 0 out of 5 in researchDB\n", + "P-value for ATP6V1B2: 1.0\n", + "Overlap for GAB2: 0 out of 6 in researchDB\n", + "P-value for GAB2: 1.0\n", + "Overlap for MYBBP1A: 4 out of 11 in researchDB\n", + "P-value for MYBBP1A: 1.2223399245264868e-06\n", + "Overlap for KANSL3: 0 out of 8 in researchDB\n", + "P-value for KANSL3: 1.0\n", + "Overlap for MCM3: 2 out of 6 in researchDB\n", + "P-value for MCM3: 0.00027671399181365935\n", + "Overlap for RFC5: 3 out of 5 in researchDB\n", + "P-value for RFC5: 9.646951764033886e-07\n", + "Overlap for NUP62: 3 out of 12 in researchDB\n", + "P-value for NUP62: 5.5790530145847814e-05\n", + "Overlap for GLRX3: 1 out of 6 in researchDB\n", + "P-value for GLRX3: 0.031908474742726685\n", + "Overlap for TADA3: 1 out of 19 in researchDB\n", + "P-value for TADA3: 0.08802395013148663\n", + "Overlap for METTL14: 2 out of 9 in researchDB\n", + "P-value for METTL14: 0.0012215579257578778\n", + "Overlap for MCM5: 3 out of 6 in researchDB\n", + "P-value for MCM5: 1.3088944199027693e-06\n", + "Overlap for SMG5: 2 out of 5 in researchDB\n", + "P-value for SMG5: 0.0009853709543044674\n", + "Overlap for DKC1: 2 out of 12 in researchDB\n", + "P-value for DKC1: 0.0033095825799992597\n", + "Overlap for SKP2: 0 out of 6 in researchDB\n", + "P-value for SKP2: 1.0\n", + "Overlap for RPS3A: 3 out of 5 in researchDB\n", + "P-value for RPS3A: 6.1926356356671515e-06\n", + "Overlap for NCBP2: 2 out of 19 in researchDB\n", + "P-value for NCBP2: 0.001539819932302026\n", + "Overlap for CHMP3: 0 out of 26 in researchDB\n", + "P-value for CHMP3: 1.0\n", + "Overlap for KAT8: 0 out of 10 in researchDB\n", + "P-value for KAT8: 1.0\n", + "Overlap for HSPA9: 3 out of 8 
in researchDB\n", + "P-value for HSPA9: 2.5499492628815173e-05\n", + "Overlap for POLD2: 1 out of 5 in researchDB\n", + "P-value for POLD2: 0.028598634586795634\n", + "Overlap for CASP8AP2: 0 out of 5 in researchDB\n", + "P-value for CASP8AP2: 1.0\n", + "Overlap for UBE2I: 2 out of 11 in researchDB\n", + "P-value for UBE2I: 0.00045257392877956064\n", + "Overlap for EXOSC8: 1 out of 12 in researchDB\n", + "P-value for EXOSC8: 0.03556147437814442\n", + "Overlap for EIF2B1: 1 out of 6 in researchDB\n", + "P-value for EIF2B1: 0.040670816947906745\n", + "Overlap for YEATS4: 1 out of 13 in researchDB\n", + "P-value for YEATS4: 0.0736482173215087\n", + "Overlap for RPL5: 4 out of 12 in researchDB\n", + "P-value for RPL5: 1.8029092140895893e-07\n", + "Overlap for KIF20A: 0 out of 6 in researchDB\n", + "P-value for KIF20A: 1.0\n", + "Overlap for EIF2B5: 1 out of 11 in researchDB\n", + "P-value for EIF2B5: 0.05442069701975445\n", + "Overlap for WDR61: 0 out of 6 in researchDB\n", + "P-value for WDR61: 1.0\n", + "Overlap for ATF5: 2 out of 6 in researchDB\n", + "P-value for ATF5: 0.0003405680502523435\n", + "Overlap for HINFP: 0 out of 7 in researchDB\n", + "P-value for HINFP: 1.0\n", + "Overlap for EXOSC6: 2 out of 11 in researchDB\n", + "P-value for EXOSC6: 0.0018034167554763156\n", + "Overlap for MAGOH: 2 out of 8 in researchDB\n", + "P-value for MAGOH: 0.001194722130001048\n", + "Overlap for MAD2L1: 5 out of 9 in researchDB\n", + "P-value for MAD2L1: 2.5363032932725105e-09\n", + "Overlap for HSPD1: 3 out of 18 in researchDB\n", + "P-value for HSPD1: 0.00020118530173201323\n", + "Overlap for XPO1: 2 out of 8 in researchDB\n", + "P-value for XPO1: 0.0010311513996934835\n", + "Overlap for EIF4G2: 1 out of 5 in researchDB\n", + "P-value for EIF4G2: 0.030146560935722915\n", + "Overlap for RPL7: 0 out of 5 in researchDB\n", + "P-value for RPL7: 1.0\n", + "Overlap for BRD8: 0 out of 10 in researchDB\n", + "P-value for BRD8: 1.0\n", + "Overlap for CLTC: 2 out of 13 in 
researchDB\n", + "P-value for CLTC: 0.0017894707781930812\n", + "Overlap for RBM4: 1 out of 8 in researchDB\n", + "P-value for RBM4: 0.030656133269285887\n", + "Overlap for DNAJA3: 2 out of 8 in researchDB\n", + "P-value for DNAJA3: 0.0009038201917254252\n", + "Overlap for SUPT16H: 3 out of 7 in researchDB\n", + "P-value for SUPT16H: 1.5546548801441513e-05\n", + "Overlap for MCM6: 3 out of 5 in researchDB\n", + "P-value for MCM6: 5.374919107367473e-06\n", + "Overlap for TSG101: 1 out of 24 in researchDB\n", + "P-value for TSG101: 0.10133881660330367\n", + "Overlap for TAF8: 0 out of 7 in researchDB\n", + "P-value for TAF8: 1.0\n", + "Overlap for MCM2: 4 out of 10 in researchDB\n", + "P-value for MCM2: 2.618018946162168e-07\n", + "Overlap for PRPF6: 2 out of 6 in researchDB\n", + "P-value for PRPF6: 0.0005857221455239715\n", + "Overlap for CDC73: 2 out of 20 in researchDB\n", + "P-value for CDC73: 0.005107020540025154\n", + "Overlap for EXOSC2: 2 out of 12 in researchDB\n", + "P-value for EXOSC2: 0.002042317444150196\n", + "Overlap for EP400: 1 out of 10 in researchDB\n", + "P-value for EP400: 0.06984722382984401\n", + "Overlap for CHAF1B: 3 out of 5 in researchDB\n", + "P-value for CHAF1B: 3.6691488048685227e-07\n", + "Overlap for RPS24: 2 out of 5 in researchDB\n", + "P-value for RPS24: 0.0006620439075372245\n", + "Overlap for EXOC4: 0 out of 9 in researchDB\n", + "P-value for EXOC4: 1.0\n", + "Overlap for MED12: 0 out of 8 in researchDB\n", + "P-value for MED12: 1.0\n", + "Overlap for ATP6V1H: 1 out of 9 in researchDB\n", + "P-value for ATP6V1H: 0.06173790884147587\n", + "Overlap for NUDT21: 1 out of 9 in researchDB\n", + "P-value for NUDT21: 0.046116910946927304\n", + "Overlap for CCT7: 2 out of 9 in researchDB\n", + "P-value for CCT7: 0.001602393812389033\n", + "Overlap for CUL1: 0 out of 7 in researchDB\n", + "P-value for CUL1: 1.0\n", + "Overlap for RRS1: 1 out of 6 in researchDB\n", + "P-value for RRS1: 0.024943816481445866\n", + "Overlap for YEATS2: 0 out 
of 13 in researchDB\n", + "P-value for YEATS2: 1.0\n", + "Overlap for AP2S1: 0 out of 7 in researchDB\n", + "P-value for AP2S1: 1.0\n", + "Overlap for NCKAP1: 1 out of 10 in researchDB\n", + "P-value for NCKAP1: 0.09189207950588776\n", + "Overlap for XRCC5: 2 out of 25 in researchDB\n", + "P-value for XRCC5: 0.007925137444318536\n", + "Overlap for DICER1: 1 out of 10 in researchDB\n", + "P-value for DICER1: 0.02972129585261667\n", + "Overlap for CCT4: 1 out of 11 in researchDB\n", + "P-value for CCT4: 0.03433672065092231\n", + "Overlap for SMC3: 5 out of 11 in researchDB\n", + "P-value for SMC3: 1.2919646600125852e-09\n", + "Overlap for ERCC3: 1 out of 16 in researchDB\n", + "P-value for ERCC3: 0.07818335296874425\n", + "Overlap for MCM4: 2 out of 6 in researchDB\n", + "P-value for MCM4: 0.00038676590840153185\n", + "Overlap for CPSF3: 0 out of 6 in researchDB\n", + "P-value for CPSF3: 1.0\n", + "Overlap for CCT2: 3 out of 12 in researchDB\n", + "P-value for CCT2: 7.369246161295306e-05\n", + "Overlap for TCP1: 2 out of 12 in researchDB\n", + "P-value for TCP1: 0.002331790613204371\n", + "Overlap for NUP54: 0 out of 6 in researchDB\n", + "P-value for NUP54: 1.0\n", + "Overlap for MIS12: 3 out of 5 in researchDB\n", + "P-value for MIS12: 3.134875270533881e-06\n", + "Overlap for CTR9: 0 out of 15 in researchDB\n", + "P-value for CTR9: 1.0\n", + "Overlap for ERCC2: 0 out of 14 in researchDB\n", + "P-value for ERCC2: 1.0\n", + "Overlap for TRMT112: 0 out of 5 in researchDB\n", + "P-value for TRMT112: 1.0\n", + "Overlap for PNPT1: 0 out of 7 in researchDB\n", + "P-value for PNPT1: 1.0\n", + "Overlap for MED21: 0 out of 6 in researchDB\n", + "P-value for MED21: 1.0\n", + "Overlap for RPS3: 7 out of 15 in researchDB\n", + "P-value for RPS3: 4.927470217586745e-12\n" + ] + } + ], + "source": [ + "pvals = perturbation_signal_recovery(\n", + " pert_to_go,\n", + " model_pert_to_go,\n", + " list(filtered_go_terms.keys()),\n", + " list(adata.uns[\"Spectra_pert_labels\"]),\n", + 
")" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "98cb9f87-58e7-46fb-ab9c-bfc04fb7209a", + "metadata": {}, + "outputs": [], + "source": [ + "# save as csv for visualization\n", + "pd.DataFrame.from_dict(data=pvals, orient=\"index\").to_csv(\n", + " \"figures/process_recovery_hypergeo_pvals/pertspectra_replogle_hypergeo_neighbors_recovery_pvalues.csv\",\n", + " header=False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39264204-40a8-46b4-b24f-5e88fb3280ce", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pertspectra", + "language": "python", + "name": "pertspectra" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/replogle_scETM_metrics.ipynb b/replogle_scETM_metrics.ipynb new file mode 100644 index 0000000..fc70a14 --- /dev/null +++ b/replogle_scETM_metrics.ipynb @@ -0,0 +1,402 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "b8270f66-7a27-4263-aa97-25abeb2cf34d", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2 " + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "529f64f3-c83e-481d-872c-df89b9dd063b", + "metadata": {}, + "outputs": [], + "source": [ + "import pickle\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from utils import (\n", + " auprc,\n", + " factor_enrichment_gsea,\n", + " get_gprofiler,\n", + " read_aws_csv,\n", + " read_aws_h5ad,\n", + " split_data_by_cell,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db596f53-18d6-4c3b-82ac-601923be6eb1", + "metadata": {}, + 
"outputs": [], + "source": [ + "# read in scETM results\n", + "adata = read_aws_h5ad(\n", + " \"s3://pert-spectra/scETM_checkpoints/scetm_replogle/scetm_replogle.h5ad\"\n", + ")\n", + "# read in pertspectra results to retrieve gene labels\n", + "ref_adata = read_aws_h5ad(\n", + " \"s3://pert-spectra/PertSpectra_checkpoints/pertspectra_replogle/replogle.h5ad\"\n", + ")\n", + "adata.var_names = ref_adata.var_names" + ] + }, + { + "cell_type": "markdown", + "id": "5b7bf773-5e0c-43b8-8df2-4ce3376be959", + "metadata": {}, + "source": [ + "# Reconstruction" + ] + }, + { + "cell_type": "markdown", + "id": "3b4b092b-2120-48c6-a492-84656e8b7169", + "metadata": {}, + "source": [ + "## Spearman Coefficient\n", + "- Spearman correlation between predicted and observed expression (on DE genes if available)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e2885bd3-5433-47d6-88d5-47be83df5b5a", + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.stats import spearmanr\n", + "\n", + "test_corr_singles = []\n", + "test_corr_combos = []\n", + "train_idx, val_idx, test_idx = split_data_by_cell(\n", + " adata.X, adata.obs[\"gene\"], test_size=0.2, val_size=0.2\n", + ")\n", + "adata_test = adata[test_idx]\n", + "for pert in adata_test.obs[\"gene\"].unique():\n", + " hold_idx = [i for i, x in enumerate(adata_test.obs[\"gene\"]) if x == pert]\n", + " recon = adata_test.uns[\"recon\"][hold_idx]\n", + " # correlation\n", + " mean_reconstruction = recon.mean(axis=0)\n", + " mean_observed = np.squeeze(np.array(adata_test[hold_idx].X.mean(axis=0)))\n", + " if (\"+\" in pert) and (\"ctrl\" not in pert):\n", + " test_corr_combos.append(\n", + " [pert, spearmanr(mean_reconstruction, mean_observed)[0]]\n", + " )\n", + " else:\n", + " test_corr_singles.append(\n", + " [pert, spearmanr(mean_reconstruction, mean_observed)[0]]\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "9f7404d5-c4b3-44be-a6c3-0e213a6704b7", + "metadata": {}, + 
"outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# plot train correlation distribution vs test correlation distribution\n", + "import seaborn as sns\n", + "\n", + "test_corr = np.array(test_corr_singles + test_corr_combos)[:, 1].astype(float)\n", + "sns.histplot(test_corr, label=\"test_corr\")\n", + "plt.title(\"Correlation between predicted and observed expression per perturbation\")\n", + "plt.xlabel(\"Spearman Correlation\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "c6817a48-76e2-4d36-9725-68dab2568d2f", + "metadata": {}, + "outputs": [], + "source": [ + "np.savetxt(\n", + " \"figures/reconstruction_spearmans/scETM_replogle_spearman_correlations.csv\",\n", + " test_corr,\n", + " delimiter=\",\",\n", + " fmt=\"%.2f\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "64fc0b35-f87e-4d8a-87f1-e6080a3ee83f", + "metadata": {}, + "source": [ + "# Perturbation Embedding Heatmap" + ] + }, + { + "cell_type": "markdown", + "id": "c36ebe2b-2835-44ed-9f71-a52b522125b1", + "metadata": {}, + "source": [ + "## Hierarchical Clustering + Enrichment\n", + "- Perform enrichment tests (gprofiler) on hierarchical clustering of perturbation embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0aaf8dc3-7e5b-4748-8685-fc8560bf7fc7", + "metadata": {}, + "outputs": [], + "source": [ + "import seaborn as sns\n", + "from scipy.spatial.distance import pdist, squareform\n", + "\n", + "# aggregate cell embeddings to perturbation embeddings\n", + "cell_emb = adata.uns[\"cell_emb\"] @ adata.uns[\"topics\"]\n", + "perts = []\n", + "pert_emb = []\n", + "for i in adata.obs[\"gene\"].unique():\n", + " if \"+\" not in i and i != \"nan\":\n", + " perts.append(i)\n", + " pert_emb.append(cell_emb[adata.obs[\"gene\"] == i].mean(axis=0))\n", + "pert_emb = np.array(pert_emb)\n", + "pert_emb_df = pd.DataFrame(pert_emb, index=perts)\n", + "\n", + "# Compute the pairwise distances\n", + 
"df = pert_emb_df.drop(index=[\"non-targeting\"])\n", + "\n", + "distance = \"euclidean\"\n", + "distances = pdist(df.values, metric=distance)\n", + "\n", + "# Convert the distances into a square distance matrix\n", + "distance_matrix = pd.DataFrame(squareform(distances), index=df.index, columns=df.index)\n", + "clustermap = sns.clustermap(distance_matrix, cmap=\"viridis_r\")\n", + "clustermap.fig.suptitle(f\"Pairwise {distance} distance of perturbation latent vectors\")\n", + "# clustermap.ax_row_dendrogram.set_visible(False)\n", + "clustermap.ax_col_dendrogram.set_visible(False)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "bc7de299-d751-480c-b275-23ffab452b3e", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAhkAAAG/CAYAAAD1t8XlAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8fJSN1AAAACXBIWXMAAA9hAAAPYQGoP6dpAADAv0lEQVR4nOzdd3hUVfoH8O/0XjLpZdILCQmhdxQBKYqoiCirrGIB62Jf+dnWyqqsZXXFtrpiwy4qIr2J9N5JSEjvbVImU8/vD7yHSUybkEnB9/M8eUiGKWfu3Ln3vee85z0ixhgDIYQQQkgXE/d0AwghhBByYaIggxBCCCE+QUEGIYQQQnyCggxCCCGE+AQFGYQQQgjxCQoyCCGEEOITFGQQQgghxCek3f2CbrcbhYWF0Ol0EIlE3f3yhBBCCOkExhhqa2sRFhYGsbhjfRTdHmQUFhbCbDZ398sSQgghpAvk5eUhIiKiQ/ft9iBDp9MBONtIvV7f3S9PCCGEkE6wWCwwm838PN4R3R5kCEMker2eggxCCCGkj/Em1YESPwkhhBDiExRkEEIIIcQnKMgghBBCiE9QkEEIIYQQn6AggxBCCCE+QUEGIYQQQnyCggxCCCGE+AQFGYQQQgjxCQoyCCGEEOITFGQQQgghxCcoyCCEEEKIT1CQQQghhBCfoCCDEEIIIT7R7auwChhjaLA7e+rlCSGkW6hkEq9WrSTkQtJjQcbc/+7CoVJ7T708IYR0i6FRfvjqjlEUaJA/pR4bLjmQV91TL00IId1mT04VrA5XTzeDkB7RYz0Zgj2PT4JaLunpZhBCSJdqsLsw9Ll1Pd0MQnpUjwcZarkEanmPN4MQQgghXYxmlxBCCCHEJyjIIIQQQohPUJBBCCGEEJ+gIIMQQgghPkFBBiGEEEJ8wusgo6CgADfeeCP8/f2hVqsxcOBA7N271xdtI4QQQkgf5tXc0aqqKowZMwaXXHIJVq1ahaCgIJw+fRpGo9FHzSOEEEJIX+VVkPHiiy/CbDbjww8/5LdFR0d3dZsIIYQQcgHwarjkhx9+wNChQ3HttdciKCgIgwYNwnvvvdfmY2w2GywWS5MfQgghhFz4vAoysrKysHTpUiQkJGD16tW444478Le//Q3Lli1r9TGLFy+GwWDgP2az+bwbTQghhJDez6sgw+12Y/DgwXjhhRcwaNAgLFiwALfffjuWLl3a6mMWLVqEmpo
a/pOXl3fejSaEEEJI7+dVkBEaGoqUlJQmtyUnJyM3N7fVxygUCuj1+iY/hBBCCLnweRVkjBkzBidPnmxy26lTpxAVFdWljSKEEEJI3+dVkHH//fdjx44deOGFF5CZmYnPPvsM7777Lu6++25ftY8QQgghfZRXQcawYcPw3Xff4fPPP0dqaiqeffZZvPbaa7jhhht81T5CCCGE9FFe1ckAgOnTp2P69Om+aAshhBBCLiC0dgkhhBBCfIKCDEIIIYT4BAUZhBBCCPEJCjIIIYQQ4hMUZBBCCCHEJyjIIIQQQohPUJBBCCGEEJ+gIIMQQgghPkFBBiGEEEJ8goIMQgghhPgEBRmEEEII8QkKMgghhBDiExRkEEIIIcQnKMgghBBCiE9QkEEIIYQQn6AggxBCCCE+QUEGIYQQQnyCggxCCCGE+AQFGYQQQgjxCQoyCCGEEOITFGQQQgghxCcoyCCEEEKIT1CQQQghhBCfoCCDEEIIIT5BQQYhhBBCfIKCDEIIIYT4BAUZhBBCCPEJCjIIIYQQ4hMUZBBCCCHEJyjIIIQQQohPUJBBCCGEEJ+gIIMQQgghPkFBBiGEEEJ8goIMQgghhPgEBRmEEEII8QkKMgghhBDiExRkEEIIIcQnKMgghBBCiE9QkEEIIYQQn6AggxBCCCE+QUEGIYQQQnyCggxCCCGE+AQFGYQQQgjxCQoyCCGEEOITFGQQQgghxCcoyCCEEEKIT3gVZPzjH/+ASCRq8hMSEuKrthFCCCGkD5N6+4D+/ftj3bp1/G+JRNKlDSKEEELIhcHrIEMqlVLvBSGEEELa5XVORkZGBsLCwhATE4Prr78eWVlZbd7fZrPBYrE0+SGEEELIhc+rIGPEiBFYtmwZVq9ejffeew/FxcUYPXo0KioqWn3M4sWLYTAY+I/ZbD7vRhNCCCGk9/MqyJg2bRquueYapKWlYdKkSVi5ciUA4KOPPmr1MYsWLUJNTQ3/ycvLO78WE0IIIaRP8Donw5NGo0FaWhoyMjJavY9CoYBCoTiflyGEEEJIH3RedTJsNhuOHz+O0NDQrmoPIYQQQi4QXgUZDz30EDZv3ozs7Gzs3LkTs2bNgsViwU033eSr9hFCCCGkj/JquCQ/Px9z5sxBeXk5AgMDMXLkSOzYsQNRUVG+ah8hhBBC+iivgozly5f7qh2EEEIIucDQ2iWEEEII8QkKMgghhBDiExRkEEIIIcQnKMgghBBCiE9QkEEIIYQQn6AggxBCCCE+QUEGIYQQQnyCggxCCCGE+AQFGYQQQgjxCQoyCCGEEOITFGQQQgghxCcoyCCEEEKIT1CQQQghhBCfoCCDEEIIIT5BQQYhhBBCfIKCDEIIIYT4BAUZhBBCCPEJCjIIIYQQ4hMUZBBCCCHEJyjIIIQQQohPUJBBCCGEEJ+gIIMQQgghPkFBBiGEEEJ8goIMQgghhPgEBRmEEEII8QkKMgghhBDiExRkEEIIIcQnKMgghBBCiE9QkEEIIYQQn6AggxBCCCE+QUEGIYQQQnyCggxCCCGE+AQFGYQQQgjxCQoyCCGEEOITFGQQQgghxCcoyCCEEEKIT1CQQQghhBCfoCCDEEIIIT5BQQYhhBBCfIKCDEIIIYT4BAUZhBBCCPEJCjIIIYQQ4hMUZBBCCCHEJyjIIIQQQohPUJBBCCGEEJ+gIIMQQgghPnFeQcbixYshEolw3333dVFzCCGEEHKh6HSQsXv3brz77rsYMGBAV7aHEEIIIReITgUZdXV1uOGGG/Dee+/Bz8+vzfvabDZYLJYmP4QQQgi58HUqyLj77rtx+eWXY9KkSe3ed/HixTAYDPzHbDZ35iUJIYQQ0sd4HWQsX74c+/btw+LFizt0/0WLFqGmpob/5OXled1IQgghhPQ9Um/unJeXh4ULF2LNmjVQKpUdeoxCoYBCoehU4wghhBDSd3kVZOzduxelpaUYMmQIv83lcmHLli148803YbPZIJFIuryRhBBCCOl7vAo
yJk6ciMOHDze5bd68eejXrx/+/ve/U4BBCCGEEM6rIEOn0yE1NbXJbRqNBv7+/n+4nRBCCCF/blTxkxBCCCE+4VVPRks2bdrUBc0ghBBCyIWGejIIIYQQ4hMUZBBCCCHEJyjIIIQQQohPUJBBCCGEEJ+gIIMQQgghPkFBBiGEEEJ8goIMQgghhPgEBRmEEEII8QkKMgghhBDiExRkEEIIIcQnKMgghBBCiE9QkEEIIYQQn6AggxBCCCE+QUEGIYQQQnyCggxCCCGE+AQFGYQQQgjxCQoyCCGEEOITFGQQQgghxCcoyCCEEEKIT1CQQQghhBCfoCCDEEIIIT5BQQYhhBBCfIKCDEIIIYT4BAUZhBBCCPEJCjIIIYQQ4hMUZBBCCCHEJyjIIIQQQohPUJBBCCGEEJ+gIIMQQgghPkFBBiGEEEJ8goIMQgghhPgEBRmEEEII8QlpTzeAkL6AMQarw9XTzSB9SIPd2eLvhHSESiaBSCTq6WacNwoyCGkHYwyz3t6OvTlVPd0U0kcNfW59TzeB9DFDo/zw1R2j+nygQcMlhLTD6nBRgEEI6VZ7cqouiN5T6skgxAt7Hp8EtVzS080ghFygGuwuDH1uXU83o8tQkEGIF9RyCdRy+toQQkhH0HAJIYQQQnyCggxCCCGE+AQFGYQQQgjxCQoyCCGEEOITFGQQQgghxCcoyCCEEEKIT1CQQQghhBCfoCCDEEIIIT7hVZCxdOlSDBgwAHq9Hnq9HqNGjcKqVat81TZCCCGE9GFeBRkRERH45z//iT179mDPnj2YMGECrrzyShw9etRX7SOEEEJIH+VVfeQrrriiyd/PP/88li5dih07dqB///5d2jBCCCGE9G2dXoTB5XLhq6++Qn19PUaNGtXq/Ww2G2w2G//bYrF09iUJIYQQ0od4nfh5+PBhaLVaKBQK3HHHHfjuu++QkpLS6v0XL14Mg8HAf8xm83k1mBBCCCF9g9c9GUlJSThw4ACqq6vxzTff4KabbsLmzZtbDTQWLVqEBx54gP9tsVgo0CCEEPKnwRiD1eHq0H0b7M4Wf2+PSiaBSCTyum2+5nWQIZfLER8fDwAYOnQodu/ejddffx3vvPNOi/dXKBRQKBTn10pCCCGkD2KMYdbb27E3p8rrxw59bn3H7xvlh6/uGNXrAo3zrpPBGGuSc0EIIYSQs6wOV6cCDG/tyanqcG9Jd/KqJ+P//u//MG3aNJjNZtTW1mL58uXYtGkTfvnlF1+1jxBCCLkg7Hl8EtRySZc+Z4PdhaHPrevS5+xKXgUZJSUlmDt3LoqKimAwGDBgwAD88ssvuPTSS33VPkIIIeSCoJZLoJZ3elJnn+TVu/3vf//rq3YQQggh5AJDa5cQQgghxCcoyCCEEEKIT1CQQQghhBCfoCCDEEIIIT5BQQYhhBBCfIKCDEIIIYT4BAUZhBBCCPEJCjIIIYQQ4hMUZBBCCCHEJyjIIIQQQohPUJBBCCGEEJ+gIIMQQgghPkFBBiGEEEJ8goIMQgghhPgEBRmEEEII8QkKMgghhBDiExRkEEIIIcQnKMgghBBCiE9QkEEIIYQQn6AggxBCCCE+QUEGIYQQQnyCggxCCCGE+AQFGYQQQgjxiR4PMhhjPd0EQgghhPhAjwcZN76/iwINQggh5ALU40HG/rxqWB2unm4GIYQQQrpYjwcZhBBCCLkwUZBBCCGEEJ+gIIMQQgghPkFBBiGEEEJ8goIMQgghhPgEBRmEEEII8QkKMgghhBDiExRkEEIIIcQnKMgghBBCiE9QkEEIIYQQn6AggxBCCCE+QUEGIYQQQnyCggxCCCGE+AQFGYQQQgjxCQoyCCGEEOITFGQQQgghxCekPd2A3ooxBqvD1dPNIL1Ag93Z4u/kz0slk0AkEvV0M8ifRFvno44en3pqn6UgowWMMcx6ezv25lT1dFNILzP0ufU93QTSCwyN8sNXd4yiQIP4nDfno7aOTz21z9JwSQusDhcFGISQVu3
JqaKeTtItuup81FP7LPVktGPP45Oglkt6uhmEkF6gwe7C0OfW9XQzyJ9UZ85HPb3PehVkLF68GN9++y1OnDgBlUqF0aNH48UXX0RSUpKv2tfj1HIJ1HKKxQghhPSsvng+8mq4ZPPmzbj77ruxY8cOrF27Fk6nE5MnT0Z9fb2v2kcIIYSQPsqrkOiXX35p8veHH36IoKAg7N27FxdddFGXNowQQgghfdt59bvU1NQAAEwmU6v3sdlssNls/G+LxXI+L0kIIYSQPqLTs0sYY3jggQcwduxYpKamtnq/xYsXw2Aw8B+z2dzZlySEEEJIH9LpIOOee+7BoUOH8Pnnn7d5v0WLFqGmpob/5OXldfYlCSGEENKHdGq45N5778UPP/yALVu2ICIios37KhQKKBSKTjWOEOJbVNnWO1T9tfOoSuqfk1dBBmMM9957L7777jts2rQJMTExvmoXIcTHqLLt+aHqr96hKql/Tl4FGXfffTc+++wzrFixAjqdDsXFxQAAg8EAlUrlkwYSQnyDKtuS7iRUnOxrdR7I+fHq0166dCkAYPz48U1u//DDD3HzzTd3uhGMsU4/lhBy/qiyLfGVnq44+WfS0vBne0N8vh7G8nq4xBeo+4yQntUXKwkSQs7pyPBnS0N8vh7GogXSCCGEkD6us8Ofvl44jS5dCCGEkAtIR4Y/u2sYi4IMQggh5ALSm4Y/abiEEEIIIT5BQQYhhBBCfIKCDEIIIYT4BAUZhBBCCPEJCjIIIYQQ4hMUZBBCCCHEJ3rHHBdCCCF9grcr957vyrW0emvfRkEGIYSQDjnflXs7s3Itrd7at9FwCSGEkA7piZV7fV32mvgW9WQQQgjxmq9X7qXVWy8MFGQQQgjxWm8qXU16LxouIYQQQohPUJBBCCGEEJ+gIIMQQgghPkEDaoQQQsgFrKXaJm3VL+nK2iQUZBBCCCEXqI7UNmlev6Qra5PQcAkhhBBygepMbZOurE1CPRmEEEJ8hjEG5nB7/Ti33dXkdzc6d1UtkompWujv2qtt4ovaJBRkEEII8QnGGMrePgR7jsXrx1rB+O9Fz+2EqpNBhjxKj8A7BvS5QIMxBrfbCpfrXLDlclnhckkgFqs69X56orYJBRmEEEJ8gjncnQowAEAFEX6F/rzbYM+xgDncEPmwOmlXY4xh777ZqKnZB5tTDmAJAGDr1uFQSO0wGIZgyOAv+kTgREEGIYQQnwt9fES3nuiZ3YWi53Z22+t1JbfbipqafQAAhdSO/07+W5P/r6nZC7fbColE3RPN8woFGYQQQnxOJJdA3I1BhvdZIL3TuLE7eTDhcjVg668jerhF3qEggxBCCOmlJBJ1n+ixaA1NYSWEEEKIT/SKIIMx1v6dCCGEENKn9IogY/7a+RRoEEII8Rpj7GwdjRZ+BK39P513zmGMocFh5X83OKxocDSc9zbq8ZwMbdITOFThgNVphVrWd8edCCGEdK+O1uEobmWWSV+todHVGGP466q/Yn/JUQDPAgDGf3kxRGIHBgUNwkdTP+r0NurxION8McbArNb27+iFJhFwgxVuZ9dnRItUnSumQggh5KzzqcMB9M0aGr5gdVpxoOwARGJAl/xok//bX7r/vDoB+nSQwRhDzl9ugHX//i593kaJHLjiBQBAxpixULrsXfr8AKAaPBhRn35CgQYhhHQBb+pw9NUaGp5DFy5XQ6crf7Zl0+xNUElVsDqtGP/l+PN+vr4dZFitXR5gAIDSZceq7x/q8uf1ZN23D8xqhUhNQ0SEEHK+WqrD0dq6KZ5ZBp49102er5etecIYw/4Dc/nfW38d4ZPKnyqpqktTF/p0kOEpYduvEKtUPd2MdrmtVmSMGdvTzSCEkAtaT+VrMMbgcDj433aPIMZud0DqUSZMJpN1+PndbissloNNbmu38idjgL3h3N/2BkCmA7oxeLpgggyxSgUx9QoQQkiP8uw9aD7DQ9AdvQQ9ka/BGMMHH3yAvLw8fpuDiQEMAQC
8/PLLkInOBRlmsxm33HKL19ti9KhN+G37+PYaA3wwBcg9AODDs7e9HA9EDgJu+aXbAo0LJsgghBDSs9rqPfDsMejuWR3dla/hcDiaBBgAIBO5cbNyd4v3z8vLg8PhgFwu9+p1JJKmvfaMMdR7rNZa73JBxRohytsJtQg4o/yLx4vuABwNgFzj1Wt2FgUZhBBCukRHew+6e1aHN+umdNWaJw899FCrwYPdbseSJUu65HUYY5ixLxO7K2uh/P22tF+PYrifGisAiADgocyz/7Ekvkte0xsUZHQBb6bRuj3u5/Zi6i1NeSWE9CUt9R701VkdnSGXy73uoegMq5tht6UekIrROCWc376rthENYiU07kZA3nOpBBRknKfzmUbrTQIoTXklhPQlLfUeXCgro/ZWh8f0BwCkbTvawy05h4KM8+SrabTN0ZTXtjHG4LTTIcwbDo9EPIfNBQejANYbUnnvmuJIvNfSFNfWklUFvW1qqye1pFesFNIEBRldyBfTaGnKa/sYY/j25X0ozqrp6ab0KXYwwHj29w8e/hVy9M4DZ28VGmfA1Q8N7rUnHNK2jkxxbWl6K5Ui9w4FGV2IptH2DKfdTQFGJ8ghwsPVvb+2TG9VdLoGTrsbMsWfuyR1W5r3FPTklNY/tK2TU1ypFLl3KMggF5R5L42lgz7xKYfNhQ8f+bWnm9HrMcZQ3kZPQU9OaW2uI1Nc+3TSKmOAxwqrZ4tyqbulVgYFGeSCIlNIKMggpBfwpqegp3sHOjLFtasyvhhjsNvtTf72uWVXAgV7zv29JB4wj+yWolwUZBBC/pQ6myzssLla/N0bf7akUc+eAs8hFGZ3oeSlsye/nh4+6Q4tVQRdtmwZbrvtNt++X88AQ9BNRbm8DjK2bNmCl19+GXv37kVRURG+++47XHXVVT5oGiGE+EZXJQt3dtjkz5Y0KvQUtDWE0puGT3ylpYqgBQUFnar62Sk9UJTL6/ku9fX1SE9Px5tvvumL9pA/EcbY2amTXfAj6Irn6pbuS9KjejpZWEga/bPxtiLohWzhwoWdfmynD1FydbcX5vK6J2PatGmYNm2aL9pC/kR8Ne20KxLy/mxXmX923ZksTEmj5/zZK4LKZLJOP/bgodv4715fFHnevxsuqHyek2Gz2WCz2fjfFkvnV8UjF46evpJsS9HpGlhrHT458fzZxuL7gs4kC3dF8bfO5nMAF8Z+1NcqgnqezN12V5O8EWF5d8+ETrvd7tVS7t6orT3Mf/fq+Rk7mwQqWHYVcNvarmtYC3weZCxevBhPP/20r1+m1/NmfRNPnV3rRNAX1jzx5kqSMYYVrx1A6RnfBqu+utqkXpK+r6fzOYDu248YY3B6XCQ2qRLbaIPD7QQASBWK824LY+wPdTRaSgBtbyl5XySNMsZQ/v4R/nfxczt53giAPyRzAsCSJUs6tZQ7YwwuVwP/2+VqgFjcRfVsHNamSaAFu88mf7awja1Oa5O/O8vnQcaiRYvwwAMP8L8tFgvMZrOvX7ZXOZ/1TTx1pvJnX1jzxJsrSYfN5fMAw5eogFPf1xt64bpjP2KMYfmTj6Dw1HF+m0MkBaJvBwAsnX8DZOxskBGWlILrn37xvF6refVNzxO5Z49Be0vJe578uwpzuOHIq21ym5A34oTrDwGGwNul3Blj2LtvNmpq9vHbtv46AgbDEKQP+KDzb8ALjDH8ddVfcaDsAL9t/rr5+GTaJ516Pp8HGQqFAgqFwtcv06t11/omLbHu2wdXZWWXlzvvDT0kfanwFo3FX5i6ah/s6PCL0+7Cx49vB9Cx4ZbzGVZx2mxNAgwAkDEn7s1e+of7Fp48BqfNBom4c3kGrSWENq+f0ZHEUXuO5WwviI+OTyGPDEPxS7tb/D9heffOLuXudlubBBiCmpq92H/gRq+f7w860CPR6GpsEmAAwKGyQ016NrxBdTK6WfP1TRhjyJ13CxoPHfLZa/pi7ZPe0ENChbdIT+uKfbCzwy8dCVq7alj
lznc/gUyh/MPtDlsjls5v+eTn2cXuTXd76OMjAKDdBNDmiaNumxPFz+8CAJS/fwQBt6V2+DW9IZK3PimzK5d3Hzf27Pvf+uvZ7eGZh9Fpn13n1d1XzVyFad+e30QPr4OMuro6ZGZm8r+zs7Nx4MABmEwmREZGnldj/gyar2/ibmjwaYDhK7QqLCFdw5fDL101rCJTKCFT/jHIaM4zmKj43zH+e/n7RxB0V3qHXqujVT/bqtLpyKttdQqskNPRXfkcnSWR+ODYWvTHXpK2KCXnPvPO5mV4HWTs2bMHl1xyCf9byLe46aab8L///a9TjbjQdTSi98Uqrl2tL6wK2xuXfe+KKpG+ciHMVPCF1vajjnyWnd2mXTX80l3Dc01mXLjdqPzwXHKks6DuXHvaOOl3p9ZyOprnc9D34ax7NtzDf5+/bj7eGvOW18/hdZAxfvx4KlaEc7NFWpr94ZmvIAyHCHJvuRXRyz9vcSemVVzPX19Y9r235WbQjJezPIMKxhh+eP0ASrLbHv9v7bMMjtFjxsKBkCkkXm3XvjQEyBjDV889zv/+9vmnMc41vQdb1L72cjo6sobKn+n8d7TiKP+9s3kZlJPRCa3NFhGu8D3zFZjV2mQ4pPHgwV49zNDeVFtvp9R2d4Job8j872toxkvXB6cl2Ra8d9+WCzqAc9psKM48yf8uOX0KiO659niryXoqXhQBq/jo3DAQYwy48D7aLtXrg4y2TnodPeF19YmuvdkifTVfwdupth0ZNunJBNG+NPukJ9CMl3O8CU6b71dtbUcK4LqGL/IoPHM6vBnIceSfGwa6IIJHxny6EmuvDjK8Oem1dcLz5YnOM4+iL+QrtMUXU217MuDqS13PpPdoKTj1DCTa2q+Ex1IA13VaW1TNl3Ux/lSWXQXctMJnT9+7g4wuOun58kTX2TwKxhjcDQ1wVVXx25yVlRBbrX9I/uyJmhTnm4Ta1wMu8uclBBHtJX62lNwpU0gglYubJIQ21NqhhrzVx5yPzian9qVk347kUfiyLsYFr2A30MkaGB3Rq4MMT5056fXWEx1jDLkt9NCcnnRpi/fviSGH3pqE2tm5971ZT82G6S0zXrr7hCes/itw2FyQNqt90FaOhtBDIeRbtPe4T34vnuX5GJFIxD/31j6H9rZLR/NIWupR6au5IkIeBWMM5e8d5kMXvqyLQc5PnwkyeutJrzNYY6NXPTR9Ncejqwnrlgh+eP0ArnlkSJccsHtKb5kN05Nd+915wmtpe3/4yK8IjtE3uV9HcjRaWq69vccJj5HKxS1+7p6fQ3vb5XySnPtqroiQR+G2u5rkRrQ0RbYvXZD09vadjz4TZFzI4tatbdJLcyHkePjiC+60u5usW1KSbemyA3ZPodkw3XvCa217tzVV1ZtET899fe5zo3gJcM/f22qHJ2+2S0eTnHsyV8RzYTOg9QXQhPuer44UA2stobQ7inJ99tln/Pdly5bhtttua+PevuXLgOyCCTJamoXS1uyT3rD2hkBqMl0wvTSAd7VBukJXH7B7yoU6G6a14aD21uHwde/TvJfOBvDtnXTbSyD2PCj//Na56epSj1oL0jbqLsx7aSykcjHfRi1tl/a2RV9Ici5//0iTRcaar2QqaL7iaWdPeu0VA2srobQ7inIVFhby3wsKCuBwOCDpio/QY3vNHrAEPx24p81ZtgzA7Rvv5X/f6/F7V7gggoyOzEJp3iPQG9beuFD1ZG0Qb646e5u+cKLwVm/OG+iqbe3ZvtKc2jbu2TKpXIwf/32wQ/kfffl41XwVU+Bc8StPzVc87eh79jYYaSuhtCNFuXotj+2115CGBrESGndjq3e3ikQ47FF060j5kVbv2xkXRpDRiVkof9Y8h95Yd6QrXYgn6r6sL+YNMMZ6XQG589kWnj1JvSFnqaMLoHnLc3ikNU3KoHsMkYQ+PgKQieGud6DkpT38/3t6LZMLIVfjgggyPLU3C6Uv5zmcr75Qd4RcuPpC3gD
gmwJLzce8W3uNru6J68gsGaB7e0q87R3oaL6A5/BIa8/jOQxT+nswAQCQiVHxwdEmPRutDed0F8YY9h+Y2yOv3ZrOBD0XXJBxIcxC6ei6KF4/bxfWHXFVVjYJ5jrSro5cUQG9dyaIoCunnPpqGmlv3IZ/5l6mlR65Gj+8fgAzFg5s8X5dvY062pPUvKfk7DTfc13sDlsjpApFj+xTnV3NtbnmwzDN/6+loZOWhnO6i9tthcVysEdeuzULNy30+jEXRJDRpAvMav3DCc9ziKA3JIO2FQ16sy7K+Wivx0dI3mxtGXpvc1wYY/iuA1dUQO8ef/bllNOuvHrvzdvQU0sBW08XkmreJqGOhueih5737UhbyjxyNYRZUR1pQ0vbwpv333y2i1Ira/JYe6MT//v7tib3ZYxh+ZOPoPDUcX6/pfNvRFhSCq5/+sVu36d8sZpryCPDUPzS7hb/z1fDOReCzuRr9Pkgo/lMhowxY5suUNbGEEFPJYPm3XFnq//XXeuitNfj425oaDXA6Ey7vBmb780zQfrKlNOi0zWw1jq6ZBv66qTekYCtJxJCm6+++uEjvzYpvNW8VktrvRJtcbvPnSibX3S0tl3aKgLWkuZ1ZT5+fPsfioF9/+rhJu/lmkeGwGmzNQkwBIUnj8Fps0GmVHboPbbXNs/fuz0YlrX+ekKxL4GQm9FRjDHY7Xb+t91uh0wm61BP74Wo7wcZzWYyAE1PeN4MEQjDABKTyac7ve1Ix6LB3rIuSlu9Hp1pV2tj874Yi/flF7erppwKJwPPGiBdoau2pbBseVvfidYCEc/t37xHoLMBW2sBVFcFQy3VzPAsvNVSrZaWtLXvffrkDv6757RXoGMFvRy29stoN68rIzxWCOBbqzvj6c53PwFwtiejq7jdblT899xshp6o1um5kmpzzXM3ip/bCZlZ16HnZYzhgw8+QF5eHr9tyZIlMJvNuOWWW9p45NlaGfPm/aVDr9OX9LogoyNDG60Na8StW9ukNDdjrMnj4tathcTPj0fxLQ0HNO8J6Yr301m9Jb+krXZ0NCnL8/+ajzu31mXekZNGW1dBza/kujrg6Krxc4fN1eUBRlcSli1vi2cgInxuzbe/Z49A889MCNg6GnB1Rw9HR+totLZfrfxP6+Ppnvu757TX5t+Fm18cA7lS+nuvw34+7PLD6wdwxd/O5Sa09325/snhWP7Mrjbb2xKZ4vx7LZorf+/IeQ2BdEXPh2e10D88fwu5G63lcvzheR2OJgGGIC8vDw6Ho83HCrUyLjS9Ksjo6NBGa0GA59V2S+uDnJ50KX8sWugBEXTl9NbePiZ+PlorutWStgKB1rqGhROXTCFp9fFtbd/mV2od+SzaS+rsaKJmZ6+qu7ogV4sBnN3VZD2NruAZiAgn+/aupD0JAdv5BFzNezjOt2ejo59D854IQVlu27MdWrLitf0oPXPuhLbq7cNnhzDs7j/kdfzQwQAOAA8wgHPDIj2lvVkg7elsz4fnEFVHBT88FCUv72n/ji146KGHAJztyfgz611BRgeHNjoSBLS2PojwWE/xv26FSCSC22rlPSEtJZCSploquuVuaPBqm7XVNSycuNoag24vSc8b3iZ1tnWF2/yA31bw0tFZJd6+r84mqXZkeKStoa2W1vRoXla7rTZ7PqZ5omJ7PR2ebepIkNreVX1Hcgc6U4CrNZ4BBtD2cEzz1xWGUeTKtg/rJdmWDg23CLpjDRBvnrezyZ+lnQgWKj8/4fVjBHK5vNOPvZD0qiDDU0t5AK6GBmSOHQfgXBDQ0ecCWq/9kH/X3T4fNvmzyL3lVkR+8N8O39/z4HLzi2MgEon+cPJq6aQlaC9JzxtdmdQpXFWrdDIA6PDJ3pvApT2dfT8l2RaIRKIOX8kLvS/NZyp4trOtstqC9hIVAe+GlpoHqc23W/PXa689nU3y7IyOBmUAEGDWofz37vwVr+7HNX8fArH4j4mKnvfr6HthjOGr5x7nf3/1/BP
4y7Mvd/kxsVuOsQ7vA6S2hlVIx/TaIKN5HgBjDPk3z+N/C0GA+b13O/Rcf5jm6pEh7e2wSUuRffP8D2+CoAtJ48GDYI2tl7D11Pwgvurtw00OfB050LaXpNfa67b0/549CnOfG9XqibF5j0JrV9dCwDP93vQuCV7OZ9ZNR4ZhOpt4K1NIIJWL/zBTwfOzbO0743lba8MrrV2dd7RwVWvbraXXa+v/hR6A7tCRoExQ7pEvUJpTi29e3Itr/v7H4RDP+3U0YdVha0Rx5kn+d3HGiS6bYdKbdLa3hjGGNhcGIb03yGiu1VkkHTihtTTNVZn+x4IuQu9JWzMmWspDiPr8M+TecGOT4RlvgqCO8NzxnZWV/IPry0M67WW3e3OgnffSWDDGWr2SFm7r6BBCW8FN8274tq6uhZOkZzu9DRK6YtZNS0mq3g7htDVc095n6VmMasWr+zHzkcFt9hI0v+q+5pEhf3jtthJvO5q06a3W8i+62vlM8SzNqcW3L+/rVK9L856Lb1961uvn6Is8C351pDy5oPz9IzDeluyLJnUIA9Dg8vgOM9Zk7ZLeoNcEGWd7As4FDK6GBrit1hankzafRdLuczc2/iFAaTz4x8zvjszmaDEPoaqq9fyPDl7Vt/mazQIbz/euGpiOyM8+a7F7tKe0dCXQ0lTGriSVi5uctFa8uh8z7hvY5D7Ciex8exXayhURVtT0nAngeWLqyIwUb07+vso/6YrZG56fuWfSYmlOLb59aV+TnILmQUnzq24hh6Cjxal8VWOlK/Mv2rLyrXPHp5b25fZ0tOBXc067rUnPRenpU169bnfp6twQz2RUbxJT28sP8WynUC+jqzAAT+N5ZOzI4rfNPpiFL9Nju+w1ukLvCDIYQ8lfb4XtwLkvlpB7wWeDeGitZkNHurxi165B1qWTz7fFrWov/6MzWurFEVgPHETOnL906VLqrQ39dPT5mxcbYwB+fPtccR/PYYSWXrszml9Jl+bUNsm+B/544D3ftTRaGpYRnq/5SbWjvD35B8foMfPhwV4HmZ3J1/AcuujIlXZbV/3enqyb594AbRenah6o9bVCR2U55050Le3LXaGlz6entlPz4lftKf/waLv36WmMMSxbtoz/vWTJEoSHh3fZ89ugQIaoX5Pb9loa0ODuXft6rwgyFA40CTA8Wffta3PGgufOme9xcsttpapm4f0PnEdL29c8/8Mbnc3raDx4kK8l0lIwINQe6chaKC1NI/Y2CbZ5sTG3WI7S3Pomt7WWN+F54HPam161ttX70dI2b+9E1pk6F77qhhd4e/Ivybbg25f3tTic0DwQ8DzxtpR/0pFckx9eP4CZD7c91CHwNpBo63vTUu6NoKV9qflU0M5M2+xNgUlnelDaa39Lz+nt7KWu4jlE0WThsla4CuvbvU93aW07OBwOFBQUNLmt+d9dZefIZIzY8ccqrb1BrwgyPLXUE9DWjAXPq+ZGj5Ob3eN3z52gsYPVNjur+dBGR7+IbZ3cO5LX0draJq3VHmn1/q1MI+6q2iHtJXN6Hvg87/fhI78iKKr1qnvdNVbekaCkMwdfIQBoLfnUMwBo3rMiDCd4Jkc2T6pd8dp+iMUiFGf98WQtbOfmBbVaSowsybbAWuv4Q/6FvdHp9XturiOfoWfvk+dsFofNBYlHqeiWpoI2H2pq73P66Y0DHWl2r7XyTd8trtU8d+N8nW/tjJ7UVvVQwcKFC/H666/7rA1qSe8ZLm+u1wUZLQ2FNJ+x4Hlw6GiJbm/8YSaKF0MFzfM/Ovy4Nk7uzRdya0vzYOB81kLxxdCPJ29Pxm1dzXX0Ss9uO3cybKi1Q42zc9nPp3DT+STpCY9paYjEM8hqLx+ieXLkH4aPzrS/fZoX1PIczvJMxGwpGPAsk91ZHfkMhd6nsxUwz81m+fCRXxEUo//D/ZtM73yt6fegvaCmPL/3XC13Rlme9yfujnwnGWNw2prmbnSVthYu660c+XVAO5NtujIXo6/
pdUFGazyrteW3scBYS9pakMyT0KXc1oJr3SFh269gjPG8lLba35EZMc3vyxiDq6qq3cJjIqWySXDXFcvNe544V3bgytWbegEd8ekT506GnlUvmxegaivoaBKEut344fVzV4ze1B/wHLpob4iked2N5lrqzWiNUOTKaXd3uKBW82mSzbWXZNhVhGGzlnpZSlsYUmnS7maBVnclcfYlK//T/nfyq+efwLWP+WbWiaiLE8JJz+sdQUYHrgRPT57Cf/d2yKOjvR3CsExbC651B5FSiTyPQKet9nuzvomQs9G83LpnIOV5Am2+tkvzIZbzVdaBg7zEi9UPz0fz9Tk8p6m2GVS8dqDNWRIt6cg02uAYPa74WzpWvHaAb6e2EmaBpr0ZbV2RCkWuPJ9HGIbwxSJ1XUkoN++5bgfpOh0JvIozTsBpt3VDa/58elMeUFfp+bCRMTz++bmDcu7NN7dcY96LIYPOaj4sE7dubYce52po4L93djzec0jEVVXV4WXWGWNw/z7dV+C2WnmvjOftQgJoq8MyDQ1NenHaK1LWHbor16I5Iej49uW9TbrZV7yy/w+zWJprbx/oSHKnEKw0D8TaKjQm9Ga0V8mypecRhiGaL1zXG5VkW7Di1Y6trEz6lt66z7WFoevKrn/22WedbMS5121SN6MX6PGeDIUDSCo693fj4SPI9ajs2ZNamyrbnOeU2I4OzXhq3mPgTQ2Q1laSVQ4aBBHQJKBonkDbPOeipXoinmJ+/AHZV8w4e18vcg/O52vn6y7t9maLNE+U7Mg4tzcJd56JjGfzDFqur3HjsyPxye9DPW0dyIThmtaKg3nmKDQP5hlj3VbR8nx1ZvEx0nV8FQx0JImyN2FgWCU/d4xtLUjoaPBQWFiImE6UuZh76Az/vbfNMunxIKMlvkjm7CjPA2/zXoCOaN72jjyuo70W3jy2sYXeisaDB5u8p44GUQIhwAA6vkYJA3Ag/V6vXqc7NZ8t0tI6HN5qLxBpUpjM7uJluZv3XHgGWKvfP1cXoK3enfaGazxzFFYtPZc46Xa78e3LB5r0sPhydgLpnTp6nPNVJdC+tlaIE26Uic8F9EVFRS3er7Cw0KftOFDX0P6dekjPD5f0Mp65H549Cq0O4/wudu2aFm/PW7Cgw6+dsO1X3rvgrbh1a5G4dw/it275w/8pUlLOtad5oaxO1uzv6BolbrEcFkPvqkDnSRhW8iSRifCzxwm4K7nd7ibDL588vh3vLtyMb17a2+Y0UG+Ke3X0c/R8npaSTzszO4H0bR3tneytlUAvJJ3pLdo5sudKnLeGgozmWsk1aDx8BDnXXddqoNFakS/b0Y53/4lVqiYLt3nj9KRLkXf7/Bbr1tuOnWuDrVn9EM8cjI4M9XgGU31x/LS5Fa/tb3LS//CRX/Hty/s6vNKnt3547UCL00lLsi2d7jlprjN5LJ5TUAMjtV3SDkL+zM73+Hjw0O1eP6Yz9TJ8ncHRK4dLequ28kW6oshX85O+t7xdK6V5DkZHhqk8g6nO5J/0Nq2d8H32et0wbbIzr+E5xEL5DoScv04ncf6utrYTvamdCGxsPi7NQD0ZXvJlvoi7jTVK2uI5HOL1MsVe8gymejJ3xhfmPjeqx147wNx6NVNCSN/TWn6GL3kmgPYWFGT0Ip3tGfAcDvGmSNaF0BPRlbqrJkdLPBMyCSGkM3pjAigFGb1IV/QMeNM7caH1RJwvz+W1CSGEnD8KMi4wBQvv6+km9Fmey2sTQgg5fxRkXGA8h04IIYSQnkRBBiGEEEJ8goIMQgghhPgEBRmEEEII8QkKMgghhBDiExRkEEIIIcQnKMgghBBCiE9QkEEIIYQQn+hUkPHWW28hJiYGSqUSQ4YMwdatW7u6XYQQQgjp47wOMr744gvcd999eOyxx7B//36MGzcO06ZNQ25uri/aRwghhJA+yusg45VXXsGtt96K2267DcnJyXjttddgNpuxdOlSX7SPEEIIIX2U1Js72+1
27N27F48++miT2ydPnozffvutxcfYbDbYbDb+d01NDQDAbTu7WpzL6kKdy+VVo0nf4WIuWO31Pd0MQkgHMOaA3eHo1GMlIqDW9uf6rjvggk1ka/+OHSQSOVBf7+7QfW1wwS1qf70lS6MbrjYWzqwVMbisHTsHC/fzZiFOr4KM8vJyuFwuBAcHN7k9ODgYxcXFLT5m8eLFePrpp/9we8HSmwEAeQCGe9MI0vecmtHTLSCEdINFWNnTTfiTGdfuPcI69Dw1Xr1qbW0tDAZDh+7rVZAhEIlETf5mjP3hNsGiRYvwwAMP8L/dbjcqKyvh7+/f6mMIIYQQ0rswxlBbW4uwsI6FLoCXQUZAQAAkEskfei1KS0v/0LshUCgUUCgUTW4zGo3evCwhhBBCeoGO9mAIvEr8lMvlGDJkCNauXdvk9rVr12L06NFevTAhhBBCLmxeD5c88MADmDt3LoYOHYpRo0bh3XffRW5uLu644w5ftI8QQgghfZTXQcZ1112HiooKPPPMMygqKkJqaip+/vlnREVF+aJ9hBBCCOmjRMybuSiEEEIIIR1Ea5cQQgghxCcoyCCEEEKIT1CQQQghhBCfoCCDEEIIIT7RK4MMt7vt2u0//vgjlixZgp9++qnd52poaMDOnTt5rfXDhw93SRt7QmNjIz799FO8+OKL+PTTT2G1Wnu0PW63GxaLBS6XC9u3b0djY6PXz9FXPh9HB9dzOHHiRLv3af65LVu27A8/Xe2ZZ57hP08//TQeeeQR/n+FhYUAgJKSEjz//PO488478fzzz7e6VIC3SkpK/vBaXeX777/HZZddhsrKSrz00kvYs2dPlz6/p+zsbK/2z/r6s+t4HDp0iH/mzZ+jrq79tSe8UVVV1aXP56moqMirNSua68x33fNcsG3btk6/tqfTp0/z34uLi/H444/zz2rTpk1d8hp9UVdt3+Z6bHZJQ0MDDh8+jOHDh0MkEuHgwYNwOBxISUnB6NGjMXPmTHz55Ze4/fbbccMNN+CJJ55AVlYWRowYgaCgIBw+fBh79+5FQUEBPvroI+zevRsPP/ww5HI5Fi9ejEmTJkGpVOL6669HcnIyNm/ejAULFqC0tBRmsxmPP/44Tp48if/7v/+DWq1GfX09nn/+eRw9ehSnTp3CpEmTUFFRgY8//hgWiwU6nQ6RkZFQq9Ww2+3o378/TCYT1q9fj+uvvx5arRbBwcF49NFHkZWVhSeffBKffvoprrzyShgMBqxZswZlZWUYPnw4rrvuOqxZswbr169HcHAwUlNTcebMGWzYsAEOhwMrV65EYWEhPvzwQ+zevRuXX345rrvuOsyePRu33XYbMjIykJ6ejq+//hqPPPII/Pz84Ofnh08++QT19fWYM2cO8vLysGLFCjz11FM4fvw4VqxYgXnz5uFf//oXcnJykJaWhquvvhqnT5/GgQMH8OOPPyIiIgLl5eV48cUXAZw9OYSHh2P48OH48ccfERMTg7S0NADAe++9h/feew8xMTHYv38/LrnkElitVixbtgwbN27E+vXr8eSTT8LlcuGdd97BwoULkZubi2PHjmHQoEF46aWXcM011+Df//43Ro0ahfXr1+Oaa67Btm3b8N5772HTpk2oq6vD+PHj8f3330MkEsFgMGD69OlgjOHpp5/GzJkzsWTJEgQFBeHRRx9FQEAAvvrqK5jNZuTl5WHfvn0oKSnBG2+8gfXr1yMnJwdarRZ2ux1XXHEFnnjiCcTHx2PevHn4xz/+gfvvvx/l5eVITU3FoEGDMGjQIDDGcOmll+Ktt97C4cOHsWnTJowfP57vx6dOnQJw9sC1bt06fPnll4iLi4PD4UBQUBBuvvlm2O12TJ06Ff/85z/x/PPP46677sKbb76Jffv2YdCgQUhNTcWAAQNw/fXXQ61W49ixY4iIiEBkZCRGjBjR5veosbERX3/9NW688UZ+2/r16/HDDz/grrvuQlJSEhhjmDBhAoKDg3HLLbfgmWeeQVx
cHFQqFcaMGYN///vfeOyxx7BkyRK89dZb8PPzQ21tLV588UV89NFHGDlyJBwOBzQaDfLy8uDn5wfgbHE+AJDJZEhNTUV1dTXuvPNOfP7556irq0NQUBBycnJgNBpx4MABXHvttcjIyMDtt98OlUqFoKAgfPzxx7jjjjuwf/9+LFu2DL/99huuvPJKLF68GMDZoO3UqVOQyWSwWCw4dOgQ/Pz8sG7dOmRmZqKwsBDjx49HTk4OYmNjoVarcd9992HQoEFQKpV8m5SVlWH79u2oqanB3LlzUVVVxd+H1WpFRkYG0tLS8Ntvv+Gf//wnfvzxRxQWFqKiogJ33303Lr30UixbtgwSiQSTJk3Cm2++icbGRtTW1iIjIwOffvopZs6ciccffxyffvop7rzzTuTm5qKxsRFKpRJutxsulwtSqRSJiYmoqqqCUqnEiRMnIJFIcO+996KiogIvvPACvv/+exw7dgz3338/VCoVNm3ahIsuugh79uzBt99+i7vvvhsvvfQSEhISkJSUhKeeegoTJkzAzz//jIKCAgQFBWHfvn1QKBR48803ERwcjCuuuAI2mw2//vor0tLScPfdd0Oj0WD27Nmw2WxIS0vDF198gVmzZsFgMODxxx+H3W7Htm3bkJiYCJVKBZVKBY1GgwULFqC8vBwJCQlYtGgRKioqMHv2bAQHB2P69OnIy8vDyZMnsXfvXpSWlkKn02H37t3YsWMHdDodzGYz/P39cezYMQQGBiIhIQE33XQTtmzZgttuuw3fffcdJk6ciK1bt2LDhg245ZZb8M033yA3Nxd33nknJk2ahO+//x7XXnstjEYjsrKysHDhQgwcOBDJycmYOXMmtm3bhh9++AGBgYFobGxEWloann/+ecTFxeHkyZNYv349nn/+eezfvx8qlQoBAQHQarX47rvvkJCQgICAADDGYDQaMWTIENx66614++23cf/99/P9XrBq1Sps3LgRw4YNw/333w+FQoHbbrsNkZGR+OGHHzB69Gjs378fDocDqampqKqqwtixY/HEE0/Abrfjt99+w6uvvoqSkhJUVlZi8+bNkMvlGDp0KMaNG4e//vWvyMrKQkNDA1JSUpCfn4+PP/4YcrkclZWV2LVrF6677jocPXoUEyZMwNSpU2GxWBAbG4tly5YhNjYWFRUVGDFiBN555x0kJCTwfffdd99Feno6Tp8+DblcjjVr1kCtVuObb75BQUEBkpKScPHFF+Onn37ClClTsHbtWkyaNAmBgYEdPtd3W5Cxe/du/iX59ddfAQAajYZ/+err68EYg0wmg9vthlKphNPpRFxcHKRSKQoLC2G1WjFmzBjs2rULIpEIFRUVMJlMEIvFKC8vh0KhQGxsLDIzM6HRaOByudDY2IiEhATk5OSgoaEBUqkUOp0OGo0GNpsNDocDarUaDQ0NqKurg9vt5muqSCQSuN1uvP3227j77rthMBhgMBhQWloKhUKB6upqGI1GWCwW2Gw2SCQSAMCECROwdu1aBAYGwmKx8KtgqVQKh8MBkUgEiUQCp9MJxhjcbjfEYjEkEgkMBgOcTieAs3XitVotysvL+Uq2QUFBqK2tRWBgIMrKyiCRSJCUlISMjAyoVCo4HA44HA40NDSAMQaJRAKtVguXy4W6ujqkpKTgzJkz0Ol0KC8vh1gshlwuh1gsRm1tLdLT05GXlweLxQIAGDNmDM6cOQOVSgWJRIKSkhJeWl74/Gw2G5RKJRwOBxQKBW+/Wq2Gy+WCn58f/P39kZ+fjxEjRuC3336D3W4HcDbY7N+/P44dO4aYmBgUFBTg4osv5gfJoqIipKSkoF+/fli5ciXi4uJw6NAh/jmJRCKEhISgsrKyyfYUdmvhwFhdXQ3gbI+En58fdDodGGMQi8UoKytDYGAgysvLYbfbYTQaUVFRAa1Wi4aGBshkMn6ysNvtCAkJQVVVFVJSUnD06FFoNBrU19ejsbEREokERqMRUqkU1dXVSExMhNFoRH5
+Purr6zFs2DBs2bIF/fr1Q3Z2NiwWC/R6PUQiEYYOHYrffvsNLpcLMpmMf55SqRTx8fFobGzEmTNnMGnSJFgsFuzYsQN1dXXw8/ODSCSCUqlEYWEhRCIRNBoNGGO4//77sWPHDmzevBkOhwMSiQQNDQ1Qq9XQaDQoLy8HYwwREREoLCzEmDFj+Pfvt99+4yd3rVaLuro6iMViMMYglUpht9ubrD8kEonAGENAQAACAgJgMBiwf/9+aDQa1NbWQiqVQiKRYMyYMTh58iTEYjEUCgWOHz8OqVTK10BSqVS444478MMPP0AmkyEvLw+NjY2w2WyQSqVwOp2Qy+W48sorsWLFCr6/Cc+vUqnAGMO4cWcXj9q5cyesViucTicaGxv5tvrrX/8KlUqF1atXIycnB06nk++XwomksbGRHyuE9+10OpGamsprBNntdigUCtTX18Nut0Mul8PlckGn06GxsRGhoaHIzs6GWCxGv379cObMGf4ZaLVaOBwOREZG4vjx4xg9ejSOHz8O4GyvhNFohE6ng0KhwKlTp3gALZPJIBKJ4O/vj6qqKlitVsTFxSEzMxPR0dHIycmBXq9HQ0MDdDodqqur4XQ6IZVKIZVK+Xusq6tDSEgIqqurYTAY0L9/f/z666+w2WwQiURwu90IDQ1FSUkJRowYgSNHjsDpdEKn00GtViMvLw8hISEoLi6GQqEAYwwXX3wx1q5di379+uH06dOw2+38NaOjo3Hq1ClIJBKIxWK43W6EhYWhtLQUYWFhyM7OhkKhgMPhgMvlglwuh7+/P0pKSsAYg06n498/q9UKqVSKuro6yOVyaLVaVFVVwWAwoL6+HiaTCYWFhRg5ciQyMzNhsViQlJSEkpIS/hkeP34caWlp2LdvH6qqqiCTyRAYGIjS0lJ+HImNjUVtbS1qamogl8shEolQV1cHhUKB5ORkHDp0CADgdDohk8n4d0k4LigUCr7/M8YQExODwsJC/h6FY6bBYEBFRQXfx9RqNd8PFyxYgDfeeAMikQiNjY1wuVxQqVSw2Wz8cQaDAS6Xix8z7HY7tFotLBYLZDIZ7HY7YmNjUVlZCbvdjrq6OiQlJeH06dOQSCT8XGi32xEREQGLxQLGGKxWK98Hnn/+eezatQuffPJJx0/+rBvMmTOHhYWFsalTpzK5XM4iIyOZQqFg/v7+TKFQMKVSyRQKBROJREwmkzGTycREIhETiUSsX79+TKPRMJlMxiQSCVMqlUypVLLExERmMBhYYGAg69+/P1Or1UwsFvMfk8nExGIxk0gkbNSoUSwhIYFFREQwsVjMpFIpk0gkLC0tjRkMBpacnMzkcjlTq9VMoVCw0NBQFh4ezgIDA5lKpWJyuZxJpVL+WKlUyuRyOROJREwsFjMATCaT8dcODAxkEomEhYSEsKCgICaRSNjIkSOZXq9nISEhzGQyscsuu4ylpqYykUjEYmJimFarZWKxmPn7+zO5XM5kMhl//xKJhInFYmY2m5lSqWQAmFwuZ0FBQXy7iMViplQqmZ+fH99OYrGYDRgwgMlkMqbVaplCoWB6vZ7JZDIWFBTEt49UKmVarZZJpVI2YMAAZjQamUgkYlqtloWEhDCNRsMA8PuLxWK+XSZNmsRMJhPTaDRMo9GwyMhIFhgYyLRaLdNqtUwulzOlUslEIhEDwIxGIwPApFIpA8AUCgVTKBRMJpMxPz8/ptFo+PaVy+VMLBazUaNGsZCQEAaASSQSfh+VSsWkUikbMWIEk0ql/DnMZjMTi8XMaDTybSKVSllUVBQTi8Vs6NChTCwW820ibC+9Xs9UKhWTSCTMYDAwvV7PxGIx8/PzYzqdjs2ZM4eJRCK+DYTPXiKRMJlMxgwGA7vkkktYUFAQ34ejo6P5/giAabVaBoBvP+E9q9VqJpPJGAAWFBTERCIRU6vV/H0qlUq+34rFYv7/CoWCaTQavs/4+fkxtVrNPzNhuyQkJDCRSMRSUlKYSCRiV1xxBdPpdCw
5OZkBYGFhYSw0NJQ/ViQSMT8/P2YwGJhIJGLXXXcd0+l07Prrr2dyuZy99957TCaTscmTJzOFQsHMZjMLCAjg+7+wv6hUKqZSqZharWZarZZ/Z4T9QSKRMJFIxN+jTCbjP8J+LhKJWGBgIBOJRCw8PJx/Pmq1mj/eYDAwhULBALDBgwfzbSV854R9VyKRsHHjxvHPXbif8LkmJCQwuVzO9yHhOCT8LpFIWFBQEIuIiGDp6enMz8+PxcTEMKlUyo9T06ZNYwAYAKbX63l7ALBJkybxfU+j0fDvbUREBJNKpUyn0zG5XM4UCgVTq9X8NYXjj7ANhe+x8N3x3FbR0dH8OBUbG8v0ej1/bpPJxCQSCdPpdCwhIYFva4PBwP+WSCQsJSWFyeVyZjQa2fjx45lKpeL7ZUxMDN//FQoF+9vf/sYA8M9GeO9Ce2JjY5nRaGRisZhNmDCBBQYGMoPBwGQyGUtOTmZRUVH8OygSiVhQUBA/buh0OiaRSNhll13GNBoNi42N5fuNsC2E74jBYGARERFMJBKxgIAAptPpmFgsZpGRkXz7yeVy/rkI55uIiAh2+eWXM7FYzAwGA5swYQI/Dlx33XUsODiYH4/9/f35ZxYVFcVEIhELDg5mSUlJ/DucnJzMZDIZe+yxx5hYLGbTp09nCoWCbz/huUJDQ/lx32AwsPDwcKZWq1lMTAyLiopioaGh/L0Jrym8X6VSydLS0phSqeTnL8/zlVqt5u0R9o+bb76Z79fC8WjgwIH8nBcUFMRUKhVLTEzk52SlUslUKhXT6/VMqVSyp59+mt17771enf+7JSdDrVZjxowZWLVqFdLS0jBt2jSIRCKYzWZERETgxRdfhFarhcFgQGJiIgYMGACtVguFQoHCwkJotVrExsZCJpNBLpfDZrMhKysLNTU1qKmpQVZWFgwGA1QqFfz9/REbG4uEhATI5XJIJBIcPHgQWVlZ/AqmX79+UKvVvDszLy8PYrEY4eHhkEqlqKiogMvlwrRp0xAZGYmwsDDIZDKMHTsWU6dOhV6vR2JiIjQaDb+a7t+/P4KDg8EYQ2VlJdRqNWpra6FUKiGRSDB48GCEhIRAKpXyq9Ds7GxMmDCBR506nQ5vvfUWwsLCoFAoePt1Oh3kcjmqqqoQHh4Og8GAqKgoJCQkoH///nxFW6fTCaVSiWHDhiEwMBBqtZr3SIjFYmg0GgQEBMDlciExMRGxsbEYOXIkGGOw2+1gjCEvLw9WqxU6nQ4xMTGw2WyQyWS49dZb4efnB7VaDYVCwSPmo0eP8q7fxMREuN1u/l5CQ0Phdrv5VZdCoYBer0d8fDwCAwPh5+eH1NRU3m19+eWXY9SoUfD39+e9K0qlEmfOnEF9fT2MRiNOnDiBmJgY3uMlEolw9OhR6PV63ttSWVkJuVyOmpoayGQyDB8+HBKJBIWFhVCpVKioqICfnx+cTiemTp0KlUoFPz8/RERE4LLLLoNer0dsbCyCgoIgEol4b4bQIyX836OPPoqEhASMHTsW/v7+aGhowJYtW1BeXg6NRsN7zRQKBRISEjB8+HA+5HbfffehX79+MBqN0Gq1kEgk/CrJ7XZjyJAh/IpcoVAgJCQE/v7+CAsLw5gxYyASiWCz2XjvnVQqhVgshsPhQHBwMGQyGWQyGfR6PdRqNc6cOQOxWIycnBwAwK5du2A0GpGXlwedToeGhgb4+flh9uzZEIvFMJvNSE1Nhdvt5lesVqsVW7duhU6nwyOPPAKxWIxt27ZBLBajuroa1dXVuOGGG9DQ0MB7yMaNGweTyYSEhASMGjUKEyZMgEajgdlsxuDBgzF06FDMmTMHaWlpsNvt8PPzw8yZMzF27FhotVrMnz8fiYmJuOyyy2AwGBAbG4uYmBhMnDgR0dHREIvPHsLCw8NhNBqhVquRn5+PsLAwhISEwGaz4W9/+xsGDx7MqxJXV1dDLpfjvvvu472IGo0GSqUSBQUFcDgcuPXWW6FSqfD3v/8darU
a/v7+AM4uDqXX61FZWYlTp06hpqYG1dXVCA0NhdPphFarxYgRI5CcnIz4+HhMnjwZTqcTERER6NevHx544AEolUqIxWIMHjwYN954IxobG+F2u6HX6/mVemBgIIKCgqBUKqFWq/m/wcHBCAkJ4Vfv119/PXQ6HT9GyWQyxMXFQafTISAgAA0NDQgNDYVWq0VqaiomT54MqVQKq9WKxsZGxMTEQC6Xw8/PDxaLBQqFAlKpFJmZmXA4HLBYLNi2bRvvpVMqlbjvvvsgEokgFovhcrl4j8IVV1wBnU6Hp59+GnK5HAaDAQqFAhqNBnV1dQgMDMT27dsRFBSEsLAw3vOm0WhgMpl4z0VAQABkMhlGjx7Nv1uFhYWIj4/nx5/bbrsNEydOhE6nQ1xcHC655BIYDAbU1dVhxIgRCAwM5D1MZrMZ0dHRSEpKQmBgIORyOeLi4hATE4OBAweisbERmZmZUCgUCA4OxtatW3nPVUlJCdRqNe+FSE5OhlKp5D0cIpEI5eXlyMjI4G0LCQnhPTb+/v7YvHkz/P39UVNTg+TkZAQGBkIsFqOoqIjv8zabDaWlpZDJZCgvL4dOp4PL5cItt9yCefPm4eabb8aMGTMwcOBAyOVy6PV65OXlwe12o6CgAEqlErW1tXC5XFAqlVCpVLDb7XA6nfw4uWvXLkilUqhUKlRXV0Ov1+PkyZOYPHkyampqYDQa+bYRzoUJCQkIDQ3FDTfcAAB48sknodFovDr/d8twSXZ2No4cOYIrrrgCt99+O/bu3YuKigpcffXVaGxshFqtRnFxMdLS0mA2m/Gf//wHa9euhVarBQDepVxeXo7s7GzExsbC398fR44cwUcffQSz2QyZTAapVAq1Wo2rr74aNpuNn7wWLFiAq6++GkajEcuWLcNbb70F4OxY7E8//YSLL76Y7yTHjx/H5s2b8fDDDyM+Ph4AcPDgQaSnp8PtdvMhgbKyMvj7+2Pp0qV4++23UVVVhQceeADXXXcdvvzyS2zcuBE6nQ4//fQTPwHExMTgoosuwtdff43hw4cjOzsbUqkUWq0WEyZMQFBQEGQyGRQKBTIyMrB161bExMTguuuuw6uvvoqcnBy89dZbGDhwIJYtW4bPPvsMN998MyorK7F7924EBATgnnvuQUhICEwmEw4dOoTQ0FAsX74cBw4cwOnTp2GxWDBy5EhMmjQJw4cPR0REBCoqKvDuu+9Cp9Pho48+wv3334+NGzfyk7bJZMITTzyBf/3rXwgICMAVV1wBmUyG7777Dvv378fVV1+N1NRULFy4EPPnz8fnn38Oq9WKxx57DI2NjbBarXysMyEhAUePHsWUKVMwd+5cTJkyBUuXLkVoaChmz56Nl156Ca+99hquvvpqzJ8/H9u2bUNNTQ1mzJgBxhgcDgcqKiqQm5uLAQMGwGq14vvvv8fDDz+M7777DsHBwZg9ezbq6+shkUgwb9486PV67Nq1C8888wwee+wxGAwG7N27F3a7HX/729+wfv16uFwuLFmyhA+95ObmYujQoYiIiMBvv/2GDRs2wN/fH+Hh4ZDL5SgqKsLixYuxatUqPProoygsLERJSQkef/xxJCYmwmQy4eTJk4iJiUFNTQ1uvPFGXHnllfj444+xYcMGPPzww6ioqMDSpUtRX18PrVaL+Ph4bNu2DSaTCXV1dZgyZQo++OADnu/jcrkQEBCAd999F19++SXef/995OXlQSKRwGaz8WGW77//Hvv374dMJkNISAjWr1+P6OhomM1mfPTRRzhx4gTPAdi7dy+ys7N5MFNRUYGrrroKBoMBZWVlaGxsRHZ2NtLS0rB3716enzR16lT4+flh9erVmDx5MrZv34577rkHEyZMwMaNG1FVVYXs7Gz88MMPGDZsGFatWoVBgwZBpVLhmmuuwbRp0/Dbb79hxIgR+Ne//gWj0chzXq644gq88MILOHXqFBITE5GRkYGEhAS8/fbbSE5Oxm+//YYHH3wQAHDRRRf
hhRdewIgRI7B7924sWbIEU6ZMgd1uR1ZWFsRiMcaPH4+pU6fiiSeewKFDh6BSqRAeHo63334b33zzDVasWAG73Y7ly5cjLi4OQUFBqK+vR0NDAzQaDf7xj3/g9ddfR3V1NUpLS/Gvf/0L+/fvR0FBAebMmYMpU6Zg8+bN2L59OyQSCWJiYngSXUREBL755hsYDAY8+eSTuO6665CSkgK5XI7//Oc/ePbZZxETE4Pdu3ejvr4el112GQ4fPoyUlBQUFRVBLpejf//+sFqtWLduHRYvXoyKigokJCRAJpPhjTfewK5du/DQQw/h0KFDWLNmDYKDg/HMM8/g5ZdfRmZmJg4ePIjg4GDY7XbU1taCMYaEhAS43W588cUXuOeee3DDDTfgp59+wvTp03lbamtrsW3bNoSHh+Paa6/Fhg0bcOrUKezatQvvv/8+EhIS8Oyzz6K+vh79+/dHQEAANmzYgCeeeAKMMZw8eRK33norTCYTvvjiCzQ2NmL8+PGQy+UIDQ3F9ddfj+XLl6OsrAx6vR4ZGRn4z3/+g6VLl2LRokXYuHEjpkyZgnXr1iE1NRWDBw/G2LFj8c0336CoqAjV1dWwWq2IiYlBdXU1AgICMGnSJEyePBmZmZl45ZVXkJycjE8++QSJiYl4/fXXef5GQEAAxGIxgoKCcObMGXz11VeYPXs2tm/fjkmTJsFms0EsFkOn0+Htt9+G3W5HRUUFEhMT8Ze//AXHjh3D5Zdfjjlz5uBvf/sbHn30UbzyyisQiUS44447EBYWhocffhgBAQF477334HK5cPr0aQwfPhynT5/G5s2b8d133yEoKAhJSUm44oorcOTIERw7dgxxcXF8+Q5hSfVTp05hwYIFaGhoQHZ2NsxmM4qLi9GvXz+MHz8e06dPx88//8yHuS699FI8+OCDEIvFuOmmm1BXV4d33nkHMTExGDp0KGpqanD48GEYDAbExMRAKpWirKwMJ0+exLRp05CUlISJEydiyZIlmDt3LjZv3oxffvkFH3/8MUwmk1fn/27LyVi3bh2+//57OJ1OnDx5EhqNBtnZ2bDb7SgrK0NAQAAKCgoAnL3qttvtUCqVCA4O5gfhkJAQbNiwAW+88QaOHz+Or7/+GiEhIbwnQiwWo7GxESNGjMDBgwehVCpRVVUFt9vNx5zEYjEOHz6M2267DQcOHOC5EMnJyTh16hRqa2v5WPzs2bPx9ddfw+l04pJLLkFUVBTy8/MRHx+P/Px8HDt2DOnp6ZBKpTCbzcjKysLJkyd5Uturr76KcePGoV+/fjh48CDPtxg2bBgKCwuxYsUKDBgwgCeiyWQyXHTRRXj//fcBACdPnsSLL76It956C+Xl5Th9+jRCQ0OhVqtRWFgIu92O22+/HbW1tSgpKcHevXthNptx5MgRDBs2DNdeey3+/e9/Q6VSAQDy8/Nx2223YcuWLdBqtTh27BjCw8Nx5swZfhX04Ycf4sorr+Rj29HR0XjllVdQXl6ORYsWQS6XY9q0adiyZQssFgsWLFiA119/neeBCGOILpcLVVVVGDRoEG677TYsXLgQ4eHhvIcmMzMTs2fPxtq1azFq1CisWbMGgwYNglgs5ldswslWJpNBrVbj22+/hVwux/Hjx2Gz2RAUFMTHde12O5KSklBQUICqqirY7XaYzWYEBgaisLAQR48exQsvvIDXX38der0eVqsVer0e+fn5GDNmDHbu3Am5XA6VSoWwsDCcPHkSl19+OVasWMF7mkaOHIni4mJIpVJcffXVWLZsGYYMGYL8/Hx+Ja9QKCAWi1FSUgK5XI6xY8fi5MmTPO8oMDAQubm5mDVrFiIjI2E2m1FRUYHDhw/jmWeewX333YeCggI8/PDDyMzMRElJCWQyGXbt2oVbb70VV111FQ4ePAi3243a2lrs2rULoaGhuPjiizFkyBAUFBQgJycHH3zwAXbt2oXjx49DrVYDODcrq6amBkVFRfxkVVpailmzZuGdd97BggU
LYLPZsGXLFuTk5OCqq67CqlWr4HK5eC9RWFgYGhsbkZ6ejsbGRlgsFhw7dgxJSUmor6/HX/7yF7zzzjs8P0gY454wYQL279+P2NhY2Gw2ZGRkwO12w2q1IioqCg0NDaiqqkJ6ejqOHTuGxsZGREVFQa1Wo6qqCqGhoTwAZIzxnjp/f39YrVao1WqUlpbyMe8BAwZAKpXizJkzSEpKwq+//gqtVotx48ZhxIgRWL9+PTQaDU6cOIEzZ85AoVDAZDLx3AuFQoH58+fj73//O4KCglBTU4PQ0FCed3Ds2DEUFxcjJiYG27dvh0wmQ21tLQDA398fEREROHPmDCQSCRQKBe9tSU5OxkcffQTg7OwJp9MJk8kEq9WK/Px8zJ07F+PGjcN9992HhQsXwmKxYPfu3Th+/DgCAgJQX1+PO++8E2VlZTh+/Dj279+POXPm4Mcff8TEiROxa9cuuN1unqdms9nQ2NiI0aNHQ6lUIi8vD1KpFPn5+UhNTYW/vz927NiByspKGAwGXHrppfjtt99QVlYGnU7HZ8YYjUbceOONOHPmDH755RdYrVbExsbyiyWh11XII5swYQJ+/fVXmM1mKJVKbNu2jefwhIeHIzc3l+eLud1u/j2ePn06Nm7ciIaGBvzvf//DY489hsTERPznP//Bxx9/jMzMTBQVFWHLli2YMmUKjhw5ArvdDovFArfbjREjRmDnzp2QyWQ8V054fj8/P9TU1CAmJga33HIL1q5di4aGBuzfvx8ulwtBQUGoqqrivT0BAQFwOBz8vBAaGorw8HDMmjWL5xYqlUqsXLkSAQEBEIlECAwMxMGDBzFkyBCcOHECQ4cOxbp16xAQEIDs7GwAQGBgIBoaGmCxWBAXF4fIyEhYrVbs27cPQUFBKCoq4j3BBoMBmZmZfPLBgAEDsHfvXgQGBqK4uBhlZWWIiYlBbm4uDAYDGhoaUFtbi8bGRv65BAYGoqqqiu/fI0eO5DlPbrcb/fv3x6FDh9C/f38cOHAAwcHBKCkp4edi4bg4ZswYTJ48Gddff33HT/5dkXPRnr1797L4+Hg2a9YsZjKZ2MMPP8zH2oYMGcJ0Oh0LDw/n47vTp0/n43lBQUFMoVDw8TfhB7+P+wnjtXFxcUwmkzG5XM7Hj4S8hC+++ILp9Xrm5+fHx9GFccrg4GA+/i+MaQtjesL4ptAW4fWE8VVhfFzIxxDyBYS8EiG3QKFQsPT0dBYQEMCGDBnCxo4dyxYuXMgiIiKYTCZjgwYNYjqdjhkMBhYXF8eio6OZwWBgRqOR50oI7UpKSmLJycksISGBRUZGstDQUD6OKoztC2NycrmcmUwmPnYr5BckJCTwXAFhbFq4j/A+DQYDCwsLYwEBAU1yZ4Txb2GMUPh8hJyBwMBAlpyczIxGI0tISGA6nY4plUo+Ti+MFRoMBhYUFMRSUlJYZGQk0+l0PK9GKpXytgj/er62SqXiuStqtZrnWAhj0TNnzuT7gfB5DR48mA0ZMoRZrVZmNBpZaGgoz6EIDAxkGo2GzZw5k40fP56NGTOmSQ7BlClTmFgsZsOGDWMikYjndkydOpXpdDrm7+/PPv/8c2Y0GplcLmdJSUk8Z0DYB3U6HZNKpWzMmDFMp9OxkSNH8nHQ4cOHM71ez7RaLTMajSw8PJwFBwfz3IgZM2Ywk8nEx/ElEgkfW1coFMxoNDI/Pz8+xiuM4wv3mzp1Kn9+IZ/FYDDwfV4Yexe2fVhYGFOr1QwACwkJYXK5nEVHRzOtVsuCg4N5LlBwcDAfu/bME7jzzjuZSqViwcHBLDw8nPn5+TG5XM5uvPFGplar+ecnjEeLxWKWmJjIv0uxsbEsKiqKGY1GZjabWWxsLJszZw7PaRG2qbDvxsfHs9jYWHbttdfy9x4QEMBUKhWLiopicrmcqVQqJpPJmF6vZzNnzmTDhw9n/fr14+PdZrOZjRkzhk2fPp3JZDKWmJjIc16kUimLj49ner2eyeVy5u/vz+bNm8c
uuugiZjKZ+Pi18BknJSXx44HnGLxarWZBQUE8T0bIRfE8pnju+2azmclkMjZ79myWkpLCxGIxi4qKYv7+/vzzjY6O5sdJIWfAbDazsLAwnisgfL5CfoxEImH+/v48r0A4/gjPL7R9/vz5TCQSsXnz5jGZTMbi4uKYVqtlYWFhTC6XM51Ox0QiEXv55ZeZRqNhYWFhLD4+nt100038OKPT6fjxCQBLSkpqclycNm0az0WLiYlhEomE+fn58Zwe4ZjtmRsj7Cc333wzUygUTKVSsYiICJ4TJhwHJ0+ezAwGA0tJSWEpKSlNcnmuvPJKptFo+PE1JCSEH2eEfBYhl0H4zgg5K9dddx276667WFpaGgsLC+PtEbah8F6FnDLhuYRjrLDfCvuGQqHgeW0qlYrFxsY2eYzwHQ0MDOTnFbPZzPedkSNH8n0eAM89DAoKYmq1mufXjB07lufexMfHM6lUyo8dwjYzmUxMrVazgIAAFh0dzSZMmMC0Wi2LiYlhr7zyCmOMsfnz53t1/u+WnIxrrrkGZrMZ//vf/1BVVYW9e/diypQpEIlEOHDgAOrr63lkarfb8csvv/CeCbvdjoCAALjdbhiNRiQlJeGWW27BrbfeCplMhuDgYAwePJhnMEskEjQ2NkIulyM2NhYTJ07E+PHj4XQ6UV1dzbuFQ0NDodFocNVVV/FcA6fTyccahSsSkUiE6Oho9OvXD9HR0VAoFLjqqqsQHByMsLAwBAQECMEaXC4X/10Y65XJZHC5XCgsLERlZSVOnDiBbdu2YdmyZaisrAQAZGVlwel08tkUwvi3xWKBXC6H0+nk8+ntdju/8qqvr+f5I2KxGMHBwYiMjIRSqURERARkMhlqamogkUhgt9shFovhdDp5drHQiyMSiSCVSvmMBGEMVZg1U1FRwfMqTCYThgwZArVaDaPRiHHjxvGpgsL4uJDJXVBQwLP6hfbcfPPNcDgcsFqtqKio4OOeISEhCAsLw6BBgyCRSBAVFQU/Pz+kp6dDqVTyLPPhw4cjJiYGEokEl1xyCR/LlEqluOqqq6DX6yGTyRAfH4/6+nqe43DixAnk5ORgxowZqKmp4d3+4eHhcLvdiIyMxPr163H48GEcOnQIDoeDP16YRipkyMfExIAxxmf/2O12/Otf/0JtbS3i4+NRWVmJcePGQaFQICgoCEFBQZgxYwbcbjcOHDjA84QqKipQVlaG3bt38/2ntrYW5eXlvLdFLpdj7dq1qKur47M5FAoFrFYrRCIRHy8Wst/dbjfvTRKG4kJDQ2G1WnmuBGOMz0oRpslefPHFSE5ORmpqKkaOHInhw4fDZDJBr9eDMYaUlBQoFAo0NDTwduh0OiQmJkKn0/F9UyaT4euvv+aZ8zabjY8Nr1y5EuHh4ejXrx/PkhfyUaqqqhAYGAidToeSkhJUV1fj6quvhsPhQHZ2NjZu3AiRSISUlBRcfPHFiI2NhUQigUwmw5kzZ1BaWooNGzZALBbzKYnCzBSlUonw8HA4nU6oVCoUFBRg//79vDdE+J6bTCY+7JSZmQnGGJ+xJJfL4XA4YLfbUVlZiS+++AInT57ks2mEWSlSqRQZGRn8sxkzZgxuv/12mM1mBAQEYOjQoQDO9nRIpVKMGDECYrGY59z069cPfn5+CAwMhMvlgsvlQlZWFk6fPg3GGEaPHo3IyEgEBwdDJBLxfUT4/o8cORLl5eUoLS1FeXk5iouL+eyo/v37Y+nSpfy4IMxsEK6Sg4ODMXbsWBgMBgQGBvKaL1u2bIHL5UJJSQnq6+ub5EMJPcNisRhWqxWnT5/Gb7/9xnPRQkJCMHDgQKSkpMBsNqO6uhrjx49HWFgY3G43oqOjAYAfk/z8/BAXF4dBgwYhKCiIzzjs378/UlJSEBISwoeVt27d2qQnZM6cORg6dCjsdjs/7jidTuTn5yMjIwOMMdTW1qKhoQFr1qwBAEyZMgXp6emYOXMmJkyYgAEDBiA+Ph4BAQFISkqCUqn
kszlyc3PhdruxZ88erF69GvX19fw4KPS4CUMwwNmZgFqtFhEREbjllluazCoMDQ2FUqnks4fq6+uhVCqRkJAAtVoNmUyGQYMGISUlBf3794derwdwNp9I6KEXnq++vh5jxozB9OnTIZfLce2110IikfCeHblcDrfbjd27d6O2thY5OTnIycmBSCRCv379oNPpoNfrYTKZsGrVKojFYjQ0NEChUGDv3r2w2WxITExE//79sWbNGt4z3lHdMlxit9sxefJk5ObmoqCgADqdDiKRCHK5HEqlEo8//jjWrFmD1atXIzU1FRMmTEBBQQGGDRuGq6++Gm+++SZ27dqFuLg4lJeXY8aMGZg2bRpmzpwJqVSKgwcPon///ti+fTtMJhMCAgL4Sfurr77CSy+9hM2bN/MhgejoaISHh2PVqlUICAhAVFQUysrKoFQqodfrcfr0aXz//ff45ZdfMHv2bNxyyy144YUXEBgYiLvvvhsOhwMnTpyAUqlEfn4+7rvvPnz77beIi4vD6tWrERwczLtsH330UTz55JM8wcnPzw8mkwmZmZk80em+++5DaGgoQkND8c033/BkSuFDf+655/D999/DaDRi2LBhOHnyJObOnYsnn3ySTwG2WCz8ID9o0CAcP34cEydOhMPhwPDhw3HixAlERUXh1VdfRUxMDOrq6ngdiPLycgQHB2Po0KH45z//yV9j6tSpKC8vx7p16zB48GDs2LED8fHxmDZtGm6++WbMnj0beXl5MJlMPEh78sknsWbNGgQEBOCLL77gibBC17mwjaOjo7F7925cc8012Lx5MxITE7Fr1y4kJSXh6NGjMBqNsFqtSE1NxdGjR/lBPCAgAOXl5fzLCoBPCxROaiEhIUhLS8OJEydQVFSEuLg4ZGRk8JOkTqfjJw/gbJf1uHHjIJVKMXjwYHz//ffIyclBREQET9QtLS3FqFGjeAJldnY2IiIiUFBQALlczoM24GyQKSTvDh48mA/DWa1WDBkyBNu3b8eAAQNw6tQpniw5b948TJ48mQ/rRUVFITs7GzqdDgMHDsTWrVsxbNgw1NfXIyoqCps3b0ZFRQVEIhEiIyNRWlrKp2KPGjUKW7Zs4VPqhKEvp9OJo0ePYsSIEbwOx6ZNm1BbWwu9Xs+ndw4aNAg7d+7khZCUSiWfRltRUYGamhpERETAZDIhPz8fgwYNQl1dHR82SkpKwsmTJzFs2DDMnTsXW7duxQcffIBZs2ahqqoKq1evRkREBJKSkhAeHo7ly5fj8ssvx1VXXYV3330XGzZsQEBAAFJSUnD69GnU1tYiKioKIpEIoaGh8PPz49OEb7zxRsyePRsymQynT5+GUqlEXV0dz+eKi4vD8ePH+ect1MQRpggLSZczZszAypUr+bDQxx9/jKCgIEybNo1PKRSmn6empkKhUEAmk2HOnDl48MEHMWXKFEgkEmzbtg1SqRQXXXQRTp8+zadwajQaTJ48GSUlJVizZg1CQkJw/fXX8yD+wIEDaGxshFgsxkUXXYTvvvsOjz32GP75z38iMTERtbW1OH36NK688kqkpqYiICAA+/fvx4kTJ3jis0gk4vVZjh8/zj9P4aLn2muvxdtvv4277roLZWVlOHToEHbu3ImpU6fimmuuwWuvvcanOEdGRiIgIIBfbAnDYytXroTRaORDMY2NjTzB9PLLL8dPP/3EA2axWIyIiAhUV1fzxOorrrgC+/fvR25uLh588EHceeedfIiiuLgYcrkcQ4YMQU5ODnJzc5GSkoLw8HDcdtttePPNN+FwOJCfnw+LxcKTX4WkWGE7WywWTJw4Efv370dJSQlEIhEfAhemquv1egQHByM0NBTr16/nwyrCVGSlUomGhgZce+21WLduHdLS0iAWi5GXl4eysjI+JdRut2PIkCEYM2YM4uLiYDAYcPDgQRw+fBhZWVmYOXMmNm7cyAOoAwcOwGazITIykk87dTgcSE9Pxy+//IKXX34Zr7zyCtLS0nD
48GE4nU7k5uZi4MCBKCoqQmFhIRQKBQIDA1FQUMDfm0gk4sGVcAwRCMG4VqtFVFQUKioqMHz4cISGhvKyBk899RQP2BoaGvi+3r9/fxw5coRfTAuBYkeKDgq6Jch4/vnnMXfuXDDGsHnzZjzzzDNoaGiAzWbjc5ijoqJw4MABxMfH83nxbrcbl19+Oerr63kWc3x8PGJjY/HBBx+guroa8fHxfIz0lVde4VdtwhezqKgIaWlp2LZtG1wuF5xOJ0+mMRqNPOHIYDBgyZIlkMvlePDBB1FTUwOr1Yr+/fvj6NGjqKurg0ajgcFgwE033YSFCxdi0aJFePbZZ7FkyRLs27cP1157LV544QXMmzcP//3vf3HXXXchJSUFGzduxPLly3lRk7fffht33HEHT+az2+04fvw4Jk2ahP/+97+45557cOutt2Lu3LlYtWoVFi1axGeJlJWV8foG6enpiImJwfHjx1FfX4/p06fDZDJh3bp1fL61xWLhNSL8/PzgcDh4drEwO2Dbtm08C18qlSImJgZqtRoSiQRZWVn8C/rdd9/hrrvu4oVhAPCiNcKV2ZAhQ7B27VpecKayshKlpaWIi4uD3W7Hnj17kJqayueq19TUwN/fHykpKfDz88Pf/vY3Plb773//GyKRCMuWLcOePXug1Wqxfft2fPrpp8jLy+MzSIQZPkJS1dChQyGTyVBdXY3a2lqewBgTE8PHUQHgL3/5C06cOIErr7wSzz//PMLCwiASiRAbG4sTJ07g1ltvRXFxMb9SMRqNeOqppzBz5kxs3boVoaGh2LFjB6KiojB79mxoNBo4nU689dZbKCgoQGNjI0wmE0pKSjBz5kxs2LABwcHBkEqlyM3NRXp6OiorKzFy5Ei8++67+Pvf/44333yTX6HMnDkTR44cgcFgQGFhIWQyGe99GzlyJH9vl1xyCTZs2AAASExMxEMPPQSn09nkwHn48GHccccdeOONN6BWq+FwOHgdgMjISNTX1/NZF8KV7qhRo1BbW4vDhw/z4Ly8vByNjY3w9/dH//79ebEtIcFVmH1SX1+PBQsW4LPPPoPD4UBhYSGfHXP55ZdDq9XipZdegsvlgl6vx7p163iBu6ioKGRlZfHZNUIewtdff41vv/0WH3zwAerq6pCamoqamhrk5eVBrVbDz88P1dXVMJlMUCgUqKys5AWw/v3vf+Ozzz4DYwyPPvoo9u3bh5MnT+KRRx7Brl278OijjyI/P58/b2BgIA4dOgSpVIrAwEAMGDAA+/btg9VqRUlJCfz8/Hh+TkBAAM8DEvKHysrKIJfL+UnN4XDAbDbDZrOhpqYGGo2Gf4dcLhemTp2KTZs2YdGiRfjggw9w7NgxzJ07F/7+/tiwYQOsVisKCgpwzz33YMWKFcjNzUVISAj+9a9/4f7778eAAQNw5MgRnDx5Eqmpqbj//vuxadMmBAUFYe3atWhsbITRaER2djasViuv45GcnIykpCQMHjwYr776KkaOHMlrvhw+fBhSqRQpKSkwGo1Yu3YthgwZgpiYGNx///144403sGHDBlRVVaG2tpYHHePHj8esWbOQlZUFvV6P7777DllZWaiqquInRKFHWSqVIigoCKNGjcKRI0d4PaSKigrI5XLMnz8fK1asQHBwMMxmMxISEvDJJ5/A6XTi1VdfRW1tLcaPH48777wT27dvR3x8PIKCgrBr1y5ea+n222/HJ598goaGBn7eEWoSCcd1f39/DBw4EF9//TWMRiOfVSckkIeHh2P16tWoq6vD559/DrvdjjNnzqCoqAgmkwlTp07FnXfeiQEDBqC8vBwikYjXU9LpdBCLxYiLi0NYWBhuvPFGfPXVV5g0aRKWLVuGqVOn4r333uPFv44dO8YvpoCzvWNxcXG8UNnBgwcxduxY3HvvvfjHP/4BrVYLq9WKRYsW4ZtvvoHRaMTbb7+N22+/HRs3buQ9MJWVlejXrx8KCgpgNBqRkpKC9PR0ZGRkYM+ePcj
NzYVcLodOp+MzwoRzRUNDAz777DNERkZ6ff7vliBDONhcdtllWLduHUaPHs2Trvz8/HiSy5kzZ2AymVBeXg6ZTAbGGKKjo5GRkYGQkBCoVCqcOXMGUqkULpeLdxULJWGF7kXhii4iIgLFxcWYOXMmL5rjcDgQEBDAo2DPoY2GhgaIRCKEh4ejpqYGer0eubm50Gg0sFqt+Pjjj/HEE0/wbrstW7ZgzJgxuP/++/Hss89iz549KCwsREREBPz8/CAWi6HX63HkyBFs3LgRiYmJSEtLA2MMZ86cgV6vbzJ0I0SjOTk5SElJwZEjR/gBKjAwELW1tTx5aPr06Th27Bg0Gg0UCgXS09NRXFyMNWvW8IOtUqmERqPBa6+9hr/+9a9wuVx8+puQXDR16lQEBQXhl19+AXC2FHJpaSmAszt3eHg4tm3bhmHDhvEpTwaDAUajEWfOnEFiYiIyMzN5oTHhC1lTUwOpVMoLnjmdTgQFBfFkQ4vFAoPBgKKiIkyaNIlX55w6dSrWrl0LtVoNm82GMWPG8ATPt956C2PGjEFUVBTWr1+PMWPGoKqqCrm5ufzALRQoq6+v5916whW9RqPhV6/CUJBwsGO/J/vJ5XJER0fjwIEDCAkJQWlpKS/OFR4ezqvS7t69m3dbClONhe7juLg4HD16FHK5nHfBy2QylJWV4fbbb8e3336L6dOnQyQSYevWrVAoFNi3bx9SU1NRXFyMGTNm4ODBg4iOjobRaMTixYtx7bXXwu12Y/Lkyfjss894ULlz504+uyMlJQVRUVHIycnB8uXLIZfLUV1dzQtE7dmzB1deeSXcbjcGDBiA3bt3Y+rUqVAqlTh06BBv+8UXX4zFixdj1KhR+PHHH1FfX49HH30UK1euhFQq5VUqhZkKMTExsFgsEIlEKCsrw5133onvvvsOVqsVMpkMQ4cOxcqVKzFo0CCcOnUKJpOJB0EhISH8Oy0Md4rFYoSGhvITzpdffonPPvsMq1atwvLly3HFFVfwk6bFYsEll1yC7OxsHDt2DIMHD0ZkZCQfXhH2S6GXQBguEhIQPYtCAeAnAJFIxAsEAkBUVBQkEgkyMzMRFxfHk0mrq6tRVlYGsVgMPz8/NDQ08MJhAPiJ1G63Y8qUKaitreXFtoSAv6SkBP/3f/+HV155BTExMcjJycG3336L22+/HcDZUuyhoaGoqKhASEgICgoKMGPGDHzzzTeQyWSwWq18/xV6MwwGA09WFE6kDQ0NSExMxPHjxzFw4EBkZWXB7XYjODgY5eXlfD+uqKiA2+2GVCrls910Oh1P3t20aRNkMhnq6uogEokwbNgwZGRkIDAwEJWVlfz7xX4vdqdUKpGWloY9e/bAaDSiuroaWq2W9yYJQYper0dtbS2MRiPmzJmDL774gidgChc9Qq9aRkYG76EFwIsPymQypKWl4ciRI5gxYwYeffRRPP300/wkKgx/TJ48GadOnYJKpUJOTg7Cw8ORmZmJkJAQJCcn4/Dhw/z4IMx+NBqNfAhfOG8I7VMoFJg8eTJWrFiBDz74APPmzUNiYiIKCwt5UUKpVIqqqiqkpaUhJiYGer0eO3bsgNVqRVFREbRaLS699FLs27ePDwEJvTw5OTk84BV6w1544QWsXr0aUVFRsNvtqKmpQX19PYKCgtDQ0NDkAkyYUu5Z5EwIioUyAzabDTqdjn8GwsU9AF6ioDNl+7slyOjXrx8GDhyI06dP49ChQxg+fDi/uhTm6AcHB/O5+3K5HKNHj8aKFSv4hymTyaDT6XitiKKiIp4xGxgYiODgYF51UhgKESrayeVyKBQKfkL0HK8SKj8yxtDY2AiZTIYRI0YgPDwcP/74I39+YfaHMINCLpfzcrUXXXQR1qxZg6KiIlgsFj5OJ1Sl0+l0ePLJJzF9+nRcdNFFvCZBXl4eAPAqfsJUV4VCAQCora3l3WYTJ07E5s2bERISwqdSnTx5EjKZDC+++CLeeecdHDx4EFqtllcZVKl
UKC8vxzXXXIOtW7eiqqqKvw9hWKKxsZFPY2WMwWaz8TwFof6BMPQgTDcOCwvjtUoeeOABLFmyBKmpqTh9+jSf8y5UnhRmANhsNj52LJFI+OcqXPUJ4+bC8+3atQvx8fG46qqroFAo8OOPP0Kn0/HsbLvdjsGDB6OgoAB6vR6BgYGIjY3Fjz/+iMbGRvTr1w/Hjx+Hw+HgeSaBgYFgjEGlUkGv18NgMODMmTOoqqrC+PHjsXPnTlx77bVYunQpnxZXXFyMjz76CLfeeituueUWfPnll5gzZw7ef/99Xo63oaEB4eHhvKs9NzeXB0mjR4/GkSNHEBsbi+3bt2P79u2YN28e7zkQhnySkpJQWFjIx1rT0tJQXFwMl8vF85dqa2tx8OBBlJSUYMKECTyvQ8g/GDFiBKRSKY4dO4ZDhw7x3CRh+qkQRAlX0MLJViwWQyQSYfz48bzSqjCmLfQghoeH856D48ePIyQkBC6XCzU1NQgLC8Phw4f5TAaj0Yi6ujoMHDgQe/bs4d8vIYhITk7GyZMneeVOsViMsLAwXhJb+H44nU6eJV9XVweXywWTyYTKykre2+NyuZCamopjx47huuuuw8qVK3lOUkZGBu/+TU9Px+HDh3mVxOuvvx7Dhw/HU089xSv96vV6REVFwWq1wuFwoKSkBI2NjYiNjUVWVhaSk5ORkZEBqVQKt9uNuLg4DB06FCtWrOBX6bGxsTwHSNi/dTodCgsLeZ6CkOsj5EEJJ12hVxMAPwmEhoby44QQwBcWFmLUqFHYunUr4uPjodPpcOzYMSQkJCA/P59PZ0xLS8OAAQPw/fffQ6PR8BOzMBNGmEEzePBg7N27FwD490UkEuG6667D2rVr4efnh2PHjvFqtmVlZQAAk8mE2tpayOVyHtQLlXGFIaZLL70UkZGRsNls2L59e5PPraioCAkJCcjMzERlZSVuvPFGfPrpp5g/fz4UCgXeeustfszVarV8+wk5XUI+lEwmQ2VlJe6//36sXLkSY8aMwXvvvcen+ZaUlPDhA6PRCLPZzC+QTp06heuvvx5Lly7lVTSFWWrFxcXQ6/WYP38+Xn31VWg0GkRFRcFisaCmpoZ/P8LDw5GdnQ2lUgmLxYJBgwbhxIkTGD58OHbs2IGBAwciNzcXdrsd5eXlPMA4evQo3x6S36tFW61WaDQaxMXF8SFbjUaD/fv38/L0DQ0N/PgsBLdCMFhaWgq9Xo+CggJoNBqEhobi9OnTvIdayPVLTk7GiRMncMstt/BcpqysLNx44404ceIEKioqcOzYMQBAcHAwLr74YjgcDjz99NNITEz06vzfLUHGnXfeiaVLl6K4uBhmsxkKhYJfOQjRkxCNR0RE8G4t4FzyWnFxMZ+CKnwgBoMBJpOJX03ExcWhpKQEiYmJsFgsqKioQGVlJUaMGIEpU6bgf//7H/Lz83k3mmdeiNBVLFxhBQYG8gQyhUKBrKwsSKVSBAcH853+5MmTPKlNiKKFrkDPLysAXiistLSU1zsoKSmBUqnkZWXlcjmP6IVEPSEJU5geNnjwYPz666+w2+18apbwGgD4lcJFF12Ew4cP86I7Wq2WT/HTaDR8ipJwtSa0X7jSEz4fobT1iBEjsHXrVixZsgRFRUU4ePAgNm3axL/kqampOHDgAPz9/WGz2VBfX88TjpKTk9HQ0IDTp0/z5FOj0Yh+/fph586dkEgkCA4O5kNUwpdduPoU2idcdarVat7TVFdXh9jYWF4yOTMzE3q9ng/xaLVaFBUV8ecUPi+hMJlKpUJ9fT0KCwuh0+kQHR2NEydO8OI7QkAkRPcWiwWhoaH8AHnmzBmo1WpeltmzXoXb7cZdd92Fb775BkuXLsX06dMRHR2NsrIyBAcHIzs7m7++UGROOOgIpaCFdgiEfUqpVMJqtfIhj+ZX58DZnj3hQCwkRApdq0LbFi9ejIiICADgQzJCUqZQbEmYAi6UVRcS1yZNmoSff/6Zl6cXksyE/T86Ohr
p6elYvXo1D/CFKbExMTH84N/Y2Ijhw4fDaDRizZo1MJlMkMvlGDlyJH8/JSUlWL16NYYPH46dO3fCbDajrKysyUlZ6MmKiopC//798eCDD+K///0v1q9fj4CAAKjVaoSEhGDlypV8G6elpWH79u18nzcYDEhJScGuXbtgNpuRmZmJwMBABAYGIjs7mx+vhMJ3QiAufB5CUSkhWJw1axb27NkDi8WCYcOGYcaMGTwHq6Kigh/3hCJclZWVvJeJMQaTycR7RwDwE4xnnoHQYzho0CDs2bMHarWaD9UJ+69QBqChoQH19fW850gYOtBoNHyKsrAvpaen48SJE4iPj8exY8f49MeLL74Yv/76Ky9wJyTPjh8/Hlu3bsW1116LQYMG4f777+f7jjBEJxwrhaRToedFyAEQglEh50Gj0WD06NHYsmULP6Yyxvj6OhKJBImJibjllluwePFivuyB2WxGaWkp7732/A4JF51yuRwVFRUYNmwYz7UQi8W8t8DlciEsLIyfqIUAp7a2FqGhoaivr4fVaoVSqeTnKOEiMSQkBGazGfX19cjLy0NVVRXi4+Nx6tQpuN1u3sMlBJzC90apVPIigkJ58ICAAFRUVPBtJxT4E0qVazQayOVyREZGwuFwID4+Hl988QX/LttsNqSnpyMzM5MH1AaDAQMGDMDgwYN5PtG9996LvLw8OJ1OREZGYtiwYThx4gQyMjJ4yYC4uDiMHTsWL7zwQofP/90SZGzevBlvvvkm3G43jh49ivDwcPTv358PN3zwwQdIT09HdnY2vzqcPXs2Pv30U9xxxx0QiUR4/fXX+Q51zTXXYP369di0aRP8/Pwgl8thNpuxbt06+Pv7Q6FQoKqqiq+pERYWhoqKCvTv3x9yuRwDBw7E9u3bUVFRgdraWoSHh6O2thYGgwGnTp1CfX09Lr30UowdOxYZGRn44osvEBwczL/EQh5CcXEx75IcPnw4rrrqKixevBiLFi3CI488gpdffhkJCQl44YUXsGrVKoSFhaGkpARDhgzBzp07cckll6CsrAwWiwXp6ek4evQoFixYgI8++ghxcXGoqqpCZGQkamtrcebMGTgcDowcORLr1q3DvHnzcOjQIWRkZPCTs5CElZ+fD5VKBavVyk9iKpUKwcHBsNlsfI0DITFMWJ9AONFKpVJcd911OHnyJDZs2MCzqIWkSWHePXBuxoVUKsWgQYNQUlICf39/hIaGYt26ddDpdLj++uvx888/89UPDQYDLBYLAgIC8Oijj+LNN9+ESqVCfHw8r5YpJK8KFTcHDx7MZ0AsWrQIeXl5sNlsOHbsGLRaLSoqKvhQi0wmg9FoxIABA5CTk4MRI0Zg48aNmDx5MiZOnIgHH3yQz9YR6lcIJ2NhaEEIcoScAfnvixE5nU6edCoEq0I3pnD1XVJSwue2C931wlBRcHAwiouLkZKSgpycHISFhfH1JqKiongNkAceeACvvfYa7HY7UlNTed2Yw4cPw2w2856FjIwM3HvvvXjooYf4egrz589HRkYGli9fzlfKVSgUfLaLv78/du/ejVmzZvF9W6imKxRxs1qteP311/HCCy9ApVIhMTERN9xwA2w2Gx555BF+gvL394dKpYLZbEZ+fj5OnDjBezS0Wi2GDRuGQ4cOITIyks90YowhOzsbgYGBfMjA398fkZGRkMvlWLVqFZxOJ/z8/PjnVF9fz0+E//jHPxAWFoZnn30Wx48f58OqIpGIFxGrqqqCRqNBeno69u3bx2ePaLVaHDx4kK+HERISgpKSEoSGhvJ9Twj0Kisr+QlB9HtFXaEyMHB2BVVhzF7YfxITEyESiXDo0CE+PFFcXMx7ZwYOHIjNmzdj1KhR0Gq1vNrub7/9BrFYjLS0NL7fz5w5E59//jmvkHnixAkwxrBkyRL83//9H6qrq6FWq1FRUQGFQsHXqhDy2YTvpkwmg9PpxL333st7AVasWMGHO2prazFs2DDk5+dDp9MhKCgIJ06
cQGlpKcRiMQwGAzQaDZ9xJZVKeRXR/fv3Q6/X8xoxwjEXAJ/tplarcfToUb49hYBs8uTJOH78OAYNGoRt27bx9TqsVitfTTY8PBwymQxZWVm46KKLcO+99+K5557jdZCE9ycE6o888gi+/PJLjBs3jtcKkUgkyMjIQG1tLZxOJ7KysqDRaPgFT3JyMu+dEHolhVlRwjFAuPhJT09HRUUFBgwYgKNHj0KpVGL06NH88xOSVVNSUngv64kTJ6DRaHDxxRfjww8/xJQpU1BZWQmbzYZDhw5h69atuOaaa3jSrTDzw5Mwk004XzU2NvKhb6FuzIQJE5CWlobExES88cYbKC4uRmRkJHQ6HZ+h99NPP/HcPGE4Uwgyq6ureQVYlUqFuro6lJWVISIiAgsXLsTo0aPx+eef46abbvIqN6Nbgoxvv/0Wzz77LJRKJY4cOYLp06cjOzsbJ0+e5EWFhJOdRCLhXW1CN7fD4YBWq8Xy5cvx7bffYt26daipqeFfLmFMXS6Xo7S0FCKRCEajkWeER0VFIS8vDwaDAaGhoTh27BjCwsJ4gqHFYsHll1+O1atX864r4Wq0traWzxrw7NkQulIB8INbdXU1j4KF4jp6vZ4XS8nMzORVSIXqoULgIST6CON3ERERqK2tRXZ2NvR6PS93fscdd2D58uX86v/EiRNITk7m44TCLJLCwkL4+fkhNjaWR8tZWVmYMmUKVq1aheDgYH7lqlQqodPpkJOTA51Ox3uM/Pz8UFJSwnsZvv76a0yfPh0RERH45JNPMGvWLIwaNQrl5eXIzMxEWFgYjh49ijFjxmD//v2wWq0Qi8Ww2WwIDg7GQw89hA8//BCnTp3i45jClZRQ/lYul/MZANHR0SgsLOS5IWPHjuVdtT/99BO/ihV6OzwXDFIqlRg0aBA2bdqE66+/Hj/++CPvmaipqYHdbofb7UZaWhqqqqr4lFaLxcK7bZ988km8//77vFvy7rvvxr333ou0tDSUl5ejoKCA5yQIGfEPPvggbrrpJvzlL39Bfn4+SkpKUFhYiLi4OFRXV/NkRAB82M7pdPIpo1u3bsX06dP5yaW2thYOhwNjx47F7t27kZycDLfbzXNFhEq1Qq+Ry+XiC3IJC0YJ47AXXXQRTp48yad5C3k1ntOYhZ4Y4ep05syZ+PHHH1FbWws/Pz9UVlZCq9UiISEBTz31FJYsWQIAfKhOqM4r9AgKB39hnxcONyaTiSczu91uXrSssLAQCxcuxBdffMF7BIQeKKE3ThiGDAkJQWFhIR/OE4YxFAoFLr30Uuzdu5f3QApXxkLvjpBkZ7fbUVpaisjISIwZM4YnhxoMBp53IpR8Dw8PR1ZWFvr374+MjAxe0v+BBx7AvHnzeJXX/v37Y8+ePVCpVLxNABASEoKUlBT8+OOPvNdQCEzXrFmDSy65BP/5z3+wYMEChIeHo7i4GOHh4byUfnFxMT7++GOsXLkSMpkMhYWFGDFiBK+wKZVK+UyLb775BitXrsR//vMfvvBbUlISX2zRYrHAz88P06dP56scC9VRheGXgoIC1NTUYOLEiaitreW9JEKZ8+XLl2PUqFH8inzAgAE4fPgw7xUSelqFIEcoVy/0MLpcLsTExODgwYNISkpCTU0NxGIxAgICkJmZifLycrzyyit48skn4e/vj3HjxiE/Px/79+/nPdAVFRWoqqqCWCzmxRKFXjelUsnzy4TjukKhQEREBBwOB6ZMmYIDBw4gOTkZX3zxBWw2G2JjY5vkWQ0YMAC7du3iOXLCrJn4+HhkZGQgOjoaBQUFPPgqKyvDPffcg3Xr1uHNN9/EM888g+DgYBw+fBinT5/mheaCg4NRVFQEp9OJSZMmoaSkBLm5uVAoFDw/JiUlBQcOHGjSiyuUBX/ggQfg5+eHBQsW8KEvYdpzVVUVgoKCeC+LMGxdVVWFW2+9Fa+++io/po0YMQJr1qzh511hXxUWfhN6lwIDA6FQKHDnnXdi7969+Pr
rrzt8/u+WOhkvv/wyRo8ejUcffRQulwsJCQkoKSlBUFAQX+figw8+4Gs3REZG8vnz6enpGDlyJEwmEy9XLVx1GY1GaDQa3H333fDz88Pw4cPh7++PxMREfpWhUCgwYcIEREdH4/LLL0dISAjPhI+KioLT6cRFF12EXbt2wc/PD4mJiTzhyul0Yvz48Zg/fz7UajWP+k0mE19KXqVSQS6X8/K2Dz74IB8jFw624eHhvJtMmAKk1Wp5JC0k5AjJNuHh4WCMISsrCyaTCTKZDMOGDWtSe6CxsZFfNR49ehRFRUW86z4/Px9arRZDhw7lGdsulwtDhgzB5MmT+XBVaGgoAECr1fI8goaGBj7WnpOTw5MWa2pqMGXKFABnV8+dNGkS8vLykJiYyJPghG7Mbdu2obS0lB/IhASmf/7znzhy5Ai/mhHWromKisKzzz6LxMREzJ8/n2foCyWehW5focaBcNVjMpn4GjjCsJMwxl9YWIiff/4ZUqkUy5cv5zkyQlB6zTXXQK/X85LmTqcTpaWlCA4O5mP+GzZsQHFxMbKysnD8+HE88MADPEs7Ly8PR44c4fkdKSkpqKur43krKpWK56LY7XY+Q8TlcvE6BH5+fnxqZnx8PK+FIJFIeDArlMc3m818tlBmZiby8/PhcDh4eeiZM2ciJCQEbrcbgwcP5pUwQ0NDMW7cOF4OferUqQgPD+dDTEajEX5+fry6rtANP2DAAKjVavz888/8al0mkyEmJgbBwcEoKCjArFmzcPz4cezYsYMPqxiNRiQmJvKDrkql4uuSCIF1QEAABg8e3GQcv66uDnV1dfj3v/+NzMxMVFVVoby8HG63u0lvhsVi4dMqhZoERUVFsNlsiI6ORkREBJ9aLvRQAMDEiRMRFhaGzz77DElJSTwAE9pZUFCATz/9FGq1GuHh4Rg+fDjCwsIQHR3Nkyjz8/Nhs9n4NERhiqSwgueJEydw2WWXoby8HOPHjwdjDPHx8Vi1ahWvRFtZWYmgoCCkp6fzZHNhWQCn04knnniCr0ehVqtx+vRpqFQqvsqpEFwIVV/XrVvHhz6EoU2tVovLL78cr7zyClwuF2JjYxEYGIisrCxYLBYEBQUhODgYERERfBVboarn5s2b8fXXX+P48eO8/sz27dv5Zywc54qKinD11VdDJBLhxhtvxM6dO6FSqSASifDuu+8iMTERqampCAoK4jMZFAoFVCoV7xEsKiriQUBycjLv5Xr44YcxefJkxMTE4JVXXoFCoUBdXR2fzqzX61FRUcGDpbi4ON7TaDKZMGPGDLz66qt8Fe9LL70UQ4cO5cfarKwsuFwu/Pzzz9i3bx8+/fRTTJ06FXa7nefxCOtWnTp1Ck6nE9nZ2Ty/rK6ujtd3ysjIQHl5OZxOJ18v65dffoFUKsW4ceOQnJzMK7mGh4fzWVBCD4GQDCqVSnHvvfeivr4ekZGRsNvtaGhogNFoxPjx4/nsPWHiQWlpKTZu3IiAgACkp6cjKCgIpaWl/EK3vLwckyZNQlpaGsaOHYvk5GRERUXh9ddf5+ddi8WCffv2YebMmaivr8fs2bORnZ2N06dPQ6/X8+NOeno6qqqqeK/u559/7l0AwLrJ3//+d3bxxRczkUjEV0Hs168fi4yMZFdccQX76aef2IIFC5jJZGIxMTHMZDIxvV7PZsyYwUJDQ5lSqWyy+qVQmVKosCiRSNiAAQNYamoqGzZsGIuNjeUr5gUFBfEqgWKxmMXHx/MKkp7PlZ6ezqZOncpuv/12FhkZybRaLZs2bRq77777eMXAsLAwFhERwVd8FB7r+SORSJhcLmcSiYQZjUZeaVCoTBoTE8OCgoKYTqdjV111FYuMjGQmk4mZTCZexU2oXCis5im897S0NCaVSllQUBAbMWIES0xMZDExMUylUvFV+EwmE399eKwQGxwczKKjo1lAQABLSEhgSqWSxcTEsOjoaL5Sq0ql4pUHTSYTMxqNbNiwYXz7JycnM71ez6s
GmkwmdtVVV7EFCxawiRMnsttuu42vgOjv789XDBTa4llJT+yxmqvnbUKVPGF7CttY2C4hISG8CqxSqWSXXHIJX/1Qq9WyqKgoFhAQwFcWveSSS1i/fv2YVqtler2ehYaGsosvvphX3dPpdEyv17MxY8awcePGsfj4+CZVMIUVZqdPn86kUilLT09nYrGYzZo1i2m1WhYYGMjCw8OZUqnkqxsKFQ2FlVwNBgOveCqsxCu8V+F9C5+xUK1U+D/P/VT4VyqV/mFFWM/9Rq/XM41Gw6sQSn9faVeo1hofH88rtcrlcnbppZcyg8HAq4yKfq9QKRKJ+OcTGBjI95OEhARebXHOnDlMqVSyUaNGsUmTJrFXX32VabVaFhcXx6tg4vcVd2UyGQsJCWF6vZ75+/uzd999l1eEjY+PZzExMczPz49FRkbySqESiYSFh4ez0NBQtnz5chYYGMjS09PZ0KFD2YwZM5hWq2WjR49md911FwsPD2eRkZHMYDCw9PR0vopxQEAAk8lkTKPR8N/lcjm78sor2fjx43mFTaHiYVBQEIuLi+OfpWeVTsnvK4YqFApeEVepVPIqu8I21+v17LnnnmPjxo1jgYGBTKlUsgcffJD5+/vz7dL8cxRWvRS2tefnKuw/QuVOpVLJBg4cyG666SYWGhrK3nrrLTZp0iReNTQiIoIZDAYWEBDAIiIi2Lx589ioUaOY0WhkCoWCTZw4kel0OnbZZZex6OhoXgVZLpez22+/na9y/MADD7CoqCg2cOBAvk8B4MfXyMhIXslX+Cz9/PxYSEgIr6Isk8l4hUmxWMyrZ3quhCwcH+S/r4JrNpvZlVdeyeLi4lhgYCBfvVuoLDtgwABegVWoFBoaGsomTpzIfvnlFxYeHs6rfU6fPp1XvxSLxSwuLo6vdJuSksI2bNjAX1v8+2q3wnYU2h0SEsL8/f1ZUlISu+eee9jkyZNZcHAwCwsLYxqNhoWGhvJVdMViMbvpppuYUqlkY8eO5fsUgCarUnt+viqVimk0GpacnMwkEgkzm81Mo9Gw8PBwFhsbyyQSCUtKSmLx8fEsKSmJaTQaZjKZ2L/+9S9+vBD/Xl1Y8nsl3ICAAObn58fCw8PZrbfeyjQaDT93CecG4bMLDw/nqwErlUq+ErXJZGJms5mFhoayd955x+tzf7cMl/zvf//D3LlzsXDhQrz99tu8i95ms8FmsyEmJgbl5eWoqamByWRCcHAwjzaFGu82m43P2hg4cCDq6+uxbds2pKWl8Wllu3btQnp6Or/Sslgs/Apd6C2Qy+W49NJLsXr1amg0Gr6SpdvtRmNjI2699Vbk5ubyRZMCAwP5mLYwpCN0AQtX/sJVmtDlKCR4+vv7w2Kx8KqfwrjpO++8g+eeew4LFy5EYmIinnrqKVRUVMBoNPKKbFqtlndN19fX8+GjkJAQ/nojR47E4cOHERISwovnfP311zyaFaauWSwWPvuhqKgI7733HtasWYNRo0ahf//+ePzxx3H48GGeiKlQKJCcnIyKigpMnDgRy5Ytg9lsRnBwMNLT0/n0wOrqaqSnp/Pky4aGBvzzn//E888/j5qaGsyaNev/2zvvqKrOrA8/gAJWQDoqRXrR2LArYo0VkiCKBUw0WNFkbDGTWDKosUUTGwE1EgtGRQUrqBHRqGABFEUhSlNBunREvN8f5LwDSGbUxJT5zrNW1gp6vZx77jnn3e/ev/3beHp6il1TbGwsGhoaQvGtra0tXB6luQ7fffcdsbGxzJkzB319ffT19Vm3bh2nTp0iJSWFdevW0atXL86dO0dpaakoRUhdGpJASkrtS5kySQAmTdA0NjYW/g5nzpwRLoJubm6cPHlSTKStqKgQbYZS66XUedGkSRNRBpNKPNJ1JulO2rdvz+XLl1FXVxetyhkZGeTk5IjZMKWlpSxYsIBvvvlG+BhIs3q0tLQ4cOAAH374IXPnzmXhwoW89957xMXF4enpiZGREUuXLkVdXR0LCwu
0tLTYvn07LVq04ObNm1hZWYmhSsOHD0ddXZ1Dhw6hpqZGYWGhKK2Zm5uTm5srds2NGjVi586duLq6imtPaoeTynOZmZlCw9KuXTsMDQ2JjY3l4cOH4vzk5+eL8y91CkjlIkBkCKWZIJLYVDLmkjJpktDNwsKCp0+fivZaIyMj7t+/T/fu3TEwMBBtnYpfBIKShsjGxoaWLVty6tQphg8fTl5eHufOnRODsPT09MREzzt37qCjo0NBQQGTJk3iq6++Aqp9ZhYsWMCaNWvo2LGjuA4UCgWpqamiDAjVWRZTU1PMzc1JS0tj+PDh7Nixg4kTJ/Ltt98Kb5j33nuP1atXk5aWhpmZmZjfNHr0aAoLC0WHXcOGDcUOtKKigq5du6Kjo4OKigoxMTHCE+add97h/PnzPHv2jHbt2pGZmYm7uzs3b94kODhYtPlLLYtKSkq0a9eOiooKEhISRNpdshGQvtcDBw6gq6uLk5MTVVVVZGVliY60CRMmsGLFClRVVSksLMTAwIDMzEyaNm0qnEWlUpl0DQEiayc9Ozt06EBycrJwV46Pjxf6GGdnZ9He6evry6hRo4iNjSU+Pl6Uv6RMmqQlUVVVxczMjKysLMaPH8/atWvFfJpdu3YxatQoWrVqRXZ2ttC1NW7cGA0NDVFWkp75UmeHjY0NFRUVYrJzZmYmZ8+epXnz5iJrWVFRIcrf9vb2Ytic5IpaWVkpyrrSfSIZ6yl+EcVK3SPSuWnUqBFVVVV0795dlMKTk5PFuiE1CAwePJg2bdoQEREhpgp/+OGHfP/995iYmAgjssuXL9cymZMErllZWTx58oScnBxRfpM6pySjR319ffbs2fPS6/8fEmTMnDmTr776irlz54qBT0lJSYwcOZK9e/eSnJxcS0Clr69P27ZtOXHihGirtLOz49mzZ6SkpNT64qHa1c/V1ZVTp04xadIkcaPv3r0bIyMjoqKiKC8vx9LSkh9//JEWLVpgZGQkBHlnz55lxYoVrF+/Hk1NTfT19UlJSRF90lBtPzt9+nRmzZqFvr4+ERERQojTvHlzUcpwcXEhJCSEJUuWsG7dOjQ1NdHR0RFj5aUWXRMTE44fP46bm5t4UEhWvs7OzkJ89+TJE1Ev3rZtG3369BFCn0ePHokJnADNmzcXC1tMTAxFRUXY2tqio6PD7NmzWbJkCXPmzGH37t2cPn2aRo0acfr0aWbNmiUcRh8+fCgcHyWtQ9OmTbGzswMgPj4eZ2dnrl27JjpkSktLycnJwd3dXbzOxMSEAwcO8Pz5c06dOiXS7VKApKGhIR5Skq1tcXGx8NWQdAJSACipz6XSlOS7IdV3vby8yM7OJjAwkObNmwsh7bhx49izZw/Pnz8XZaYLFy4wbNgwnJ2dOXfuHL179yYhIYEHDx6IGx+gV69eoqYfHR0tjknSAkgK85EjR3L69GkxGK1Xr178+OOPNGjQgFatWomOFS0tLVFOOnHiBB4eHkJroKqqKqYIS7oeyRxKas22tLQU5mhKv1hKS+UkaTCfurq6SEc3bNhQmO8UFxdjbm4ueuTNzc15+PAhRkZGXLp0CQcHByIjI7G1tSUhIQETExPy8/OF6ZSdnR03b95EW1sbZWVlunXrRnl5OQMHDuTWrVscPnyY9PR0pk6dip+fH3379iU7O5uUlBTatGnD48ePcXBwEK6x0kh1a2triouLSU9Px8jIiKysLOH0mZeXJ3QL6urqQvEvlaKkTjVdXV2uXbuGiYmJeJBLZnSSxumtt94iLS2N1NRU2rVrJ1r2zM3NmThxImpqavj4+IgOmjlz5lBWVoa/vz+5ublivHXNgWOSC6OknTAxMUGhqJ4+WlZWhq6uLgMHDiQyMlIERW5ubhw8eFC07TZq1Eh0GiQlJdG9e3du374tJrtqaGjQrVs3Ll68yN27d4U9/eXLl8nIyCAjI0MIQ9PT0xk4cCAxMTE8ePBAmIRJpYSmTZvi5OREWFiY6FpYtmwZW7Zs4eb
Nm7z99tvo6uoSGhpKTk4Ozs7OREZG0qpVK7KysujRo4doZZXa5CWx9KNHj7CysuLevXtCJCnpMCStlSQUluwD1NXVcXFxITg4WLSHN2vWjOfPnwsPkpKSEoyMjCgoKMDIyIj4+Hg2btzIypUruXDhApqammhoaJCamkqvXr1E2cbIyKiWKHL8+PEEBwezfv16FixYgLGxMXfv3qVXr17CfyQ8PBxra2ueP39OYmKi2DBIgt1r165RVVVVy01TEpACopV7xYoVrFmzhtmzZ3P58mUCAwMZOHCgCATz8/PR1tYW51Aa8S6VBqXgXPGLzgyqRbTPnz9nxIgRFBcX88UXXwixtOR+++jRI/T09NDV1eXy5cvY2Nigrq7O22+/zZ49e0R7dY8ePYiLixOu1RkZGWIju2zZMsLCwigrK+Pp06ckJiYKX4/Ro0fj4uLyyuv/H6LJUFZWZsSIEaL2GBoaSnx8PMuXL6d9+/ZERUVhampKly5dsLCwYPbs2Zw7dw6FQsFHH31EWlqauHClaY1S/fjJkyf07duXjh070qdPH06cOIGJiQlTp04VI8F1dXVRV1cnLi4OJycnIUSTamozZszgwYMHmJqaCkveyspKzMzM2LRpE+rq6sTExDBp0iRSU1O5cOECampq2NnZoaSkJBZHaXffqFEjgoODsbS0ZMOGDZiampKfn4+Ghgaurq40atRI+MBLLbGSsVHDhg25desWOTk54jMnJSURGBiIsrIyn3zyCbNmzRI2366urgwbNgx3d3dGjx6Ntra2ED6tX79e+HTMnDmT8vJy9u/fL8bIh4eH4+fnJ1qtpDkxUscGVN9Eq1at4smTJ3Tu3JnevXszefJknJ2dadmyJUuWLKF9+/a89957VFVVER8fj5+fH2FhYWIMuWSEJYno1NTUMDMzE10FgLDx1dHREeZFbdu2Fb3oUtvWZ599RseOHYWFb4cOHbCysiI3N5c7d+6I3X9FRQUuLi4cOnRI9KZLxlNSq6nkPOrl5UXv3r0ZMGAApqamVFRUkJeXR2pqKlAtTLS0tGTEiBH06dMHLy8vTE1NGTt2LPn5+Rw4cACFQsH3338vPm/jxo1xdnYWLWbSmO24uDgGDBjAmDFjRKdKXl6eMMExMzOja9euwvpayoJ17tyZtLQ0HBwcKC0tFYZqkg3+wIEDRZZFXV0dExMTSkpKMDMzw8LCAj8/P4qKiujTpw+pqamiXTInJ4eqqiq+/vprJk6ciLW1Ne7u7jx9+pTOnTtTVFRESUkJN27cEO146enp7Nu3j7CwMBYuXEhVVRWamppAtdIfEOMDGjduTL9+/ejbty9z585FX1+fu3fvUlRURHFxsdg5bd26lfz8fMzNzYWpkIqKCsbGxuKBW1lZiaOjIyoqKtjZ2fHo0SO++uornjx5Qp8+fcSEzKqqKhwdHWnXrp0QwklmTOXl5URHR3P9+nVUVVVJSkoiLCyM4OBgunXrJoLXlStXsmbNGoqKiti9ezfvv/++0A+ZmZlha2tLixYt6Ny5M4aGhsItMSkpSXTIJCcn4+/vj6GhIVeuXKG0tJRjx44JB97Hjx+Tn5/PuXPniI2N5enTp9y8eZOsrCw2bNhAUlKS8KrIy8sTfhVSEHnkyBGgulvL1NSUBg0acOXKFdLS0kRArqamxtChQwHE+0sZFlNTU7Zu3cro0aOFziwyMpK+ffvy9OlTTp8+DSDEnomJifTr14+33nqLLl26sH79euHZImUkACEe7tSpE6dPn2b48OHCxnvw4MFMmjQJNzc3GjduLEScUVFR2NjYYGdnR0BAAMbGxsyYMUOMqJ85cybJyck4ODiwatUq7t27J/RMUgDl6OiIp6cnGzdupGXLlhQWFqKpqYmRkRF5eXmoqqpiY2NDVVUVXl5eYmJtSkqKCEzU1NTIzs6md+/e6OnpMW3aNDHHycnJifHjxzNmzBjWr1/P/Pnz0dPTo2XLljx9+lQs8F988QUGBgacOHGCS5c
u4eLiIpxBpXuipqZEoVAwZcoUTExMGD9+PAMGDODDDz9k1KhR9OzZkxYtWpCSkoK+vj5NmjQhPDycTZs2MXHiRCEylzZv2trajBgxAm1tbQoKCnj8+DFXr16lvLycUaNGkZiYyLFjxxg0aJDw/ZCujefPn+Pg4ICvry+6urqcOXOG/v37o6KiQkhICOvXr8fJyQlLS8tXW/9fOWJ4DZR/GRS2efNmvL296d+/P7q6uqxYsYLIyEi8vb3JysrC19eXq1evCgGcu7s7S5YsQVdXVyjLnZ2dgWrzorKyMoyMjNi9ezdffvklW7duZe/evdy+fZugoCDRnlZQUICtrS02Njbk5OQIkZyWlhaNGjUS9rdbt26lb9++aGhoCH8ChUIhWj8l4Z5kuypZ6Ur/aWpqkpqayltvvUVZWRl2dnZs2bKFoqIioqKi0NbW5vnz5/j4+Ah3x9LSUgoKCmjWrBk5OTnk5OSI3Yc04EdNTU2YSWVkZIiyyrJlyygsLOTw4cNC8FVcXMzUqVNrWTN37twZS0tLBg0aRFZWFgMGDBA7MH19fdzd3WndujW9e/dGSUlJmAJpaWmhpqbGjh07UFVV5fz582RkZLB48WIOHTrE4cOHCQgIICQkhH/+8588fPhQzJbYs2cPPXr0ICMjAzU1NcaOHYutrS26urooKytz48YNoYqWnE6lTAFAs2bNePjwIS1atBBtyU2bNuXHH3/EysqKgQMHMmjQIM6dO0dERAT+/v5oa2uLXXFcXBw//PADrVq1QllZmbi4OA4ePEhxcTG9e/emuLiYTZs20bhxYzZv3kxkZCQ3b94U5QHJA2HMmDGiDfSjjz5i3rx5FBcXk5qayv79+2ndujXGxsZkZ2czaNAg4QdSVlZGUlISenp6XLlyBT09PaysrGjUqJHoOpDcRaW0pJqaGomJieK4TE1Nadq0qeiCaNKkCc+fP0dXVxc9PT3Ky8tFoCF5rkjpXKlFWLIw/8c//iGyBE5OTjg6OjJ69GgyMzPp3r07a9asQU9Pj4sXL9KgQQOxi9TQ0MDGxgZlZWU6dOiAtrY2PXr0QKFQYGVlhaamJnp6ehQXF3PmzBmePn3KmjVraNiwochWnDx5kp9//pmRI0eir6/PiBEjxCwFyU3y+++/R01NjdjYWJ49e8a+ffvEOdHX1xfKeqg2LMrKyqJDhw4EBweLlm3p+6+qqqJJkyZcv35ddO+UlpbSsGFDbG1t8fDwoEmTJiKToa+vT2FhIceOHcPDw4OioiK6d++Op6cn48aN44svvsDY2Ji0tDSx2E+ePJm0tDQxhKxZs2ZERERQUFAgsmXSEC4/Pz8aNGjAlClTRGtlhw4d6NOnD7179xbZTjc3N8rKyvj888/55ptvMDQ0ZO3atSKjKQVn48ePp02bNmLYmrTRWrZsGUOGDBFzPKRgLSwsTBjw5efn895773Hr1i0GDBiAvb29KD1mZ2czdOhQjh49irW1NZ06daJDhw6MHz+eLl26UFBQwLFjx7h9+zZnzpxh06ZNlJeXU1JSwpMnT3jw4AHl5eV07txZCBH79u2LiooKgwcPFiUNSfRuYWFBWVmZGHNuZ2cnynBNmjTBy8sLBwcHka2Vsnampqb4+vqKoW56enrcv3+fiooKhg4dyty5c2uZVEkbR2VlZS5evMigQYNQV1cnNTUVb29vevbsyf3799HR0WHJkiUYGBhgaWmJoaGhyIK6uLigoqLC+fPnuX79uhgBkJ2dTVZWFm3atCE7O1sIevPy8ujSpQvvvvsuEydORElJifbt24syqXT/S/bze/fupbKykkuXLnHo0CFxjiS5wKxZsygqKiIzM5NRo0axYsUKfvjhBwIDA0lKSsLS0pLevXtTXl5Oeno6DRs2xNzcnOXLl+Pq6opCoaB///4MGDCAhg0bEh0dTXJyMgYGBpiYmODl5YWNjQ2hoaFiDouWlhbvvvsuH374Ib169aJjx46UlpayatWqV1r/G/xegcR
/YvXq1QwcOJCFCxdSXFzM8OHDKS4uRkNDQ7gtPn78mFmzZjFu3Dhx0R8/fpzu3btTWFhIaWkpz5494+bNm4wYMYKSkhKRyrKysmLq1KlMnjwZgDlz5rBhwwYePnyIl5dXrWM5fvw4mzZtEj//4x//YOPGjbRq1YqtW7fi5uaGvb29mGoZEREhpnRKbUGSoZakBHZxceHOnTvMmDEDqK41btmyBX9/f3766Sd69uwJwOXLl1m7dm2tY8rMzKSsrAxvb2/Cw8Oxt7dHTU2NYcOG8f7772NnZ4eRkZGwiDU0NGTmzJlUVFTw7bffsmXLFhFwSa81MjLCycmJzZs3ExgYSEFBgRhqNGvWLKZMmSJ+/9mzZ/npp5/o0qULAIWFhaxdu5bjx48zdOhQzp07R0xMDAMGDGD16tVUVVXh7+/PsmXLcHBwECm7Hj16YGVlxddffy3eOy0tjb59+1JZWSnafocPH050dLRQ2ks1eskcLT09XQQdUmuzdE6h2jDK2toaCwsLHB0da3237777LomJiTRv3py4uDi0tLS4d+8ec+bMwc/PjxYtWgiHSqk919HRUXyfkjuslpYWQ4YMYc+ePdy6dQt7e3vU1dW5cOEC169fFx0/qampYgLwtGnTUCgUbN26lXv37tG0aVOhC3J2dkZPT0+oua9du4auri5paWncunULhUJBXFxcLSOiJ0+eCHfTZ8+ekZubCyBKTNI5efz4MQAPHjwAED9LrcGXLl0CqtP8lZWVbNy4kczMTFxcXIRNtBTIJicnY2pqyv3798UsDMkV98aNG5iYmDBt2jTCwsJo1KgRt2/fRktLi7CwMOGtoKSkRHR0tEjdSr4H2traBAYGEhYWRnx8vAhSpM8v+cVI51Wa1wDV+oTc3FyuXLki5hrV9Llo0KABI0eOpGHDhly4cIHi4mKuXbtG48aNadeuHWfPnhUlh3v37ongZ/jw4eTk5HDjxg1Rp09LS2PEiBEoFApOnDghfDM+++wz+vfvL/581KhRVFVV0bVrVzIyMoT25tmzZ5iZmZGcnCz8NKytrYWdfPPmzdHR0eHRo0fcu3cPOzs75syZQ1RUFMOGDSM9PV187nnz5rF7927u378PINpqfX19mTZtmrju09PT0dXVxdLSklGjRgkPjT179og2e4VCQWlpKQYGBuzbtw9jY2MeP37MiRMnMDY2xsPDQ9j8q6ioYGZmJsp0DRo0IDExEXNzc5EpTU5OJj8/n+LiYkaMGMHRo0dZvHgxs2bNIiYmBj09PUpKSoSmrUuXLhQXF3P79m0iIyMpKirC2NiYCRMmCFt7R0dHke2TurueP3/OkCFD+PTTT9mxYwfNmjUT2ScVFRVWrVpFZGSk0Ev5+vri4OAgTLEkB04pY3jz5k127NhB06ZNWbduHQ8fPqRPnz5cuXKF4OBggoKCKC8vZ9euXTRu3Bh9fX00NTW5cOECrVu3pnXr1qK7TZr/06xZM/Gdnj9/HhsbG3R1dWnfvj2enp4ALF++nFGjRomuJMUvxlsODg64uLjw5ZdfYm1tTXx8PMbGxuzZs0ecIykrkZ6eLkz/PDw8hPdLbm6umM0l6ZcUv7RoS2vNzp072blzJyYmJty+fRtzc3M8PDwYNmwYOTk53L17V7gFr1ixgqioKMLDwzE0NCQpKYlbt27h6urK6tWrRXnoZflDggzJT97V1ZUtW7awZ88e0tPTOXr0KO+88w79+vUTJYzk5GRSUlKwtbWlY8eOAFy4cEEIB6Ojo1FWVmbUqFEUFRXRv39/rl27RmpqqhixC+Dj4yMmhkpUVlbyww8/8MEHH4gaofTwjo2NJTQ0lNDQUJKSkoSR1f3799HV1cXKyopevXrRrVs3tm3bxtdff820adOwsbFh6tSpBAYGkpGRIX7XO++8AyACDKj205g7d26tc5OQkMDevXvp2bMnp0+fZvr06Xz33Xf07NmTVatWcf/+fZKTkykqKqJRo0ZkZGTg5OQE/Dvd2q1
bN6ytrYmJiUFHR4fMzEzRburl5UXz5s0ZMWKEmGvx6aefCsc2W1tbrl+/zrBhwwgPD2fx4sV4eXlx5coVvLy8uHjxIjExMcyaNYuQkBB27tzJpEmTWLZsGdu2bWPAgAHC4vry5ct8/PHHpKenc+DAASwsLDh16hTW1tY4ODjQqVMnAgMDadmyJf369RM19kuXLqGtrS1aR1u0aCE0IlL9VlNTU1gWSw54urq6tc6ll5cXaWlpJCUlERUVxeTJk5k0aRLdu3dn6dKlHD58GF9fX8aMGUNmZibt27dnz549vP/++wCcOnWKiIgIlJSUePjwIe3atcPKygpdXV127dpFQkICWlpaxMfH061bN8aOHUtISAhQrVXx9PQUWbPQ0FDhHzBixAg8PT0pLy/HyMiIoqIinJ2dxRRFqLaWr6qqIjs7m4kTJ7Jv3z4hyouPjxc7Q2nAn+RmKM3nkXQk0i5W8qwoLy8nLy+PTp06oaKiwoQJE9i0aRMGBgbC6vjOnTsidZyYmMjo0aNxcnJi2rRpODg4UFFRwSeffEJ4eDgWFhYADB06lPHjxzNixIh67/kzZ84QHBzM5s2bxZ9FRkYyadIk/Pz8ROunlBG0trYmKSlJmI4FBARgZmbG/fv3adu2LZcvXyY5ORkjIyOsrKzE8CgdHR1GjhwpNhheXl48ffqUSZMmERERgaGhoZjX0aRJE0aOHMmhQ4cwNDTk0qVLaGhocPXqVZSUlMjIyOD58+ccOXKEK1euoKWlRVBQkCijtmnThvj4eABycnLIy8sjKCiIwsJCLl26hKqqKo8ePeLjjz9m3bp14nN37tyZvLw81qxZQ9OmTXF2dhZeILa2trRt21Y8PzQ0NESLoIWFBTY2NiIj8msUFBQwfPhwtLS0uHHjBhcuXMDAwIAjR47g6enJtWvXcHZ2plOnTly/fp2mTZvy+PFjunTpIkrUGzZs4OTJk9jZ2eHh4cG1a9fQ1tYmKioKqHYadXZ2xtHRkT179jBhwgQuXbqEiooKa9asoUGDBqxatQoVFRXGjRtHTEwM5eXlVFZWYmNjg42NjdD4aGlpCRFoRkaGuAcWLlzIzJkzefz4McrKyiQkJLB27VqmTJlCTk4O3t7e+Pv7M2DAAAYPHoy+vj6ffPKJCMoGDx5M165dxXmRBjLm5ubW6+vw2WefsXTpUpKTk+nSpYsIIq5fvw5Ur1tvv/027du3R0lJSWgV2rdvL+wCpBbk7OxsmjdvjpubG/379ycwMJDy8nL8/f3FfKSMjAySkpJwdnZm586duLu7Y2hoSEJCAuvWrcPX1xc3Nzehz5HOkeIX87oxY8aI4P2jjz4iOjoaIyMjunXrxpMnT0RGQxpJIF07il/mr0RHRwuR7NWrV7l8+TKdO3dGW1ub/Px84cKcnJxM27ZtxXo2e/ZsodmKjIxESUnplYSff0iQAbB27VoCAgJ49OgR48aNQ0VFhePHj2NqaoqXl1et3b2KigrDhg3j4MGDbNq0iTlz5lBaWoqtrS0XL16kb9++QLUwb926dcTHx7No0SIxXVWisrKSuXPn8vDhQ+zt7blz5w6mpqbs2rVLvGb69OkAtablSXXl/v37s3Xr1hc+S1ZWFpMmTaK0tJSpU6cCvJAxeVny8vKENuP8+fPcuXNH/BwaGkqXLl2ws7NDXV0dLS2tF86VlZUVAQEBYtzznDlzhB2zxI0bN1iyZAkpKSmoqqqydOlS8Xd1gxxJaJSVlcXp06fFQv7s2TPmzJlDx44d+f7774W19bVr14RbXrt27UhLSxMDoEpKSoiNjWX69Ol4eXlx8OBBwsPD8ff35+LFi1y6dIkWLVrg6OiIo6OjWJhNTU1JSUkRfeWSLuDmzZt8/vnnhIaGcubMGXGsNZEMhMLDw8nLy2P58uX4+PiIvnups2Xt2rUoKytz9epVvLy8WL16Nbt27WLbtm1UVFQQHR1NXFycCFxHjhxJQEAAJSUlVFRUsHbtWm7cuEFoaCi
DBw/Gzs4OT09PJkyYwK5du8jOzmb79u307t1bHJu6ujqZmZl88MEHBAcH07NnT9q3b09cXBypqakUFBSwdOlSsds+cOAAgYGBtGvXDnNzc5o3b05+fr6YCty1a1fu3btHixYt6NOnD7m5uWLhHD58uFC+9+jRQ2hFCgoKmDNnDhkZGYwaNYrjx49jaGhIv379allC//DDD5iamnLjxg2h+jc1NSUzM5PCwkLOnj1LcnIyQUFB9T5wgoODOXXqFGPHjhWZl5CQEFxdXUlJSWHRokV88cUXdOnShUWLFnHw4EFhpDRy5EigOiiXArSTJ09ibm6OpqYmcXFxdOjQgVu3bjFx4sQXfreqqio7d+7ko48+4sSJEygpKTFz5kz2799Pfn4+6urqwgtn6NChLF++HE9PT4KDgzl79iyOjo5YWFjg7u4uruXg4GCCg4NJS0tj5syZJCYmitk0JSUljB49mpiYGHGv1MTHx4fc3FzWrFnDoEGDWL16NX369GHv3r3Y2dkRERGBj48PUN2JJ1FYWEi7du3ErhWqn2njx4+vJcCbP38+vr6+JCQkEBERgb29PZcuXaK0tBRTU1MiIyNZsGAB7u7uwrlUMuCC6vLT/Pnzad++vdilLl26lP3792NlZSUMA5WVlfHy8sLT0xMlJSV8fHx4/vw5S5cu5eTJk+zfvx9jY2O6devG9u3bGTx4MGFhYS98P/UxceLEWoFETfLz8wkPDycnJ4fw8HC8vLzYunUr27ZtIyQkBFNTU4B6RYnS91wfdZ/ZCQkJtG3bluzsbBITE7G3t68VLD548AB7e3vhqSS5Lffo0UNk72o+P2tmX7Zs2YKTkxNTpkxBT0+Prl27igA9KioKQ0ND2rZti4eHBwCDBg164Xg9PDx4+PAhp06dIjo6GldXV/bs2UNoaCht2rTh888/p3Hjxnh7exMYGAggAgXFL664KioqzJ8/n6+//lqUQq2trcXvkDawNXn8+DHz588X57vm61+GPyzIUFVVZcaMGSQkJDBjxgymTJlCaGiouLlqoqSkJOY8jBs3TqTSs7KyyMnJ4aeffgKq05Dh4eEMGjQIf39/hgwZIi5QhUJBUFAQlpaWBAUFMWTIEI4fP87s2bNr/S6pzuvj4yPqxmvXrsXMzKzezxEeHi6cShs2bMiKFSuEydXr8O6775KRkSHEQVIE6eLiwpYtWwgNDWX8+PG1Shw1CQwMZPv27eLnGTNmvBBk+Pr6sm7dOrS0tMjPz2f+/PkEBAQALwY5UH2BL1myhPDwcD755BOgepdQc1f6+PFjunbtKjowhg4dKnavo0eP5ueff0ZDQwNAPLikzwrV5724uJiCggI6d+7M8ePHKS4upn///igrK5Obm0tpaalQ80sOfFFRUS8ca01UVVVrndNdu3aRmZkpFg5DQ0P09PQICQmhWbNmYs6HJPKMjo4W12XNzFjN8zx9+nR27twpFs82bdqgrKzMmTNnhLjR3t6esrIycX3WPL6CggJ8fHw4ceIE7u7uopPk6dOnYucYERFBbm6usCOXatlPnjyhuLgYExMTUcaSOq709fVFl86DBw/IyckRAZuOjg52dnbigSEtMBEREURFRdGzZ0+cnZ155513CAsLEy3AUtp14sSJrFq1SnyWI0eOoKSkRHJyslgEawYbp06dAv5d5iotLWXevHnY29uzYMECoqOjcXR05N69e0yfPp2uXbvSvXt3EhMT8ff3f8FWOSMjg59//pkuXbpQUVFBfHy8sGr+NVavXk1eXh5nz57l/PnzeHt7ixkVn376KaGhofj5+WFpacmsWbP47rvvsLe3JyIiAn19fXbs2EGTJk2IjY1lx44dKBQKhg0bBsBXX33F8OHDUSgU3Lp1i9atW3PlyhVOnz4trikJLy8vwsPDUVJSYs2aNbRo0YJ//vOfHDt27IXSh5+fHwDl5eV888033Lhxg+Dg4Frv9+GHH9ZaUAMDA5k4cSJOTk5Mnz6dpKQkSkp
KePvtt1m/fj3ffPMN+/btEx1gjRs35tSpU+K79PLyokOHDuzbt48vv/wSgMWLF7N48WIyMjLYtm0bYWFhxMTEcObMGRwdHUW3lZGRERkZGcTExGBlZYWysjKxsbHivnhZ6gYS8O97u+4zMiUlBfj3vTpr1qyX/j11kQKNiooKgoKC2LZtG02bNuXLL7+koqKi1rFIJcmff/5Z/HupRKGlpVXrmSSJ9+sGTVu2bCE1NZU7d+6wZs0aqqqq8PT0rPX56l77EkFBQUycOJGYmBi8vb2F3tDOzg4dHR0xIdXb2/uFAEqa9vz06VO6du0qdI59+vT5r+coLS0NeP3z/YcFGRJ1d8g1e+YlOnToAICDgwP+/v74+Pjw9ttvc+HCBVFWUVJSEqUBqE6b6unpYWNjIx5sGhoa3Lt3D39/fyorKwkICODmzZviRrl69apQRNfcQfwnMjIycHBwED/XrL2+DnUvhpo/u7u7s2jRIuGOV9/FJ2VdJOo+4KDa0VOyM9fS0hKdI/DiDSz9Dsl/X6LuLmHlypUMGTKEjh07kpKSwpgxY4iOjgaqF+FNmzaxYMEC4N/fp/TZpAW/Q4cOZGZmitkWaWlpHDx4EAsLC/Ly8tDV1SU9PV3UJgcOHIiysrJwHKzvfCh+GbInfb/W1tZ4enqKa+LZs2ecO3eO06dPC48C+O/XZc3znJaWJnrMw8PDefDgAQcPHiQgIED4vPTp06feY3R0dCQjI4NOnTqRnJwsnCHNzc2xtbUVn2/q1KlC3Pjo0SMqKytFLVtZWZmSkhKKi4u5ceMGBQUF3L59W7xPw4YNadmyJUlJSSQlJWFoaCjaRseNG1drgakZZEZFRREaGoqxsTGdOnXixo0brFy5kp49e+Ll5UVUVBSbN2/mypUrXLp0ifLycj7//HO6du36wv0zYMAAjIyMsLGxISgoiOjoaGF/bWtry4MHD/Dw8KCqqooFCxaIlklfX99axyeRmJgofBuys7Pp0aMHx44de+H7r/udff/99+Lev3XrFg4ODmRkZNChQwdCQ0M5fPgww4cPx8bGhrKyMnR0dFBVVcXf359hw4axcuVKZs2aVeu5AtXBYvPmzWnTpg2ampp8+umnPHnyhE6dOolrqiZSd8Hy5cuZPn06ixcvJj8/n7Fjx4oJqFC9cdq5cydHjhzBx8enViasvmtRoub5j4yMxMvLi3PnzvHdd9/RqlUrli9fLhxjCwsLa12XOjo6LFy4sNYGbO7cuZw+fRo3NzeOHj0qtC93797l4MGDODg4sHr1asaNG0fLli3p2bMnsbGxKCkpkZaWJu6Ll+XXnkPw4jPy2LFjL2RZX5fr16+zc+dOMVflvffeIyQkhISEBBYuXFjrWNLS0sjOzubBgwfidyoUChISEmjevHmt1/5a0GRhYUFpaakY5Ni2bdtXyoLXfN/KykqGDBlCRkYGiYmJtbKAdTE0NGTnzp1MmzaNwsJCkpKS6NWr10v9zvqy2q/CH+KTUZPbt28THh7O5MmTadq0KSEhIa/Ve1sfK1asYN68eWIo1ZUrV5g0aRLffvttrUXmzp073L59Gzs7OyZPnixGq//V0NPTY8mSJdjb2wPVgVXdyNPHxwdXV1exqB48eJCNGzfWeo2Li8sLotTDhw//pmPbuXMnly5dYvPmzaJc9ezZM0JDQzl9+jShoaHMmDGj3tSalMrbtWsXubm5GBkZ0bZtW/bu3YuGhgaampq1popK/29paYmSkhK9evUSgqq6PH36lICAgFf+fv/bdVnzPC9btoxr164xYcIE8fcv+6AIDAzkwoULQPXCKQk1pZZsSfujUChqva7muYiLixOfVQq6DQwMxPtIkznV1dU5e/Yss2fPpmPHjuzevZtx48ahpKQkzp/0XQB8/fXX4ruAat1BaWkp3377rTgnK1euxM3NjaysLK5fv46enh7x8fFs3bpVdDsA/Otf/2LDhg2Ym5tTWlqKpaUlqampogwhmXANGjSo1jE
AtY5P4unTp3z88cfcvXuXFi1aiIzC65QpXVxcuHr1KsrKyhQXF2NgYEB+fj4rV65k+/btHDp0SOiCpGFwW7ZsqfUejo6OKCkpsXDhQmE3ffDgwZeyXLazs6N9+/bimZSamsrFixeBav1Gu3btcHd3FxnA7777rla5pF+/frWyH78VqZPk+vXrQgfXpEkT1NTUuHfvHq1btxabvQ0bNtT6tzXvm+Dg4FrvAa9fRv5P/J5riJmZGYsWLWL8+PHMnj27Vra2LtK9m5iYKDa3BgYG4p6t+Vl/7Zr29vbGw8OD7t27C5+nl8XDw0OIwauqqrCzs2P//v3s379feDb5+flx9OjRev99zWejqqoqX3755e/ybPxv/OFBxptEMgKSmDFjBvHx8YwbN67W67y9vf/oQ3stDA0NcXZ2xsnJSTxk6h77yyyqdS94eL2b//Hjx2zdupUHDx7QqlUr1NTUhJA1JycHHx8fgoKCcHNzY8OGDSxatEiUZerDx8eHlJQUdu/e/asB0l+F1w1e3gQtW7bEycmpVqmgvnLajBkzcHV1pWvXrkRHR3Po0KFanVV1mTZt2guLaV327dvH0aNHCQkJwdLSksuXL+Pj4/PCvzMzM2P8+PFi52VmZoa7u7soT0L9wcR/4sqVK+zfv59nz56xdu3aX00r/zcCAwN/tXuqrmjz448/ZuXKlSKYq/keNXmVzzJ9+nSxmD169AgDAwMRUNT3vpmZmS/UxOvLlrwukh9CTfr16ydE1wYGBri6uhIbG/sfF+G/I6GhoZSVlXHgwAESExMJCAgQnXZvAicnp9dej1atWvXCdRAUFMSiRYuA6g3I/v37hb9JXTIzM9mxYweHDx/Gzc3thSaEN8UfXi55k9R9EKioqAjF7N8xlkpNTeXw4cOcO3cOY2Nj5s+f/8JrJK3Lf+L32k3MmzePBQsWYGJiQlpaGitXrhR/p6OjI3Zx+vr6YgS4NCm17ncD1WLgLVu2sHDhQuzs7Fi7du3vcpxvgpc5z38UycnJHD58mMjISFq3bi3KUnVZt24dAQEBHD58GDs7O2GP/WvUbD/+Ndzd3XF3d+fYsWNisnL//v1rZVWgOhjZuXOncKwdM2YMx44de6XpjTWpKcasqyl4VX6tewrg/v37Ir0tqfrru3Z/yz21d+9eUeIqKSnh+fPn3Lt371ffd+bMmUBtvdDvSXBw8AtBxpAhQ1i2bBmjRo3i3XffFe2c/2ucPHmSzZs3ExERwVdffcWOHTveaJDxW9ajutoIHx8fqqqqhJGjQqEQLct1GTt2LK1atcLDw4PU1NQ/LMCA/7EgQ6FQvKC3MDIyeilxy18RyXv/6tWrIq3+Z6KioiJKN3Z2dvXWhgHRdaNQKJg6dSrHjh2r9/hVVVVfEOLK/HfqXhc1S0t1X/cqgVF9i+mvMWzYMIYNG8bkyZNxcnJi7NixtQIIqWNIcpkNCQl57QAD/r3Q1qcpeB1+rc5cU5wM/25F/z2RpsFOmTKFzz77DHd391c61vp0bL+Ful1AkltoQUEB3t7etWwA/teQ9CPp6ekkJCTUaoF9E/yW9ai+60AaGyHxa900HTt2JCYmhrNnz1JSUvJav/91+Z8ql9SX0pZumL8jdXdvUg37z+LcuXOi/KGkpMQHH3wgHFhrsmrVqlreG927dxetwjK/nT/zuqjp2aBQKNi4caOwt/478Sa1Yf8NBwcHMWdDEuvW7SD5I4+1ZpksPT2dHTt2cPXqVWbMmIGfn59o0/8znz1vit9S9nod6mb8XoX6roMhQ4a81PtdvnyZTp06cfXqVVHyftn24t/K/1SQ8b/GH30DvCrSpMC6vP/++8JQrKb3hszvw595Xfxe+p7/zzRv3hwNDQ0mTJjAqVOn0NLS+q+GW2+Sf/3rX9jY2BASEiJEvLdu3ar1mr/as0fm1airV/zggw9qWR+8Sf6nyiX/a/zVH94zZ86sVwj2a94bMr8Pf+Z18Ve/Jv8ObN++nZCQEIKCgmj
ZsuWfrkXavn07n3/+OTt27MDHx4fOnTvX6hSS+ftTN9tR08bgTSMHGTIvTadOnURLqkKhqNXfX5P/1PMuI/P/HTc3N9zc3PD29qZ3794sXbr0N+lVfiuSSPfs2bPCrv7vWmKWqZ/69Ip/FHK5ROalqZty++yzz/D19f0Tj0hG5u+HNE9iw4YNoqvlr5Dp+71EujJ/Pf7MFnw5yJCRkZH5A/mra61kZH5P5CBD5qXZvn07H3zwAaGhoQQEBNCnTx/mzZv3Zx+WjIyMjMxflFcbDC/z/xppBPKRI0c4cuSIsLiVkZGRkZGpDznIkHlpsrOz652uKiMjIyMjUx/yKiHz0kjTVSVLWmm6qoyMjIyMTH3IQYbMS1NZWUlVVRVPnjwBYNKkSX/yEcnIyMjI/JWRgwyZl8bPz4+FCxeyZs2aP/tQZGRkZGT+BshBhsxLU3e6qjRhVUZGRkZGpj7kFlaZl+b9999HSUlJTP38temqMjIyMjIyIGcyZF4BW1tbSktL6devH5WVlSxevPjPPiQZGRkZmb8w8uwSmZcmISGBvXv3ytNVZWRkZGReCjmTIfPS1J2u+meOp5aRkZGR+esjazJkXhp55oKMjIyMzKsgBxkyMjIyMjIybwS5XCIjIyMjIyPzRpCDDBkZGRkZGZk3ghxkyMjIyMjIyLwR5CBDRkZGRkZG5o0gBxkyMjIyMjIybwQ5yJCRkZGRkZF5I8hBhoyMjIyMjMwb4f8Ay5DPKqo76S0AAAAASUVORK5CYII=", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import scipy\n", + "\n", + "den = scipy.cluster.hierarchy.dendrogram(\n", + " clustermap.dendrogram_col.linkage, labels=distance_matrix.index, color_threshold=1.5\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "1902446f-8dcf-47fb-bd03-b3e31b40124c", + "metadata": {}, + "outputs": [], + "source": [ + "# extract clusters and perform gprofiler\n", + "from collections import defaultdict\n", + "\n", + "\n", + "def get_cluster_classes(den, label=\"ivl\"):\n", + " cluster_idxs = defaultdict(list)\n", + " for c, pi in zip(den[\"color_list\"], den[\"icoord\"]):\n", + " for leg in pi[1:3]:\n", + " i = (leg - 5.0) / 10.0\n", + " if abs(i - int(i)) < 1e-5:\n", + " cluster_idxs[c].append(int(i))\n", + " cluster_classes = {}\n", + " for c, l in cluster_idxs.items(): # noqa\n", + " i_l = [den[label][i] for i in l]\n", + " cluster_classes[c] = i_l\n", + " return cluster_classes\n", + "\n", + "\n", + "clusters = get_cluster_classes(den)\n", + "# extract functions for clusters\n", + "cluster_process = {}\n", + "for c in clusters:\n", + " cluster_df = pd.DataFrame(clusters[c], columns=[\"gene_symbol\"])\n", + " res = get_gprofiler(cluster_df)\n", + " cluster_process[c] = res[res[\"p_value\"] <= 0.05]" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "cdb473cb-93a3-4752-9612-5a58fc81cd30", + "metadata": {}, + "outputs": [], + "source": [ + "# save grpofiler results\n", + "with open(\n", + " \"figures/pert_embedding_cluster_gprofiler/scETM_replogle_pert_emb_gprofiler.pickle\",\n", + " \"wb\",\n", + ") as handle:\n", + " pickle.dump(cluster_process, handle, protocol=pickle.HIGHEST_PROTOCOL)" + ] + }, + { + "cell_type": "markdown", + "id": "dc6d699c-ee76-487e-80ff-807f1c112995", + "metadata": {}, + "source": [ + "## PR Curve+AUC\n", + "- AUCPR using prior graph as binary label" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": 
"62640fc7-412b-4a40-88da-bbd35fb23749", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " AUC\n", + "StringDB 0.252574\n", + "CORUM 0.240762\n", + " F1 Threshold\n", + "StringDB 0.278519 0.818003\n", + "CORUM 0.264378 0.839899\n" + ] + } + ], + "source": [ + "aucpr_df, f1_df, pr_dict = auprc(distance_matrix)\n", + "print(aucpr_df)\n", + "print(f1_df)\n", + "aucpr_df.to_csv(\"figures/pert_embedding_recall/scETM_replogle_aucpr.csv\")\n", + "f1_df.to_csv(\"figures/pert_embedding_recall/scETM_replogle_f1.csv\")" + ] + }, + { + "cell_type": "markdown", + "id": "8fdff66a-8f20-4268-8520-d33a0fa4969a", + "metadata": {}, + "source": [ + "# Interpretability of our learned latent space" + ] + }, + { + "cell_type": "markdown", + "id": "1ec54f80-8a0f-41f0-8c55-edd2b1fbfb3c", + "metadata": {}, + "source": [ + "## Factor Enrichment\n", + "- GSEA on log transformed latent-by-gene factors\n", + "- Then associate each perturbation to its top latent factors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35eb8169-15b1-4b5b-b926-d5b7238f372a", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "factor_to_go = factor_enrichment_gsea(adata, np.abs(adata.uns[\"gene_emb\"]), fdr=5e-2)" + ] + }, + { + "cell_type": "code", + "execution_count": 200, + "id": "814189ce-53f3-4716-834a-a4f0ea6efcf9", + "metadata": {}, + "outputs": [], + "source": [ + "# filter and add description to processes\n", + "go_df = read_aws_csv(\"s3://pert-spectra/references/GO_to_Description.txt\")\n", + "go_df.set_index(\"Term\", inplace=True)\n", + "go_dict = go_df.to_dict()[\"Description\"]\n", + "\n", + "filtered_factor_to_go = {}\n", + "for i in factor_to_go:\n", + " proc = factor_to_go[i]\n", + " proc[\"descr\"] = [go_dict[x] for x in proc[\"GO_ID\"]]\n", + " filtered_factor_to_go[i] = proc" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a06b9724-d2a6-41e9-a38e-aac8d9f10818", + "metadata": {}, 
+ "outputs": [], + "source": [ + "# save latent enrichment results\n", + "with open(\n", + " \"figures/factor_enrichments/scETM_replogle_factor_enrichment.pickle\", \"wb\"\n", + ") as handle:\n", + " pickle.dump(filtered_factor_to_go, handle, protocol=pickle.HIGHEST_PROTOCOL)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47efd34e-e012-42b3-aa20-b5f7bc597832", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pertspectra", + "language": "python", + "name": "pertspectra" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..b966356 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,20 @@ +anndata==0.10.9 +boto3==1.35.23 +botocore==1.35.23 +gprofiler-official==1.0.0 +gseapy==1.1.3 +matplotlib==3.8.4 +numpy==1.26.4 +opt_einsum==3.4.0 +pandas==1.5.3 +scanpy==1.10.3 +scETM +scipy==1.14.1 +scikit-learn==1.5.2 +statsmodels==0.14.2 +tqdm==4.66.5 +umap-learn==0.5.6 +--find-links https://download.pytorch.org/whl/torch_stable.html +torch==2.2.2+cu121 +tensorboard +plotly diff --git a/scETM/README.md b/scETM/README.md new file mode 100644 index 0000000..d13a743 --- /dev/null +++ b/scETM/README.md @@ -0,0 +1,8 @@ +# [scETM: single-cell Embedded Topic Model](https://github.com/hui2000ji/scETM/tree/master?tab=readme-ov-file#scetm-single-cell-embedded-topic-model) + +``` +Zhao, Y., Cai, H., Zhang, Z., and Li, Y. Learning interpretable cellular and gene signature embeddings from single-cell transcriptomic data. Nature Communications, 12(5261), 2021. +``` + +Training scripts for scETM to use as a benchmark. 
+Link to paper: [paper](https://www.biorxiv.org/content/10.1101/2021.01.13.426593v1) diff --git a/scETM/scetm_inhouse.ipynb b/scETM/scetm_inhouse.ipynb new file mode 100644 index 0000000..afe889c --- /dev/null +++ b/scETM/scetm_inhouse.ipynb @@ -0,0 +1,316 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "41d53f3e", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6e32fbd", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "import anndata as ad\n", + "import numpy as np\n", + "import pandas as pd\n", + "import scanpy as sc\n", + "import torch\n", + "from scetm_utils import read_aws_h5ad\n", + "\n", + "from scETM import UnsupervisedTrainer, scETM\n", + "from scETM.batch_sampler import CellSampler\n", + "\n", + "sys.path.append(\"..\")\n", + "from utils import (\n", + " filter_noisy_genes,\n", + " generate_k_fold,\n", + " write_adata_to_s3,\n", + ")\n", + "\n", + "sc.set_figure_params(\n", + " dpi=120, dpi_save=250, fontsize=10, figsize=(10, 10), facecolor=\"white\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca066698", + "metadata": {}, + "outputs": [], + "source": [ + "# use anndata generate by ..data_processing/inhouse_prior_graph_preprocessing.ipynb\n", + "unfilterd_adata = read_aws_h5ad(\"path to preprocessed h5ad\")\n", + "adata = filter_noisy_genes(unfilterd_adata)\n", + "adata.layers[\"logcounts\"] = adata.X.copy()\n", + "adata.X = adata.X.todense()\n", + "device = torch.device(\"cuda:0\")\n", + "gene_network = adata.uns[\"sparse_gene_network\"].todense()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "8fba4cc4-03d3-447a-924d-c878f9cd3c9d", + "metadata": {}, + "outputs": [], + "source": [ + "# powered perturbations\n", + "adata.obs[\"condition\"] = adata.obs[\"condition\"].astype(str)\n", + "adata.obs[\"Treatment\"] = 
adata.obs[\"Treatment\"].astype(str)\n", + "adata.obs[\"pert_treat\"] = adata.obs[\"condition\"] + \"+\" + adata.obs[\"Treatment\"]\n", + "obs_df = pd.DataFrame(adata.obs[\"pert_treat\"])\n", + "category_counts = obs_df[\"pert_treat\"].value_counts()\n", + "filtered_categories = category_counts[category_counts >= 50].index\n", + "adata = adata[adata.obs[\"pert_treat\"].isin(filtered_categories)]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "cbb5ef4f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "View of AnnData object with n_obs × n_vars = 65648 × 4997\n", + " obs: 'num_features', 'feature_call', 'num_umis', 'target_gene_name', 'SampleIndex', 'ssid', 'Treatment', 'assigned_archetype', 'node_centrality', 'clusters', 'condition', 'control', 'pert_treat'\n", + " var: 'gene_symbol', 'feature_types', 'genome', 'gene_id', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'gene_symbols'\n", + " uns: 'hvg', 'metadata', 'obsm_annot', 'sparse_gene_network', 'varm_annot'\n", + " obsm: 'ACTION', 'ACTION_B', 'ACTION_normalized', 'C_stacked', 'C_unified', 'H_stacked', 'H_unified', 'X_ACTIONet2D', 'X_ACTIONet3D', 'X_ACTIONred', 'X_denovo_color', 'archetype_footprint'\n", + " varm: 'ACTION_A', 'ACTION_V', 'unified_feature_profile', 'unified_feature_specificity'\n", + " layers: 'counts', 'logcounts'\n", + " obsp: 'ACTIONet'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb6a0457", + "metadata": {}, + "outputs": [], + "source": [ + "adata.X = adata.layers[\"counts\"].todense()\n", + "sc.pp.normalize_total(adata, target_sum=1e4)\n", + "adata.X = np.array(adata.X)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "afef639c", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_13522/996553587.py:1: 
ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n", + " adata.obs['cell_types'] = ['A549' for _ in range(adata.shape[0])]\n" + ] + } + ], + "source": [ + "adata.obs[\"cell_types\"] = [\"A549\" for _ in range(adata.shape[0])]" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "e9d26f2a", + "metadata": {}, + "outputs": [], + "source": [ + "train_idx, val_idx, test_idx = generate_k_fold(\n", + " adata, adata.X, adata.obs[\"condition\"], fold_idx=4\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "cc6ed3ff", + "metadata": {}, + "outputs": [], + "source": [ + "adata_train = ad.AnnData(np.array(adata[train_idx].X))\n", + "adata_train.obs[\"condition\"] = list(adata[train_idx].obs[\"condition\"])\n", + "adata_train.obs[\"Treatment\"] = list(adata[train_idx].obs[\"Treatment\"])\n", + "adata_train.obs[\"cell_types\"] = [\"A549\" for _ in range(adata_train.shape[0])]\n", + "adata_test = ad.AnnData(np.array(adata[test_idx].X))\n", + "adata_test.obs[\"condition\"] = list(adata[test_idx].obs[\"condition\"])\n", + "adata_test.obs[\"Treatment\"] = list(adata[test_idx].obs[\"Treatment\"])\n", + "adata_test.obs[\"cell_types\"] = [\"A549\" for _ in range(adata_test.shape[0])]\n", + "# for scETM, subset to TNFA+ for better signal\n", + "adata_train = adata_train[adata_train.obs[\"Treatment\"] == \"TNFA+\"]\n", + "adata_test = adata_test[adata_test.obs[\"Treatment\"] == \"TNFA+\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "019763d8", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2024-11-13 00:25:06,897] INFO - scETM.src.scETM.logging_utils: scETM.__init__(4997, 151, n_topics = 200, trainable_gene_emb_dim = 400)\n", + "[2024-11-13 00:25:06,927] INFO - scETM.src.scETM.logging_utils: UnsupervisedTrainer.__init__(scETM(\n", + " (q_delta): Sequential(\n", + " (0): Linear(in_features=4997, out_features=128, 
bias=True)\n", + " (1): ReLU()\n", + " (2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (3): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (mu_q_delta): Linear(in_features=128, out_features=200, bias=True)\n", + " (logsigma_q_delta): Linear(in_features=128, out_features=200, bias=True)\n", + " (rho_trainable_emb): PartlyTrainableParameter2D(height=400, fixed=0, trainable=4997)\n", + "), View of AnnData object with n_obs × n_vars = 25701 × 4997\n", + " obs: 'condition', 'Treatment', 'cell_types', test_ratio = 0.2, seed = 0)\n", + "[2024-11-13 00:25:06,928] INFO - scETM.src.scETM.trainers.trainer_utils: Set seed to 0.\n", + "[2024-11-13 00:25:06,940] INFO - scETM.src.scETM.trainers.trainer_utils: Keeping 5140 cells (0.2) as test data.\n" + ] + } + ], + "source": [ + "inhouse_model = scETM(\n", + " adata_train.n_vars,\n", + " adata_train.obs.condition.nunique(),\n", + " n_topics=200,\n", + " trainable_gene_emb_dim=400,\n", + ")\n", + "trainer = UnsupervisedTrainer(inhouse_model, adata_train, test_ratio=0.2, seed=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4594136c", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import time\n", + "\n", + "start = time.time()\n", + "trainer.train(\n", + " n_epochs=12000,\n", + " eval_every=2000,\n", + " batch_col=\"condition\",\n", + " eval_kwargs=dict(batch_col=\"condition\"),\n", + " save_model_ckpt=False,\n", + ")\n", + "end = time.time()\n", + "print(f\"Training time: {end-start}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe4981b6", + "metadata": {}, + "outputs": [], + "source": [ + "# retrieve reconstructed gene expression\n", + "recon = []\n", + "theta = []\n", + "for i in range(2):\n", + " adata_sub = adata_test[i * 10000 : min((i + 1) * 10000, len(adata))]\n", + " sampler = CellSampler(\n", + " adata_sub, 10000, sample_batch_id=True, n_epochs=1, batch_col=\"condition\"\n", + " )\n", + " 
dataloader = iter(sampler)\n", + " data_dict = {k: v.to(torch.device(\"cuda:0\")) for k, v in next(dataloader).items()}\n", + " out = inhouse_model.forward(data_dict=data_dict, hyper_param_dict={\"decode\": True})\n", + " recon.append(out[\"recon_log\"].clone().detach().cpu().numpy())\n", + " theta.append(out[\"theta\"].clone().detach().cpu().numpy())\n", + "all_recon = np.concatenate(recon)\n", + "all_theta = np.concatenate(theta)\n", + "assert len(adata_test) == all_recon.shape[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "adff89de", + "metadata": {}, + "outputs": [], + "source": [ + "# save model parameters\n", + "adata_test.uns[\"topics\"] = inhouse_model.alpha.clone().detach().cpu().numpy()\n", + "adata_test.uns[\"gene_emb\"] = (\n", + " inhouse_model.rho_trainable_emb.trainable.clone().detach().cpu().numpy()\n", + ")\n", + "adata_test.uns[\"cell_emb\"] = all_theta\n", + "adata_test.uns[\"recon\"] = all_recon" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bbade626", + "metadata": {}, + "outputs": [], + "source": [ + "# save to s3\n", + "write_adata_to_s3(\n", + " s3_url=\"s3://pert-spectra/scETM_checkpoints/scETM_inhouse/\",\n", + " adata_name=\"fold_4\",\n", + " adata=adata_test,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5dc11d8d-6e5b-4073-b923-f43c69bc22c0", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pertspectra", + "language": "python", + "name": "pertspectra" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/scETM/scetm_norman.ipynb b/scETM/scetm_norman.ipynb new file mode 100644 index 0000000..07827ae --- /dev/null +++ 
b/scETM/scetm_norman.ipynb @@ -0,0 +1,269 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "41d53f3e", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6e32fbd", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "import anndata as ad\n", + "import numpy as np\n", + "import pandas as pd\n", + "import scanpy as sc\n", + "import torch\n", + "from scetm_utils import read_aws_h5ad\n", + "\n", + "from scETM import UnsupervisedTrainer, scETM\n", + "from scETM.batch_sampler import CellSampler\n", + "\n", + "sys.path.append(\"..\")\n", + "from utils import (\n", + " filter_noisy_genes,\n", + " generate_k_fold,\n", + " write_adata_to_s3,\n", + ")\n", + "\n", + "sc.set_figure_params(\n", + " dpi=120, dpi_save=250, fontsize=10, figsize=(10, 10), facecolor=\"white\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca066698", + "metadata": {}, + "outputs": [], + "source": [ + "# use anndata generate by ..data_processing/norman_prior_graph_preprocessing.ipynb\n", + "unfiltered_adata = read_aws_h5ad(\"path to h5ad\")\n", + "adata = filter_noisy_genes(unfiltered_adata)\n", + "adata.layers[\"logcounts\"] = adata.X.copy()\n", + "adata.X = adata.X.todense()\n", + "device = torch.device(\"cuda:0\")\n", + "gene_network = adata.uns[\"sparse_gene_network\"].todense()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "03d7f2d0", + "metadata": {}, + "outputs": [], + "source": [ + "# subset to powered perturbations\n", + "obs_df = pd.DataFrame(adata.obs[\"perturbation_name\"])\n", + "category_counts = obs_df[\"perturbation_name\"].value_counts()\n", + "filtered_categories = category_counts[category_counts >= 50].index\n", + "adata = adata[adata.obs[\"perturbation_name\"].isin(filtered_categories)]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": 
"eb6a0457", + "metadata": {}, + "outputs": [], + "source": [ + "adata.X = adata.layers[\"counts\"].todense()\n", + "sc.pp.normalize_total(adata, target_sum=1e4)\n", + "adata.X = np.array(adata.X)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "ad199270", + "metadata": {}, + "outputs": [], + "source": [ + "train_idx, val_idx, test_idx = generate_k_fold(\n", + " adata,\n", + " adata.X,\n", + " adata.obs[\"perturbation_name\"],\n", + " fold_idx=4,\n", + " perturbation_key=\"perturbation_name\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "cf5abfb8", + "metadata": {}, + "outputs": [], + "source": [ + "adata_train = ad.AnnData(np.array(adata[train_idx].X))\n", + "adata_train.obs[\"perturbation_name\"] = list(adata[train_idx].obs[\"perturbation_name\"])\n", + "adata_train.obs[\"cell_types\"] = [\"K562\" for _ in range(adata_train.shape[0])]\n", + "adata_test = ad.AnnData(np.array(adata[test_idx].X))\n", + "adata_test.obs[\"perturbation_name\"] = list(adata[test_idx].obs[\"perturbation_name\"])\n", + "adata_test.obs[\"cell_types\"] = [\"K562\" for _ in range(adata_test.shape[0])]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "019763d8", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2024-11-13 07:17:51,864] INFO - scETM.src.scETM.logging_utils: scETM.__init__(4990, 210, n_topics = 200, trainable_gene_emb_dim = 400)\n", + "[2024-11-13 07:17:52,037] INFO - scETM.src.scETM.logging_utils: UnsupervisedTrainer.__init__(scETM(\n", + " (q_delta): Sequential(\n", + " (0): Linear(in_features=4990, out_features=128, bias=True)\n", + " (1): ReLU()\n", + " (2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (3): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (mu_q_delta): Linear(in_features=128, out_features=200, bias=True)\n", + " (logsigma_q_delta): Linear(in_features=128, out_features=200, bias=True)\n", + " 
(rho_trainable_emb): PartlyTrainableParameter2D(height=400, fixed=0, trainable=4990)\n", + "), AnnData object with n_obs × n_vars = 64585 × 4990\n", + " obs: 'perturbation_name', 'cell_types', test_ratio = 0.1, seed = 0)\n", + "[2024-11-13 07:17:52,038] INFO - scETM.src.scETM.trainers.trainer_utils: Set seed to 0.\n", + "[2024-11-13 07:17:52,059] INFO - scETM.src.scETM.trainers.trainer_utils: Keeping 6458 cells (0.1) as test data.\n" + ] + } + ], + "source": [ + "norman_model = scETM(\n", + " adata_train.n_vars,\n", + " adata_train.obs.perturbation_name.nunique(),\n", + " n_topics=200,\n", + " trainable_gene_emb_dim=400,\n", + ")\n", + "trainer = UnsupervisedTrainer(norman_model, adata_train, test_ratio=0.1, seed=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4594136c", + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "\n", + "start = time.time()\n", + "trainer.train(\n", + " n_epochs=12000,\n", + " eval_every=2000,\n", + " batch_col=\"perturbation_name\",\n", + " eval_kwargs=dict(batch_col=\"perturbation_name\"),\n", + " save_model_ckpt=False,\n", + ")\n", + "end = time.time()\n", + "print(f\"Training time: {end-start}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5566fa6", + "metadata": {}, + "outputs": [], + "source": [ + "# retrieve reconstructed gene expression\n", + "recon = []\n", + "theta = []\n", + "for i in range(3):\n", + " adata_sub = adata_test[i * 10000 : min((i + 1) * 10000, len(adata))]\n", + " sampler = CellSampler(\n", + " adata_sub,\n", + " 10000,\n", + " sample_batch_id=True,\n", + " n_epochs=1,\n", + " batch_col=\"perturbation_name\",\n", + " )\n", + " dataloader = iter(sampler)\n", + " data_dict = {k: v.to(torch.device(\"cuda:0\")) for k, v in next(dataloader).items()}\n", + " out = norman_model.forward(data_dict=data_dict, hyper_param_dict={\"decode\": True})\n", + " recon.append(out[\"recon_log\"].clone().detach().cpu().numpy())\n", + " 
theta.append(out[\"theta\"].clone().detach().cpu().numpy())\n", + "all_recon = np.concatenate(recon)\n", + "all_theta = np.concatenate(theta)\n", + "assert len(adata_test) == all_recon.shape[0]\n", + "assert len(adata_test) == all_theta.shape[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "adff89de", + "metadata": {}, + "outputs": [], + "source": [ + "# save model parameters\n", + "adata_test.uns[\"topics\"] = norman_model.alpha.clone().detach().cpu().numpy()\n", + "adata_test.uns[\"gene_emb\"] = (\n", + " norman_model.rho_trainable_emb.trainable.clone().detach().cpu().numpy()\n", + ")\n", + "adata_test.uns[\"cell_emb\"] = all_theta\n", + "adata_test.uns[\"recon\"] = all_recon\n", + "adata_test.X = np.array(adata_test.X)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "267de607", + "metadata": {}, + "outputs": [], + "source": [ + "# save to s3\n", + "write_adata_to_s3(\n", + " s3_url=\"s3://pert-spectra/scETM_checkpoints/scETM_norman/\",\n", + " adata_name=\"fold_4\",\n", + " adata=adata_test,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b84f007a", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pertspectra", + "language": "python", + "name": "pertspectra" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/scETM/scetm_replogle.ipynb b/scETM/scetm_replogle.ipynb new file mode 100644 index 0000000..064fe88 --- /dev/null +++ b/scETM/scetm_replogle.ipynb @@ -0,0 +1,267 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "41d53f3e", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] 
+ }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6e32fbd", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "import anndata as ad\n", + "import numpy as np\n", + "import pandas as pd\n", + "import scanpy as sc\n", + "import torch\n", + "from scetm_utils import read_aws_h5ad\n", + "\n", + "from scETM import UnsupervisedTrainer, scETM\n", + "from scETM.batch_sampler import CellSampler\n", + "\n", + "sys.path.append(\"..\")\n", + "from utils import filter_noisy_genes, split_data_by_cell, write_adata_to_s3\n", + "\n", + "sc.set_figure_params(\n", + " dpi=120, dpi_save=250, fontsize=10, figsize=(10, 10), facecolor=\"white\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca066698", + "metadata": {}, + "outputs": [], + "source": [ + "# use anndata generate by ..data_processing/replogle_prior_graph_preprocessing.ipynb\n", + "unfiltered_adata = read_aws_h5ad(\"path to h5ad\")\n", + "adata = filter_noisy_genes(unfiltered_adata)\n", + "adata.layers[\"logcounts\"] = adata.X.copy()\n", + "adata.X = adata.X.todense()\n", + "device = torch.device(\"cuda:0\")\n", + "gene_network = adata.uns[\"sparse_gene_network\"].todense()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "03d7f2d0", + "metadata": {}, + "outputs": [], + "source": [ + "# filter adata to perturbations with at least 50 samples\n", + "obs_df = pd.DataFrame(adata.obs[\"gene\"])\n", + "category_counts = obs_df[\"gene\"].value_counts()\n", + "filtered_categories = category_counts[category_counts >= 50].index\n", + "adata = adata[adata.obs[\"gene\"].isin(filtered_categories)]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "0f15f5b5-aead-49bb-98ca-6926a3791ec4", + "metadata": {}, + "outputs": [], + "source": [ + "# reference the svae filtered replogle anndata to subset to those cells (see ../data for instructions on generating the anndata object)\n", + "filtered_replogle = read_aws_h5ad(\"path to 
svae filtered replogle h5ad\")\n", + "filtered_perts = set(filtered_replogle.obs[\"gene\"].unique()).union(\n", + " set([\"SKP2\", \"CUL1\", \"UBE2N\"])\n", + ")\n", + "adata = adata[adata.obs[\"gene\"].isin(filtered_perts)]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "eb6a0457", + "metadata": {}, + "outputs": [], + "source": [ + "adata.X = adata.layers[\"counts\"]\n", + "sc.pp.normalize_total(adata, target_sum=1e4)\n", + "adata.X = np.asarray(adata.X)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "281e46e2", + "metadata": {}, + "outputs": [], + "source": [ + "train_idx, val_idx, test_idx = split_data_by_cell(\n", + " adata.X, adata.obs[\"gene\"], test_size=0.2, val_size=0.2\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "038e5082", + "metadata": {}, + "outputs": [], + "source": [ + "adata_train = ad.AnnData(np.array(adata[train_idx].X))\n", + "adata_train.obs[\"gene\"] = list(adata[train_idx].obs[\"gene\"])\n", + "adata_train.obs[\"cell_types\"] = [\"K562\" for _ in range(adata_train.shape[0])]\n", + "adata_test = ad.AnnData(np.array(adata[test_idx].X))\n", + "adata_test.obs[\"gene\"] = list(adata[test_idx].obs[\"gene\"])\n", + "adata_test.obs[\"cell_types\"] = [\"K562\" for _ in range(adata_test.shape[0])]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "019763d8", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2024-11-13 07:15:56,365] INFO - scETM.src.scETM.logging_utils: scETM.__init__(4935, 517, n_topics = 200, trainable_gene_emb_dim = 400)\n", + "[2024-11-13 07:15:57,358] INFO - scETM.src.scETM.logging_utils: UnsupervisedTrainer.__init__(scETM(\n", + " (q_delta): Sequential(\n", + " (0): Linear(in_features=4935, out_features=128, bias=True)\n", + " (1): ReLU()\n", + " (2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (3): Dropout(p=0.1, inplace=False)\n", + " 
)\n", + " (mu_q_delta): Linear(in_features=128, out_features=200, bias=True)\n", + " (logsigma_q_delta): Linear(in_features=128, out_features=200, bias=True)\n", + " (rho_trainable_emb): PartlyTrainableParameter2D(height=400, fixed=0, trainable=4935)\n", + "), AnnData object with n_obs × n_vars = 67803 × 4935\n", + " obs: 'gene', 'cell_types', test_ratio = 0.1, seed = 0)\n", + "[2024-11-13 07:15:57,361] INFO - scETM.src.scETM.trainers.trainer_utils: Set seed to 0.\n", + "[2024-11-13 07:15:57,422] INFO - scETM.src.scETM.trainers.trainer_utils: Keeping 6780 cells (0.1) as test data.\n" + ] + } + ], + "source": [ + "model = scETM(\n", + " adata_train.n_vars,\n", + " adata_train.obs.gene.nunique(),\n", + " n_topics=200,\n", + " trainable_gene_emb_dim=400,\n", + ")\n", + "trainer = UnsupervisedTrainer(model, adata_train, test_ratio=0.1, seed=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e8b62a5-7a64-438b-aeec-6519faed6dbe", + "metadata": {}, + "outputs": [], + "source": [ + "trainer.train(\n", + " n_epochs=12000,\n", + " eval_every=2000,\n", + " batch_col=\"gene\",\n", + " eval_kwargs=dict(batch_col=\"gene\"),\n", + " save_model_ckpt=False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "15d29ee4", + "metadata": {}, + "outputs": [], + "source": [ + "# retrieve reconstructed gene expression\n", + "recon = []\n", + "theta = []\n", + "for i in range(3):\n", + " adata_sub = adata_test[i * 10000 : min((i + 1) * 10000, len(adata))]\n", + " sampler = CellSampler(\n", + " adata_sub, 10000, sample_batch_id=True, n_epochs=1, batch_col=\"gene\"\n", + " )\n", + " dataloader = iter(sampler)\n", + " data_dict = {k: v.to(torch.device(\"cuda:0\")) for k, v in next(dataloader).items()}\n", + " out = model.forward(data_dict=data_dict, hyper_param_dict={\"decode\": True})\n", + " recon.append(out[\"recon_log\"].clone().detach().cpu().numpy())\n", + " theta.append(out[\"theta\"].clone().detach().cpu().numpy())\n", + "all_recon 
def _download_s3_object(s3_url):
    """Download the object at *s3_url* into the local cache directory.

    The file is saved under ``SPECTRA_DEFAULT_DIR`` using the last path
    component of the URL as its file name; the directory is created if
    needed.

    Parameters
    ----------
    s3_url : str
        URL of the form ``s3://<bucket>/<key>``.

    Returns
    -------
    str
        Local path of the downloaded file.

    Raises
    ------
    botocore.exceptions.ClientError
        Propagated from ``download_file`` (e.g. when the object is missing).
    """
    save_path = os.path.join(SPECTRA_DEFAULT_DIR, s3_url.split("/")[-1])
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    # Get the bucket name and key from the s3 url
    bucket_name, key = s3_url.removeprefix("s3://").split("/", 1)

    s3 = boto3.resource("s3")
    s3.Object(bucket_name=bucket_name, key=key).download_file(save_path)
    return save_path


def _report_download_failure(s3_url, error):
    """Print why a download failed; callers then return ``None``."""
    if error.response["Error"]["Code"] == "404":
        print("object does not exist")
    else:
        # Previously swallowed silently; surface it so a permissions or
        # network problem is not mistaken for a missing object.
        print(f"failed to download {s3_url}: {error}")


def read_aws_h5ad(s3_url):
    """Download an ``.h5ad`` file from S3 and load it with ``anndata``.

    Download errors are not caught here and propagate to the caller.
    """
    return ad.read_h5ad(_download_s3_object(s3_url))


def read_aws_csv(s3_url, sep=","):
    """Download a delimited text file from S3 and load it as a DataFrame.

    Parameters
    ----------
    s3_url : str
        URL of the form ``s3://<bucket>/<key>``.
    sep : str
        Field separator forwarded to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame or None
        ``None`` when the download raises a ``ClientError``.
    """
    try:
        save_path = _download_s3_object(s3_url)
    except botocore.exceptions.ClientError as e:
        _report_download_failure(s3_url, e)
        return None
    return pd.read_csv(save_path, sep=sep)


def read_aws_npz(s3_url, sep=","):
    """Download a NumPy archive from S3 and load it with ``numpy.load``.

    Parameters
    ----------
    s3_url : str
        URL of the form ``s3://<bucket>/<key>``.
    sep : str
        Unused; kept so existing callers that pass it keep working.

    Returns
    -------
    object or None
        Whatever ``numpy.load`` returns for the file, or ``None`` when the
        download raises a ``ClientError``.
    """
    try:
        save_path = _download_s3_object(s3_url)
    except botocore.exceptions.ClientError as e:
        _report_download_failure(s3_url, e)
        return None
    # Was ``np.laod`` (typo), which raised AttributeError on every call.
    return np.load(save_path)
Supervised discovery of interpretable gene programs from single-cell data. Nature Biotechnology, 42:1084—-1095, 2024. +``` +The codebase for the model is modified from [Spectra](https://github.com/dpeerlab/spectra/). diff --git a/src/Spectra/Spectra_Pert.py b/src/Spectra/Spectra_Pert.py new file mode 100644 index 0000000..9f1a929 --- /dev/null +++ b/src/Spectra/Spectra_Pert.py @@ -0,0 +1,1459 @@ +from collections import OrderedDict + +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +from opt_einsum import contract +from torch.distributions.normal import Normal +from torch.utils.data import DataLoader, TensorDataset +from tqdm import tqdm + +from . import Spectra_util # noqa +from .initialization import compute_init_scores, compute_init_scores_noct + + +class SPECTRA(nn.Module): + """ + + Parameters + ---------- + X : np.ndarray or torch.Tensor + the ``(n, p)`` -shaped matrix containing logged expression count data. Used + for initialization of + self.n and self.p but not stored as an attribute + labels : np.ndarray or NoneType + the ``(n, )`` -shaped array containing cell type labels. If use_cell_types == + False, then should + be set to None + + L : dict or OrderedDict [if use_cell_types == False, then int] + ``number of cell types + 1``-shaped dictionary. Must have "global" as a key, + indicating the number + of global factors + { + "global": 15, + "CD8": 5 + ... + } + > Format matches output of K_est.py to estimate the number of + > Must match cell type labels provided during training + > Recommended practice is to assign at minimum 2 factors per cell type + > Note that L contains the number of factors that describe the graph. + adj_matrix : dict or OrderedDict + ``a dictionary of adjacency matrices, one for every cell type + a "global" + { + "global": ``(p, p)``-shaped binary np.ndarray + "CD8": ... 
+ + } + weights : dict or OrderedDict or NoneType [if use_cell_types == False, then + ``(p, p)``-shaped array] + the ``(p, p)``-shaped set of edge weights per . If weight[i,j] is non-zero + when adj_matrix[i,j] = 0 + this weight is ignored. + + if weights == None, no weights are used + lam : float + lambda parameter of the model, which controls the relative influence of the + graph vs expression + loss functions. This term multiplies the expression loss, so smaller values + of lambda upweight the prior information + delta : float + delta parameter of the model, which controls a lower bound for gene scaling + factors. If delta is small then the maximum ratio between gene scaling factors + is larger and lowly expressed genes can be put on the same scale as highly + expressed genes. + kappa : float or NoneType + kappa controls the background rate of edges in the graph. if kappa is a float, + kappa is fixed to the given float value. If kappa == None, then kappa is a + parameter that is estimated from the data. + rho : float or NoneType + rho controls the bakcground rate of non-edges in the graph. if rho is a float, + rho is fixed to + the given float value. If rho == None, then rho is a parameter that is estimated + from the data. + use_cell_types: bool + use_cell_types is a Boolean variable that determines whether cell type labels + are used to fit + the model. If False, then parameters are initialized as nn.Parameter rather + than as + nn.ParameterDict with cell type keys that index nn.Parameter values + determinant_penalty : float + determinant penalty affects the selection parameters that are fit when + L[cell_type] > + K[cell_type]. A determinantally regularized selection parameter is fit + with determinant + penalty that encourages sparsity and diversity. 
+ Attributes + ---------- + model.delta : delta parameter of the model + + model.lam : lambda parameter of the model + + model.determinant_penalty : determinant penalty of the model + + model.L : L parameter, either int, dict or OrderedDict() + + model.p : number of genes + + model.n : number of cells + + model.use_cell_types : if True then cell types are considered, else cell types + ignored. Affects + the dimensions of the initialized parameters. + + model.kappa : if not kappa, nn.ParameterDict() if use_cell_types, else + nn.Parameter(). If + kappa is a float, it is fixed throughout training + + model.rho : if not rho, nn.ParamterDict() if use_cell_types, else + nn.Parameter. If rho is a + float it is fixed throughout training + + model.adj_matrix : adjacency matrix with diagonal removed. dict containing + torch.Tensors + + model.adj_matrix_1m : 1 - adjacency matrix with diagonal removed. dict + containing torch.Tensors + + model.weights : contains edge weights. format matches adj_matrix + + model.cell_types : np.ndarray containing array of unique cell types + + model.cell_type_counts : dict {key = cell type, values = number of cells} + + model.theta : nn.ParameterDict() or nn.Parameter() containing the factor + weights + + model.alpha : nn.ParameterDict() or nn.Parameter() containing the cell + loadings + + model.eta : nn.ParameterDict() or nn.Parameter() containing the + interaction matrix between + factors + + model.gene_scaling : nn.ParameterDict() or nn.Parameter() containing + the gene scale factors + + model.selection : nn.ParameterDict() or nn.Parameter() containing + the attention weights. + Only initialized when L[cell_type] > K[cell_type] for some cell type + or when L > K and + use_cell_types == False + + model.kgeql_flag : dict or bool. dictionary of boolean values indicating + whether K >= L. + When use_cell_types == False, it is a boolean value + + Methods + ---------- + + model.loss(self, X, labels) : called by fit if use_cell_types = True. 
+ Evalutes the loss of + the model + + model.loss_no_cell_types(self,X) : called by fit if use_cell_types = + False. Evalutes the loss + of the model + + model.initialize(self, gene_sets,val) : initialize the model based + on given dictionary of gene + sets. val is a float that determines the strength of the initialization. + + model.initialize_no_celltypes(self, gs_list, val) : initialize the + model based on given list + of gene sets. val is a float that determines the strength of the + initialization. + + + To do: + __________ + + > Alternative initialization functions + + > comment SPECTRA-EM code + + > test lower bound constraint [see pyspade_global.py implementation] + + > Overlap threshold test statistic + + + """ + + def __init__( # noqa + self, + X, + labels, + adj_matrix, + L, + pert_idx, + pert_labels=None, + weights=None, + lam=0.01, + psi=0.01, + delta=0.001, + kappa=None, + rho=0.001, + use_cell_types=True, + device=torch.device("cuda:0"), + ): + super(SPECTRA, self).__init__() + + # hyperparameters + self.delta = delta + self.lam = lam + self.psi = psi + self.L = L + # for memory efficiency we don't store X in the object attributes, but require X + # dimensions to + # be known at initialization + self.p = X.shape[1] + self.n = X.shape[0] + self.use_cell_types = use_cell_types + self.device = device + + self.pert_idx = pert_idx + self.pert_labels = pert_labels + # add one dim for ctrl one-hot + self.n_p = len(pert_idx) + + if not use_cell_types: + # check that L is an int + assert isinstance(self.L, int) + + # trust the user to input a np.ndarray for adj_matrix + self.adj_matrix = torch.Tensor(adj_matrix).to(self.device) - torch.Tensor( + np.diag(np.diag(adj_matrix)) + ).to(self.device) + adj_matrix_1m = 1.0 - adj_matrix + self.adj_matrix_1m = torch.Tensor( + adj_matrix_1m - np.diag(np.diag(adj_matrix_1m)) + ).to(self.device) + if weights is not None: + self.weights = torch.Tensor(weights).to(self.device) - torch.Tensor( + np.diag(np.diag(adj_matrix)) 
+ ).to(self.device) + else: + self.weights = self.adj_matrix + + self.theta = nn.Parameter(Normal(0.0, 1.0).sample([self.p, self.L])) + self.alpha = nn.Parameter(Normal(0.0, 1.0).sample([self.n_p, self.L])) + self.eta = nn.Parameter(Normal(0.0, 1.0).sample([self.L, self.L])) + self.gene_scaling = nn.Parameter(torch.zeros(self.p)) + + if kappa is None: + self.kappa = nn.Parameter(Normal(0.0, 1.0).sample()) + else: + self.kappa = torch.tensor(np.log(kappa / (1 - kappa))).to(self.device) + if rho is None: + self.rho = nn.Parameter(Normal(0.0, 1.0).sample()) + else: + self.rho = torch.tensor(np.log(rho / (1 - rho))).to(self.device) + + if use_cell_types: + # convert adjacency matrices to pytorch tensors to make optimization easier later + self.adj_matrix = { + cell_type: ( + torch.Tensor(adj_matrix[cell_type]).to(self.device) + - torch.Tensor(np.diag(np.diag(adj_matrix[cell_type]))).to( + self.device + ) + if len(adj_matrix[cell_type]) > 0 + else [] + ) + for cell_type in adj_matrix.keys() + } + # for convenience store 1 - adjacency matrix elements [except on diagonal, + # where we store 0] + adj_matrix_1m = { + cell_type: ( + 1.0 - adj_matrix[cell_type] + if len(adj_matrix[cell_type]) > 0 + else [] + ) + for cell_type in adj_matrix.keys() + } # one adj_matrix per cell type + self.adj_matrix_1m = { + cell_type: ( + torch.Tensor( + adj_matrix_1m[cell_type] + - np.diag(np.diag(adj_matrix_1m[cell_type])) + ).to(self.device) + if len(adj_matrix_1m[cell_type]) > 0 + else [] + ) + for cell_type in adj_matrix_1m.keys() + } # one adj_matrix per cell type + + # if weights are provided, convert these to tensors, else set weights = to + # adjacency matrices + if weights: + self.weights = { + cell_type: ( + torch.Tensor(weights[cell_type]).to(self.device) + - torch.Tensor(np.diag(np.diag(weights[cell_type]))).to( + self.device + ) + if len(weights[cell_type]) > 0 + else [] + ) + for cell_type in weights.keys() + } + else: + self.weights = self.adj_matrix + + self.cell_types = 
np.unique( + labels + ) # cell types are the unique labels, again require knowledge of labels at + # initialization + # but do not store them + + # store a dictionary containing the counts of each cell type + self.cell_type_counts = {} + for cell_type in self.cell_types: + n_c = sum(labels == cell_type) + self.cell_type_counts[cell_type] = n_c + + # initialize parameters randomly, we use torch's ParameterDict() for storage + # for intuitive + # accessing cell type specific parameters + self.theta = nn.ParameterDict() + self.alpha = nn.ParameterDict() + self.eta = nn.ParameterDict() + self.gene_scaling = nn.ParameterDict() + + if kappa is None: + self.kappa = nn.ParameterDict() + if rho is None: + self.rho = nn.ParameterDict() + # initialize global params + self.theta["global"] = nn.Parameter( + Normal(0.0, 1.0).sample([self.p, self.L["global"]]) + ) + self.eta["global"] = nn.Parameter( + Normal(0.0, 1.0).sample([self.L["global"], self.L["global"]]) + ) + self.gene_scaling["global"] = nn.Parameter( + Normal(0.0, 1.0).sample([self.p]) + ) + if kappa is None: + self.kappa["global"] = nn.Parameter(Normal(0.0, 1.0).sample()) + if rho is None: + self.rho["global"] = nn.Parameter(Normal(0.0, 1.0).sample()) + + # initialize all cell type specific params + for cell_type in self.cell_types: + self.theta[cell_type] = nn.Parameter( + Normal(0.0, 1.0).sample([self.p, self.L[cell_type]]) + ) + self.eta[cell_type] = nn.Parameter( + Normal(0.0, 1.0).sample([self.L[cell_type], self.L[cell_type]]) + ) + n_c = sum(labels == cell_type) + self.alpha[cell_type] = nn.Parameter( + Normal(0.0, 1.0).sample( + [self.n_p, self.L["global"] + self.L[cell_type]] + ) + ) + self.gene_scaling[cell_type] = nn.Parameter( + Normal(0.0, 1.0).sample([self.p]) + ) + + if kappa is None: + self.kappa[cell_type] = nn.Parameter(Normal(0.0, 1.0).sample()) + + if rho is None: + self.rho[cell_type] = nn.Parameter(Normal(0.0, 1.0).sample()) + + # if kappa and rho are provided, hold these fixed during training, 
def loss(self, X, labels, loss_weights, D=None, forward=False):
    """Evaluate the model loss when cell-type labels are used.

    Parameters
    ----------
    X : torch.Tensor
        Expression matrix; rows are selected per cell type with
        ``labels == cell_type``.
    labels : array-like
        Cell-type label per row of ``X``.
    loss_weights : torch.Tensor
        Per-row weights applied to the reconstruction term.
    D : torch.Tensor
        Design matrix multiplied with ``exp(alpha)`` to obtain per-row
        loadings (presumably rows are cells and columns are perturbations
        -- confirm against the caller). Required despite the ``None``
        default, which is kept for signature compatibility.
    forward : bool
        If True, return only ``(term1_dict, recon_dict)``.

    Returns
    -------
    tuple
        ``(loss, term1_dict, term2_dict, term3_dict, term4_dict)``, or
        ``(term1_dict, recon_dict)`` when ``forward`` is True. The dicts are
        keyed by cell type (plus ``"global"`` for the graph terms). ``term4``
        is returned but not added to ``loss`` here.

    Raises
    ------
    ValueError
        If ``D`` is None.
    """
    assert self.use_cell_types  # if this is False, fail because model has not been initialized to use cell types
    if D is None:
        raise ValueError("D (perturbation design matrix) is required")
    X = X.to(self.device)
    D = D.to(self.device)
    loss_weights = loss_weights.to(self.device)

    # initialize loss and fetch global parameters
    loss = 0.0
    theta_global = torch.softmax(self.theta["global"], dim=1)
    eta_global = (self.eta["global"]).exp() / (1.0 + (self.eta["global"]).exp())
    eta_global = 0.5 * (eta_global + eta_global.T)
    gene_scaling_global = self.gene_scaling["global"].exp() / (
        1.0 + self.gene_scaling["global"].exp()
    )
    kappa_global = self.kappa["global"].exp() / (1 + self.kappa["global"].exp())
    rho_global = self.rho["global"].exp() / (1 + self.rho["global"].exp())
    # Does not depend on the cell type, so compute it once outside the loop.
    theta_global_ = contract(
        "jk,j->jk", theta_global, gene_scaling_global + self.delta
    )

    # Rows of alpha that correspond to control (pert_idx == -1), and the gene
    # indices of the remaining perturbations. Both are loop-invariant.
    ctrl_rows = {i for i, x in enumerate(self.pert_idx) if x == -1}
    pert_gene_idx = [x for x in self.pert_idx if x != -1]

    recon_dict = {}
    term1_dict = {}
    term2_dict = {}
    term3_dict = {}
    term4_dict = {}

    # loop through cell types and evaluate loss at every cell type
    for cell_type in self.cell_types:
        kappa = self.kappa[cell_type].exp() / (1 + self.kappa[cell_type].exp())
        rho = self.rho[cell_type].exp() / (1 + self.rho[cell_type].exp())
        gene_scaling_ct = self.gene_scaling[cell_type].exp() / (
            1.0 + self.gene_scaling[cell_type].exp()
        )
        in_cell_type = labels == cell_type
        X_c = X[in_cell_type]
        loss_weights_c = loss_weights[in_cell_type]
        adj_matrix = self.adj_matrix[cell_type]
        weights = self.weights[cell_type]
        adj_matrix_1m = self.adj_matrix_1m[cell_type]
        theta_ct = torch.softmax(self.theta[cell_type], dim=1)
        eta_ct = (self.eta[cell_type]).exp() / (1.0 + (self.eta[cell_type]).exp())
        eta_ct = 0.5 * (eta_ct + eta_ct.T)
        theta_ct_ = contract("jk,j->jk", theta_ct, gene_scaling_ct + self.delta)
        theta = torch.cat((theta_global_, theta_ct_), 1)
        alpha = torch.exp(self.alpha[cell_type])

        # terms for perturbation
        D_c = D[in_cell_type]
        p_alpha = D_c @ alpha
        recon = contract("ik,jk->ij", p_alpha, theta)
        term1 = (
            -1.0
            * ((torch.xlogy(X_c, recon) - recon) * loss_weights_c[:, None]).sum()
        )

        term2 = 0.0
        term3 = 0.0
        term4 = 0.0
        if len(adj_matrix) > 0:
            mat = contract("il,lj,kj->ik", theta_ct, eta_ct, theta_ct)
            term2 = (
                -1.0
                * (
                    torch.xlogy(
                        adj_matrix * weights,
                        (1.0 - rho) * (1.0 - kappa) * mat + (1.0 - rho) * kappa,
                    )
                ).sum()
            )
            term3 = (
                -1.0
                * (
                    torch.xlogy(
                        adj_matrix_1m,
                        (1.0 - kappa) * (1.0 - rho) * (1.0 - mat) + rho,
                    )
                ).sum()
            )
            # perturbation autocorrelation term
            # include only perturbations that are in the graph
            if D.shape[0] > 0:
                alpha_subset_idx = [
                    i for i in range(alpha.shape[0]) if i not in ctrl_rows
                ]
                # ``mat`` is gene-by-gene, so it is indexed with the gene
                # indices stored in pert_idx, as loss_no_cell_types does.
                # Indexing by list position (the previous behaviour) picks
                # the first few genes regardless of which were perturbed.
                term4 = (
                    -1
                    * Spectra_util.geary_autocorrelation_multivariate(
                        mat[pert_gene_idx][:, pert_gene_idx],
                        alpha[alpha_subset_idx],
                    )["stat"]
                )
        recon_dict[cell_type] = recon.clone()
        term1_dict[cell_type] = term1
        term2_dict[cell_type] = term2
        term3_dict[cell_type] = term3
        term4_dict[cell_type] = term4
        loss = (
            loss
            + self.lam * term1
            + (self.cell_type_counts[cell_type] / float(self.n)) * (term2 + term3)
        )

    # compute loss associated with global graph
    adj_matrix = self.adj_matrix["global"]
    adj_matrix_1m = self.adj_matrix_1m["global"]
    weights = self.weights["global"]

    if len(adj_matrix) > 0:
        mat = contract(
            "il,lj,kj->ik",
            theta_global,
            eta_global,
            theta_global,
        )
        term2 = (
            -1.0
            * (
                torch.xlogy(
                    adj_matrix * weights,
                    (1.0 - rho_global) * (1.0 - kappa_global) * mat
                    + (1.0 - rho_global) * kappa_global,
                )
            ).sum()
        )
        term3 = (
            -1.0
            * (
                torch.xlogy(
                    adj_matrix_1m,
                    (1.0 - kappa_global) * (1.0 - rho_global) * (1.0 - mat)
                    + rho_global,
                )
            ).sum()
        )
        # Only record/add the global terms when they were actually computed,
        # so stale per-cell-type values are never reused for "global".
        term2_dict["global"] = term2
        term3_dict["global"] = term3
        loss = loss + term2 + term3

    if forward:
        return term1_dict, recon_dict
    # returns loss, recon, graph likelihoods, and autocorr
    return loss, term1_dict, term2_dict, term3_dict, term4_dict


def loss_no_cell_types(self, X, loss_weights, D=None, forward=False):
    """Evaluate the model loss when cell-type labels are not used.

    Parameters
    ----------
    X : torch.Tensor
        Expression matrix.
    loss_weights : torch.Tensor
        Per-row weights applied to the reconstruction term.
    D : torch.Tensor
        Design matrix multiplied with ``exp(alpha)``. Required despite the
        ``None`` default, which is kept for signature compatibility.
    forward : bool
        If True, return only ``(term1, recon)``.

    Returns
    -------
    tuple
        ``(loss, term1, term2, term3, term4)``, or ``(term1, recon)`` when
        ``forward`` is True. ``loss`` is ``lam * term1 + term2 + term3``;
        ``term4`` is returned separately and not added here.

    Raises
    ------
    ValueError
        If ``D`` is None.
    """
    assert not self.use_cell_types  # if this is True, just fail
    if D is None:
        raise ValueError("D (perturbation design matrix) is required")
    X = X.to(self.device)
    D = D.to(self.device)
    loss_weights = loss_weights.to(self.device)

    theta = torch.softmax(self.theta, dim=1)
    eta = self.eta.exp() / (1.0 + (self.eta).exp())
    eta = 0.5 * (eta + eta.T)
    gene_scaling = self.gene_scaling.exp() / (1.0 + self.gene_scaling.exp())
    kappa = self.kappa.exp() / (1 + self.kappa.exp())
    rho = (self.rho.exp() / (1 + self.rho.exp())).to(self.device)
    alpha = torch.exp(self.alpha)

    adj_matrix = self.adj_matrix.to(self.device)
    weights = self.weights.to(self.device)
    adj_matrix_1m = self.adj_matrix_1m.to(self.device)
    theta_ = contract("jk,j->jk", theta, gene_scaling + self.delta)

    p_alpha = D @ alpha
    recon = contract("ik,jk->ij", p_alpha, theta_)
    term1 = -1.0 * ((torch.xlogy(X, recon) - recon) * loss_weights[:, None]).sum()

    # Default every graph term to zero so all of them are defined on every
    # path (previously term4 was unbound when the graph was present but D
    # had no rows).
    term2 = 0.0
    term3 = 0.0
    term4 = 0.0
    if len(adj_matrix) > 0:
        mat = contract("il,lj,kj->ik", theta, eta, theta)
        term2 = (
            -1.0
            * (
                torch.xlogy(
                    adj_matrix * weights,
                    (1.0 - rho) * (1.0 - kappa) * mat + (1.0 - rho) * kappa,
                )
            ).sum()
        )
        term3 = (
            -1.0
            * (
                torch.xlogy(
                    adj_matrix_1m, (1.0 - kappa) * (1.0 - rho) * (1.0 - mat) + rho
                )
            ).sum()
        )
        if D.shape[0] > 0:
            # remove ctrl or pert not present, plus the last row of alpha
            excluded_rows = {i for i, x in enumerate(self.pert_idx) if x == -1}
            excluded_rows.add(self.n_p - 1)
            alpha_subset_idx = [
                i for i in range(alpha.shape[0]) if i not in excluded_rows
            ]
            pert_adj_idx = [i for i in self.pert_idx if i != -1]
            term4 = (
                -1
                * Spectra_util.geary_autocorrelation_multivariate(
                    mat[pert_adj_idx][:, pert_adj_idx], alpha[alpha_subset_idx]
                )["stat"]
            )

    loss = self.lam * term1 + term2 + term3
    if forward:
        return term1, recon
    return loss, term1, term2, term3, term4


def initialize(self, gene_sets, val):
    """Seed ``theta`` and ``eta`` from gene sets.

    form of gene_sets:

    cell_type (inc. global) : set of sets of idxs

    For each key, column ``k`` of ``theta`` is set to ``val`` at the indices
    of the k-th gene set. The last factor is then tied to the graph: its
    ``eta`` row and column are set to ``-val``, and its ``theta`` column is
    ``val`` for genes whose adjacency row sums to zero and ``-val`` for all
    others. Cell types with ``L == 0`` or an empty adjacency matrix are left
    untouched; ``"global"`` is always seeded.

    Parameters
    ----------
    gene_sets : dict
        Maps each cell type and ``"global"`` to an iterable of index
        collections.
    val : float
        Magnitude written into the parameters.
    """

    def seed(key):
        theta = self.theta[key].data
        eta = self.eta[key].data
        n_factors = self.L[key]
        for column, gene_set in enumerate(gene_sets[key]):
            theta[:, column][gene_set] = val
        eta[:n_factors, -1] = -val
        eta[-1, :n_factors] = -val
        no_edges = self.adj_matrix[key].sum(axis=1) == 0
        theta[:, -1][no_edges] = val
        theta[:, -1][~no_edges] = -val

    for ct in self.cell_types:
        assert self.L[ct] >= len(gene_sets[ct])
        if self.L[ct] > 0 and len(self.adj_matrix[ct]) > 0:
            seed(ct)

    assert self.L["global"] >= len(gene_sets["global"])
    seed("global")
torch.nn.init.xavier_uniform_(self.eta[ct]) + torch.nn.init.xavier_uniform_(self.theta["global"]) + torch.nn.init.xavier_uniform_(self.eta["global"]) + + def initialize_no_celltypes(self, gs_list, val): + assert self.L >= len(gs_list) + count = 0 + for gene_set in gs_list: + self.theta.data[:, count][gene_set] = val + count = count + 1 + for i in range(self.L): + self.eta.data[i, -1] = -val + self.eta.data[-1, i] = -val + self.theta.data[:, -1][self.adj_matrix.sum(axis=1) == 0] = val + self.theta.data[:, -1][self.adj_matrix.sum(axis=1) != 0] = -val + + def initialize_no_celltypes_no_geneset(self, gs_list, val): + torch.nn.init.xavier_uniform_(self.theta) + torch.nn.init.xavier_uniform_(self.eta) + + +class SPECTRA_Model: + """ + + Parameters + ---------- + X : np.ndarray or torch.Tensor + the ``(n, p)`` -shaped matrix containing logged expression count data. Used + for initialization + of self.n and self.p but not stored as an attribute + labels : np.ndarray or NoneType + the ``(n, )`` -shaped array containing cell type labels. If use_cell_types + == False, then + should be set to None + + L : dict or OrderedDict [if use_cell_types == False, then int] + ``number of cell types + 1``-shaped dictionary. Must have "global" as a key + , indicating the + number of global factors + { + "global": 15, + "CD8": 5 + ... + } + > Format matches output of K_est.py to estimate the number of + > Must match cell type labels provided during training + > Recommended practice is to assign at minimum 2 factors per cell type + > L contains the number of factors that describe the graph. + adj_matrix : dict or OrderedDict + ``a dictionary of adjacency matrices, one for every cell type + a "global" + { + "global": ``(p, p)``-shaped binary np.ndarray + "CD8": ... + + } + weights : dict or OrderedDict or NoneType [if use_cell_types == False, then + ``(p, p)``-shaped array] + the ``(p, p)``-shaped set of edge weights per . 
If weight[i,j] is + non-zero when adj_matrix[i,j] + = 0 this weight is ignored. + + if weights == None, no weights are used + lam : float + lambda parameter of the model, which controls the relative influence + of the graph vs expression + loss functions. This term multiplies the expression loss, so smaller + values of lambda upweight + the prior information + delta : float + delta parameter of the model, which controls a lower bound for gene + scaling factors. If delta + is small then the maximum ratio between gene scaling factors is + larger and lowly expressed + genes can be put on the same scale as highly expressed genes. + kappa : float or NoneType + kappa controls the background rate of edges in the graph. If kappa + is a float, kappa is fixed + to the given float value. If kappa == None, then kappa is a parameter + that is estimated from + the data. + rho : float or NoneType + rho controls the background rate of non-edges in the graph. If rho + is a float, rho is fixed + to the given float value. If rho == None, then rho is a parameter + that is estimated from the + data. + use_cell_types: bool + use_cell_types is a Boolean variable that determines whether cell + type labels are used to + fit the model. If False, then parameters are initialized as + nn.Parameter rather than as + nn.ParameterDict with cell type keys that index nn.Parameter values + determinant_penalty : float + determinant penalty affects the selection parameters that are fit + when L[cell_type] > + K[cell_type]. A determinantally regularized selection parameter + is fit with determinant + penalty that encourages sparsity and diversity. 
+ Attributes + ---------- + model.delta : delta parameter of the model + + model.lam : lambda parameter of the model + + model.determinant_penalty : determinant penalty of the model + + model.L : L parameter, either int, dict or OrderedDict() + + model.p : number of genes + + model.n : number of cells + + model.use_cell_types : if True then cell types are considered, else cell + types ignored. Affects + the dimensions of the initialized parameters. + + model.kappa : if not kappa, nn.ParameterDict() if use_cell_types, else + nn.Parameter(). If kappa + is a float, it is fixed throughout training + + model.rho : if not rho, nn.ParameterDict() if use_cell_types, else + nn.Parameter(). If rho is a float, + it is fixed throughout training + + model.adj_matrix : adjacency matrix with diagonal removed. dict + containing torch.Tensors + + model.adj_matrix_1m : 1 - adjacency matrix with diagonal removed. + dict containing torch.Tensors + + model.weights : contains edge weights. format matches adj_matrix + + model.cell_types : np.ndarray containing array of unique cell types + + model.cell_type_counts : dict {key = cell type, values = number of + cells} + + model.factors : nn.ParameterDict() or nn.Parameter() containing the + factor weights + + model.cell_scores : nn.ParameterDict() or nn.Parameter() containing + the cell loadings + + model.eta : nn.ParameterDict() or nn.Parameter() containing the + interaction matrix between + factors + + model.gene_scaling : nn.ParameterDict() or nn.Parameter() containing + the gene scale factors + + model.selection : nn.ParameterDict() or nn.Parameter() containing + the attention weights. 
+ Only initialized when L[cell_type] > K[cell_type] for some cell + type or when L > K and + use_cell_types == False + + + + Methods + ---------- + + model.train(self, X, labels, lr_schedule,num_epochs, verbose) : + model.save() + model.load() + model.initialize + model.return_selection() + model.return_eta_diag() + model.return_cell_scores() + model.return_factors() + model.return_eta() + model.return_rho() + model.return_kappa() + model.return_gene_scalings() + model.return_graph(ct = "global") : + model.matching(markers, gene_names_dict, threshold = 0.4): + + """ + + def __init__( + self, + X, + labels, + L, + pert_idx, + pert_labels, + vocab=None, + gs_dict=None, + use_weights=True, + adj_matrix=None, + weights=None, + lam=0.01, + psi=0.01, + delta=0.001, + kappa=None, + rho=0.001, + use_cell_types=True, + ): + self.L = L + self.lam = lam + self.delta = delta + self.kappa = kappa + self.rho = rho + self.use_cell_types = use_cell_types + + # if gs_dict is provided instead of adj_matrix, convert to adj_matrix, overrides + # adj_matrix and weights + if gs_dict is not None: + gene2id = dict((v, idx) for idx, v in enumerate(vocab)) + + if use_cell_types: + adj_matrix, weights = Spectra_util.process_gene_sets( + gs_dict=gs_dict, gene2id=gene2id, weighted=use_weights + ) + else: + adj_matrix, weights = Spectra_util.process_gene_sets_no_celltypes( + gs_dict=gs_dict, gene2id=gene2id, weighted=use_weights + ) + + self.internal_model = SPECTRA( + X=X, + labels=labels, + pert_idx=pert_idx, + pert_labels=pert_labels, + adj_matrix=adj_matrix, + L=L, + weights=weights, + lam=lam, + psi=psi, + delta=delta, + kappa=kappa, + rho=rho, + use_cell_types=use_cell_types, + ) + + self.cell_scores = None + self.factors = None + self.B_diag = None + self.eta_matrices = None + self.gene_scalings = None + self.rho = None + self.kappa = None + + def train( + self, + X, + D, + loss_weights, + X_val, + D_val, + loss_weights_val, + labels=None, + labels_val=None, + lr_schedule=[1.0, 0.5, 0.1, 
0.01, 0.001, 0.0001], + num_epochs=10000, + verbose=True, + ): + opt = torch.optim.AdamW( + self.internal_model.parameters(), lr=lr_schedule[0], weight_decay=0.001 + ) + counter = 0 + last = np.inf + train_losses = [] + val_losses = [] + + # batch if too large + X = torch.from_numpy(X) + D = torch.from_numpy(D) + X_val = torch.from_numpy(X_val) + D_val = torch.from_numpy(D_val) + loss_weights = torch.from_numpy(loss_weights) + loss_weights_val = torch.from_numpy(loss_weights_val) + if X.shape[0] > 2e5: + train_dataset = TensorDataset(X, D, loss_weights) + val_dataset = TensorDataset(X_val, D_val, loss_weights_val) + train_dataloader = DataLoader( + train_dataset, batch_size=int(5e4), num_workers=4, shuffle=True + ) + val_dataloader = DataLoader( + val_dataset, batch_size=int(5e4), num_workers=4, shuffle=False + ) + else: + train_dataloader = None + val_dataloader = None + + # train loop + for i in tqdm(range(num_epochs)): + train_epoch_loss = 0 + val_epoch_loss = 0 + # batch if data too large + if train_dataloader: + # train + for batch in train_dataloader: + X_batch = batch[0] + D_batch = batch[1] + loss_weights_batch = batch[2] + opt.zero_grad() + if self.internal_model.use_cell_types: + assert len(labels) == X.shape[0] + loss, term1_dict, term2_dict, term3_dict, term4_dict = ( + self.internal_model.loss( + X=X_batch, + D=D_batch, + loss_weights=loss_weights_batch, + labels=labels, + ) + ) + elif not self.internal_model.use_cell_types: + loss, term1, term2, term3, term4 = ( + self.internal_model.loss_no_cell_types( + X=X_batch, D=D_batch, loss_weights=loss_weights_batch + ) + ) + + loss.backward() + opt.step() + train_epoch_loss += loss.item() + train_epoch_loss = train_epoch_loss / len(train_dataloader) + + # val + with torch.no_grad(): + for batch in val_dataloader: + X_batch = batch[0] + D_batch = batch[1] + loss_weights_batch = batch[2] + if self.internal_model.use_cell_types: + assert len(labels) == X.shape[0] + loss, term1_dict, term2_dict, term3_dict, 
term4_dict = ( + self.internal_model.loss( + X=X_batch, + D=D_batch, + loss_weights=loss_weights_batch, + labels=labels, + ) + ) + elif not self.internal_model.use_cell_types: + loss, term1, term2, term3, term4 = ( + self.internal_model.loss_no_cell_types( + X=X_batch, + D=D_batch, + loss_weights=loss_weights_batch, + ) + ) + val_epoch_loss += loss.item() + val_epoch_loss = val_epoch_loss / len(val_dataloader) + else: + # train + opt.zero_grad() + if self.internal_model.use_cell_types: + assert len(labels) == X.shape[0] + loss, term1_dict, term2_dict, term3_dict, term4_dict = ( + self.internal_model.loss( + X=X, D=D, loss_weights=loss_weights, labels=labels + ) + ) + elif not self.internal_model.use_cell_types: + loss, term1, term2, term3, term4 = ( + self.internal_model.loss_no_cell_types( + X=X, D=D, loss_weights=loss_weights + ) + ) + loss.backward() + opt.step() + train_epoch_loss = loss.item() + + # val + with torch.no_grad(): + if self.internal_model.use_cell_types: + assert len(labels_val) == X_val.shape[0] + loss, term1_dict, term2_dict, term3_dict, term4_dict = ( + self.internal_model.loss( + X=X_val, + D=D_val, + loss_weights=loss_weights_val, + labels=labels_val, + ) + ) + elif not self.internal_model.use_cell_types: + loss, term1, term2, term3, term4 = ( + self.internal_model.loss_no_cell_types( + X=X_val, D=D_val, loss_weights=loss_weights_val + ) + ) + val_epoch_loss = loss.item() + + # lr adjustment + if train_epoch_loss >= last: + counter += 1 + if int(counter / 10) >= len(lr_schedule): + print("EARLY STOPPING") + break + if counter % 10 == 0: + opt = torch.optim.AdamW( + self.internal_model.parameters(), + lr=lr_schedule[int(counter / 10)], + ) + if verbose: + print("UPDATING LR TO " + str(lr_schedule[int(counter / 10)])) + + last = train_epoch_loss + train_losses.append(train_epoch_loss) + val_losses.append(val_epoch_loss) + + # add all model parameters as attributes + + if self.use_cell_types: + self.__store_parameters(labels) + else: + 
self.__store_parameters_no_celltypes() + return train_losses, val_losses + + def save(self, fp): + torch.save(self.internal_model.state_dict(), fp) + + def load(self, fp, labels=None): + self.internal_model.load_state_dict(torch.load(fp)) + if self.use_cell_types: + assert labels is not None + self.__store_parameters(labels) + else: + self.__store_parameters_no_celltypes() + + def __store_parameters(self, labels): + """ + Replaces __cell_scores() and __compute factors() and __compute_theta() + store parameters after fitting the model: + cell scores + factors + eta + scalings + gene scalings + kappa + rho + """ + + model = self.internal_model + + # compute the loading matrix + + k = sum(list(model.L.values())) + out = np.zeros((model.n_p, k)) + + global_idx = model.L["global"] + + tot = global_idx + f = ["global"] * model.L["global"] + for i, cell_type in enumerate(model.cell_types): + alpha = torch.exp(model.alpha[cell_type]).detach().cpu().numpy() + out[:, :global_idx] = alpha[:, :global_idx] + out[:, tot : tot + model.L[cell_type]] = alpha[:, global_idx:] + + tot += model.L[cell_type] + + f = f + [cell_type] * model.L[cell_type] + + out2 = np.zeros((k, model.p)) + + theta_ct = torch.softmax(model.theta["global"], dim=1) + theta = theta_ct.detach().cpu().numpy().T + tot = 0 + out2[0 : theta.shape[0], :] = theta + tot += theta.shape[0] + + for cell_type in model.cell_types: + theta_ct = torch.softmax(model.theta[cell_type], dim=1) + theta = theta_ct.detach().cpu().numpy().T + out2[tot : tot + theta.shape[0], :] = theta + tot += theta.shape[0] + + factors = out2 + lst = [] + for i in range(len(f)): + ct = f[i] + scaled = ( + factors[i, :] + * ( + model.gene_scaling[ct].exp().detach() + / (1.0 + model.gene_scaling[ct].exp().detach()) + + model.delta + ) + .cpu() + .numpy() + ) + + lst.append(scaled) + scaled = np.array(lst) + new_factors = scaled / (scaled.sum(axis=0, keepdims=True) + 1.0) + cell_scores = out * scaled.mean(axis=1).reshape(1, -1) + self.cell_scores = 
cell_scores + self.factors = new_factors + self.B_diag = self.__B_diag() + self.eta_matrices = self.__eta_matrices() + self.gene_scalings = { + ct: model.gene_scaling[ct].exp().detach().cpu().numpy() + / (1.0 + model.gene_scaling[ct].exp().cpu().detach().numpy()) + for ct in model.gene_scaling.keys() + } + self.rho = { + ct: model.rho[ct].exp().detach().cpu().numpy() + / (1.0 + model.rho[ct].exp().detach().cpu().numpy()) + for ct in model.rho.keys() + } + self.kappa = { + ct: model.kappa[ct].exp().detach().cpu().numpy() + / (1.0 + model.kappa[ct].exp().detach().cpu().numpy()) + for ct in model.kappa.keys() + } + + def __B_diag(self): + model = self.internal_model + k = sum(list(model.L.values())) + out = np.zeros(k) + + Bg = model.eta["global"].exp() / (1.0 + model.eta["global"].exp()) + Bg = 0.5 * (Bg + Bg.T) + B = torch.diag(Bg).detach().cpu().numpy() + tot = 0 + out[0 : B.shape[0]] = B + tot += B.shape[0] + + for cell_type in model.cell_types: + Bg = model.eta[cell_type].exp() / (1.0 + model.eta[cell_type].exp()) + Bg = 0.5 * (Bg + Bg.T) + B = torch.diag(Bg).detach().cpu().numpy() + out[tot : tot + B.shape[0]] = B + + tot += B.shape[0] + + return out + + def __eta_matrices(self): + model = self.internal_model + eta = OrderedDict() + Bg = model.eta["global"].exp() / (1.0 + model.eta["global"].exp()) + Bg = 0.5 * (Bg + Bg.T) + eta["global"] = Bg.detach().cpu().numpy() + + for cell_type in model.cell_types: + Bg = model.eta[cell_type].exp() / (1.0 + model.eta[cell_type].exp()) + Bg = 0.5 * (Bg + Bg.T) + eta[cell_type] = Bg.detach().cpu().numpy() + return eta + + def __store_parameters_no_celltypes(self): + """ + store parameters after fitting the model: + cell scores + factors + eta + scalings + gene scalings + kappa + rho + """ + model = self.internal_model + theta_ct = torch.softmax(model.theta, dim=1) + theta = theta_ct.detach().cpu().numpy().T + alpha = torch.exp(model.alpha).detach().cpu().numpy() + out = alpha + factors = theta + + scaled = factors * ( + 
model.gene_scaling.exp().detach().cpu() + / (1.0 + model.gene_scaling.exp().detach().cpu()) + + model.delta + ).numpy().reshape(1, -1) + new_factors = scaled / (scaled.sum(axis=0, keepdims=True) + 1.0) + + self.factors = new_factors + self.cell_scores = out * scaled.mean(axis=1).reshape(1, -1) + Bg = model.eta.exp() / (1.0 + model.eta.exp()) + Bg = 0.5 * (Bg + Bg.T) + self.B_diag = torch.diag(Bg).detach().cpu().numpy() + self.eta_matrices = Bg.detach().cpu().numpy() + self.gene_scalings = ( + model.gene_scaling.exp().detach().cpu() + / (1.0 + model.gene_scaling.exp().detach().cpu()) + ).numpy() + self.rho = ( + (model.rho.exp().detach() / (1.0 + model.rho.exp().detach())).cpu().numpy() + ) + self.kappa = ( + (model.kappa.exp().detach() / (1.0 + model.kappa.exp().detach())) + .cpu() + .numpy() + ) + + def initialize(self, annotations, word2id, W, init_scores, val=25): + """ + self.use_cell_types must be True + create form of gene_sets: + + cell_type (inc. global) : set of sets of idxs + + filter based on L_ct + """ + if self.use_cell_types: + if annotations: + if init_scores is None: + init_scores = compute_init_scores( + annotations, word2id, torch.Tensor(W) + ) # noqa + gs_dict = OrderedDict() + for ct in annotations.keys(): + mval = max(self.L[ct] - 1, 0) + sorted_init_scores = sorted( + init_scores[ct].items(), key=lambda x: x[1] + ) + sorted_init_scores = sorted_init_scores[-1 * mval :] + names = set([k[0] for k in sorted_init_scores]) + lst_ct = [] + for key in annotations[ct].keys(): + if key in names: + words = annotations[ct][key] + idxs = [] + for word in words: + if word in word2id: + idxs.append(word2id[word]) + lst_ct.append(idxs) + gs_dict[ct] = lst_ct + self.internal_model.initialize(gene_sets=gs_dict, val=val) + else: + self.internal_model.initialize_no_geneset(gene_sets=[], val=val) + else: + if annotations: + if init_scores is None: + init_scores = compute_init_scores_noct( + annotations, word2id, torch.Tensor(W) + ) # noqa + lst = [] + mval = 
max(self.L - 1, 0) + sorted_init_scores = sorted(init_scores.items(), key=lambda x: x[1]) + sorted_init_scores = sorted_init_scores[-1 * mval :] + names = set([k[0] for k in sorted_init_scores]) + for key in annotations.keys(): + if key in names: + words = annotations[key] + idxs = [] + for word in words: + if word in word2id: + idxs.append(word2id[word]) + lst.append(idxs) + self.internal_model.initialize_no_celltypes(gs_list=lst, val=val) + else: + self.internal_model.initialize_no_celltypes_no_geneset( + gs_list=[], val=val + ) + + def return_eta_diag(self): + return self.B_diag + + def return_cell_scores(self): + return self.cell_scores + + def return_factors(self): + return self.factors + + def return_eta(self): + return self.eta_matrices + + def return_rho(self): + return self.rho + + def return_kappa(self): + return self.kappa + + def return_gene_scalings(self): + return self.gene_scalings + + def return_graph(self, ct="global"): + model = self.internal_model + if self.use_cell_types: + eta = (model.eta[ct]).exp() / (1.0 + (model.eta[ct]).exp()) + eta = 0.5 * (eta + eta.T) + theta = torch.softmax(model.theta[ct].data, dim=1) + mat = contract("il,lj,kj->ik", theta, eta, theta).detach().cpu().numpy() + else: + eta = model.eta.exp() / (1.0 + model.eta.exp()) + eta = 0.5 * (eta + eta.T) + theta = torch.softmax(model.theta, dim=1) + mat = contract("il,lj,kj->ik", theta, eta, theta).detach().cpu().numpy() + + return mat + + def matching(self, markers, gene_names_dict, threshold=0.4): + """ + best match based on overlap coefficient + """ + markers = pd.DataFrame(markers) + if self.use_cell_types: + matches = [] + jaccards = [] + for i in range(markers.shape[0]): + max_jacc = 0.0 + best = "" + for key in gene_names_dict.keys(): + for gs in gene_names_dict[key].keys(): + t = gene_names_dict[key][gs] + + jacc = Spectra_util.overlap_coefficient( + list(markers.iloc[i, :]), t + ) + if jacc > max_jacc: + max_jacc = jacc + best = gs + matches.append(best) + 
jaccards.append(max_jacc) + + else: + matches = [] + jaccards = [] + for i in range(markers.shape[0]): + max_jacc = 0.0 + best = "" + for key in gene_names_dict.keys(): + t = gene_names_dict[key] + + jacc = Spectra_util.overlap_coefficient(list(markers.iloc[i, :]), t) + if jacc > max_jacc: + max_jacc = jacc + best = key + matches.append(best) + jaccards.append(max_jacc) + output = [] + for j in range(markers.shape[0]): + if jaccards[j] > threshold: + output.append(matches[j]) + else: + output.append("0") + return np.array(output) + + +def return_markers(factor_matrix, id2word, n_top_vals=100): + idx_matrix = np.argsort(factor_matrix, axis=1)[:, ::-1][:, :n_top_vals] + df = pd.DataFrame(np.zeros(idx_matrix.shape)) + for i in range(idx_matrix.shape[0]): + for j in range(idx_matrix.shape[1]): + df.iloc[i, j] = id2word[idx_matrix[i, j]] + return df.values + + +def vectorize_perts(adata, key, control_key): + """ + Vectorizing perturbation labels + + Returns: matrix of perturbation labels as vectors, labels for each column + """ + # create guide one hots (for encoding combos as superpositions) + perts = set() + for t in adata.obs[key]: + if t not in control_key: + guides = t.split("+") + guide1 = guides[0] + guide2 = None + if len(guides) == 2: + guide2 = guides[1] + perts.add(guide1) + if guide2: + perts.add(guide2) + for p in perts: + guides_p = [] + for t in adata.obs[key]: + if t == control_key: + guides_p.append(0) + else: + if p in t: + guides_p.append(1) + else: + guides_p.append(0) + adata.obs[f"guide_{p}"] = np.array(guides_p) + + guide_one_hot_cols = get_guide_one_hot_cols(adata.obs) + adata.obs["num_guides"] = adata.obs[guide_one_hot_cols].sum(1) + # combinations encoded as application of two individual guides + D = adata.obs[guide_one_hot_cols].to_numpy().astype(np.float32) + + return D, guide_one_hot_cols + + +def vectorize_perts_combinations(adata, key, control_key): + """ + Vectorizing perturbation labels, with combinations considered a unique perturbation 
+ + Returns: matrix of perturbation labels as vectors, labels for each column + """ + # encode combinations as unique + D_df = pd.get_dummies(adata.obs[key]) + D_df = D_df.drop(columns=control_key) + # encode non-targeting as no perturbation for consistency with other encoding + d_var_info = np.array(D_df.T[[]].index) + + # get singletons-only binarization + for index, row in D_df.iterrows(): + pert_idx = np.where(row)[0] + if len(pert_idx) > 0: + pert_label = d_var_info[pert_idx][0].split("+") + single_idx = [list(d_var_info).index(i) for i in pert_label] + for i in single_idx: + row.iloc[i] = 1 + + return D_df.to_numpy().astype(np.float32), list(d_var_info) + + +def get_guide_one_hot_cols(obs: pd.DataFrame): + guide_one_hot_cols = [ + col + for col in obs.columns + if "guide_" in col and col not in ("guide_identity", "guide_ids") + ] + return guide_one_hot_cols diff --git a/src/Spectra/Spectra_util.py b/src/Spectra/Spectra_util.py new file mode 100644 index 0000000..2396cd6 --- /dev/null +++ b/src/Spectra/Spectra_util.py @@ -0,0 +1,433 @@ +### graph functions #### +from collections import OrderedDict + +import numpy as np +import pandas as pd +import scipy +import torch + +# from adjustText import adjust_text + + +def _compute_geary(L: torch.Tensor, Z: torch.Tensor): + """Compute Geary's C multivariate autocorrelation. + + Parameters + ---------- + L : np.ndarray + Graph Laplacian. + Z : np.ndarray + Feature matrix. + Returns + ------- + np.ndarray + Geary's C values. + """ + c = torch.linalg.multi_dot([Z.T, L, Z]).diagonal() + + return c + + +def geary_autocorrelation_multivariate( + G: torch.Tensor, + X: torch.Tensor, + max_z: int = 100, +): + """Compute Geary's C multivariate autocorrelation. + + Parameters + ---------- + G : torch.Tensor + Graph or adjacency matrix. + X : torch.Tensor + Feature matrix. + max_z : int + Maximum value for z score, by default 100 + + Returns + ------- + pd.Series + Geary's C values. 
+ """ + + n, p = X.shape + d = torch.sum(G, dim=0) + D = torch.diag(d) + L = D - G + + Z = (X - X.mean(dim=0)) / X.std(dim=0) + Z = Z.nan_to_num() + + c_unnormalized = _compute_geary(L, Z) + c_normalized = c_unnormalized / d.sum() # per annotation + + out_dict = { + "stat": c_normalized.sum(), + "pval": 1, + "z": 0, + "stat_annotation": c_normalized, + } + + return out_dict + + +""" +methods +_______ + +amatrix(gene_set_list, gene2id) + +amatrix_weighted(gene_set_list, gene2id) + +unravel_dict(dict_) + +process_gene_sets() + +process_gene_sets_no_celltypes() + +overlap_coefficient() + +label_marker_genes() + + +""" + + +def overlap_coefficient(list1, list2): + """ + Computes overlap coefficient between two lists + """ + intersection = len(list(set(list1).intersection(set(list2)))) + union = min(len(list1), len(list2)) # + len(list2)) - intersection + return float(intersection) / union + + +def check_gene_set_dictionary( + adata, + annotations, + obs_key="cell_type_annotations", + global_key="global", + return_dict=True, + min_len=3, + use_cell_types=True, +): + """ + Filters annotations dictionary to contain only genes contained in the + adata. + Checks that annotations dictionary cell type keys and adata cell types + are identical. + Checks that all gene sets in annotations dictionary contain >2 genes + after filtering. 
+ + + adata: AnnData , data to use with Spectra + annotations: dict , gene set annotations dictionary to use with Spectra + obs_key: str , column name for cell type annotations in adata.obs + global_key: str , key for global gene sests in gene set annotation + dictionary + return_dict: bool , return filtered gene set annotation dictionary + min_len: int, minimum length of gene sets + + returns: dict , filtered gene set annotation dictionary + + """ + # test if keys match + if use_cell_types: + adata_labels = list(set(adata.obs[obs_key])) + [ + global_key + ] # cell type labels in adata object + else: + annotations = {global_key: annotations} + adata_labels = [global_key] + annotation_labels = list(annotations.keys()) + # matching_celltype_labels = list(set(adata_labels).intersection(annotation_labels)) + if set(annotation_labels) != set(adata_labels): + missing_adata = set(adata_labels) - set(annotation_labels) + missing_dict = set(annotation_labels) - set(adata_labels) + raise ValueError( + "The following adata labels are missing in the gene set \ + annotation dictionary:", + missing_dict, + "The following gene set annotation dictionary keys are \ + missing in the adata labels:", + missing_adata, + ) + dict_keys_OK = False + else: + print( + "Cell type labels in gene set annotation dictionary and \ + AnnData object are identical" + ) + dict_keys_OK = True + + # check that gene sets in dictionary have len >2 + annotations_new = {} + for k, v in annotations.items(): + annotations_new[k] = {} + for k2, v2 in v.items(): + gs = [x for x in v2 if x in adata.var_names] + if len(gs) < min_len: + print( + "removing gene set", + k2, + "for cell type", + k, + "which is of length", + len(v2), + len(gs), + "genes are found in the data.", + "minimum length is", + min_len, + ) + else: + annotations_new[k][k2] = gs + + # raise error if no gene sets remain + if not use_cell_types and len(annotations_new[global_key]) == 0: + raise ValueError( + "No gene sets remain in the gene set 
annotation dictionary. \ + Please make sure that gene names correspond to names found \ + in `adata.var_names`. See: https://github.com/dpeerlab/spectra/issues/34." + ) + + if dict_keys_OK: + print("Your gene set annotation dictionary is now correctly formatted.") + if return_dict: + if not use_cell_types: + return annotations_new[global_key] + else: + return annotations_new + + +def label_marker_genes(marker_genes, gs_dict, threshold=0.4): + """ + label an array of marker genes using the gene_set_dictionary in est_spectra + returns a dataframe of overlap coefficients for each gene set annotation + and marker gene + marker_genes: array factors x marker genes or a KnowledgeBase object + label an array containing marker genes by its overlap with a dictionary of + gene sets from the knowledge base: + KnowledgeBase.celltype_process_dict + """ + + overlap_df = pd.DataFrame() + marker_set_len_dict = {} # len of gene sets to resolve ties + for i, v in pd.DataFrame(marker_genes).T.items(): + overlap_temp = [] + gs_names_temp = [] + + for gs_name, gs in gs_dict.items(): + marker_set_len_dict[gs_name] = len(gs) + overlap_temp.append(overlap_coefficient(set(gs), set(v))) + gs_names_temp.append(gs_name) + overlap_df_temp = pd.DataFrame(overlap_temp, columns=[i], index=gs_names_temp).T + overlap_df = pd.concat([overlap_df, overlap_df_temp]) + overlap_df.loc["gene_set_length"] = list( + overlap_df.columns.map(marker_set_len_dict) + ) + + # find maximum overlap coefficient gene set label for each factor, resolve + # ties by gene set length + marker_gene_labels = [] # gene sets + + marker_gene_list = list(overlap_df.index) + marker_gene_list.remove("gene_set_length") + for marker_set in marker_gene_list: + # resolve ties in overlap_coefficient by selecting the bigger gene set + max_overlap = ( + overlap_df.loc[[marker_set, "gene_set_length"]] + .T.sort_values(by="gene_set_length", ascending=True) + .sort_values(by=marker_set, ascending=True)[marker_set] + .index[-1] + ) + + if 
overlap_df.loc[marker_set].sort_values().values[-1] > threshold: + marker_gene_labels.append(max_overlap) + else: + marker_gene_labels.append(marker_set) + overlap_df = overlap_df.drop(index="gene_set_length") + overlap_df.index = marker_gene_labels + return overlap_df + + +def amatrix(gene_set_list, gene2id): + """ + creates adjacency matrix from gene set list + """ + n = len(gene2id) + adj_matrix = np.zeros((n, n)) + for gene_set in gene_set_list: + for i in range(len(gene_set)): + for j in range(len(gene_set)): + g1 = gene_set[i] + g2 = gene_set[j] + if (g1 in gene2id) & (g2 in gene2id): + adj_matrix[gene2id[g1], gene2id[g2]] = 1 + return adj_matrix + + +def amatrix_weighted(gene_set_list, gene2id): + """ + Creates weighted adjacency matrix from gene sets + uses 1/{n choose 2} as edge weights - edge weights accumulate additively + """ + n = len(gene2id) + adj_matrix = np.zeros((n, n)) + ws = [] + for gene_set in gene_set_list: + if len(gene_set) > 1: + w = 1.0 / (len(gene_set) * (len(gene_set) - 1) / 2.0) + else: + w = 1.0 + ws.append(w) + for i in range(len(gene_set)): + for j in range(len(gene_set)): + g1 = gene_set[i] + g2 = gene_set[j] + if (g1 in gene2id) & (g2 in gene2id): + adj_matrix[gene2id[g1], gene2id[g2]] += w + med = np.median(np.array(ws)) + return adj_matrix / float(med) + + +def unravel_dict(dict_): + lst = [] + for key in dict_.keys(): + lst.append(dict_[key]) + return lst + + +def process_gene_sets(gs_dict, gene2id, weighted=True): + """ + { "global": {"": [, , ...]} + } + """ + adict = OrderedDict() + adict["global"] = amatrix(unravel_dict(gs_dict["global"]), gene2id) + weights = None + + if weighted: + weights = OrderedDict() + weights["global"] = amatrix_weighted(unravel_dict(gs_dict["global"]), gene2id) + + for key in gs_dict.keys(): + if len(gs_dict[key]) > 0: + adict[key] = amatrix(unravel_dict(gs_dict[key]), gene2id) + if weighted: + weights[key] = amatrix_weighted(unravel_dict(gs_dict[key]), gene2id) + else: + adict[key] = [] + if 
weighted: + weights[key] = [] + + return adict, weights + + +def process_gene_sets_no_celltypes(gs_dict, gene2id, weighted=True): + """ + input: {"": [, , ...]} + } + gene2id {gene name: index in vocab} + + weighted: whether to return NoneType or weighted adjacency matrix + """ + adict = amatrix(unravel_dict(gs_dict), gene2id) + weights = None + if weighted: + weights = amatrix_weighted(unravel_dict(gs_dict), gene2id) + return adict, weights + + +def get_factor_celltypes(adata, obs_key, cellscore_obsm_key="SPECTRA_cell_scores"): + """ + Assigns Spectra factors to cell types by analyzing the factor cell scores. + Cell type specific factors will have zero cell scores except in their respective cell type + + adata: AnnData , object containing the Spectra output + obs_key: str , column name in adata.obs containing the cell type annotations + cellscore_obsm_key: str , key for adata.obsm containing the Spectra cell scores + + returns: dict , dictionary of {factor index : 'cell type'} + """ + + # get cellscores + import pandas as pd + + cell_scores_df = pd.DataFrame(adata.obsm[cellscore_obsm_key]) + cell_scores_df["celltype"] = list(adata.obs[obs_key]) + + # find global and cell type specific fators + global_factors_series = (cell_scores_df.groupby("celltype").mean() != 0).all() + global_factors = [ + factor + for factor in global_factors_series.index + if global_factors_series[factor] + ] + specific_cell_scores = ( + (cell_scores_df.groupby("celltype").mean()).T[~global_factors_series].T + ) + specific_factors = {} + + for i in set(cell_scores_df["celltype"]): + specific_factors[i] = [ + factor + for factor in specific_cell_scores.loc[i].index + if specific_cell_scores.loc[i, factor] + ] + + # inverse dict factor:celltype + factors_inv = {} + for i, v in specific_factors.items(): + for factor in v: + factors_inv[factor] = i + + # add global + + for factor in global_factors: + factors_inv[factor] = "global" + + return factors_inv + + +## importance and information score 
functions + + +def mimno_coherence_single(w1, w2, W): + # eps = 0.01 + dw1 = W[:, w1] > 0 + dw2 = W[:, w2] > 0 + # N = W.shape[0] + + dw1w2 = (dw1 & dw2).float().sum() + dw1 = dw1.float().sum() + dw2 = dw2.float().sum() + if (dw1 == 0) | (dw2 == 0): + return -0.1 * np.inf + return ((dw1w2 + 1) / (dw2)).log() + + +def mimno_coherence_2011(words, W): + score = 0 + V = len(words) + + for j1 in range(1, V): + for j2 in range(j1): + score += mimno_coherence_single(words[j1], words[j2], W) + denom = V * (V - 1) / 2 + return score / denom + + +def get_information_score(adata, idxs, cell_type): + if "spectra_vocab" not in adata.var.columns: + print("adata requires spectra_vocab attribute.") + return None + + idx_to_use = adata.var["spectra_vocab"] + X = adata.X[:, idx_to_use] + if X is scipy.sparse.csr.csr_matrix: + X = np.array(X.todense()) + X = torch.Tensor(X) + lst = [] + # for j in range(idxs.shape[0]): + # lst.append(mimno_coherence_2011(list(idxs[j,:]), X[labels == cell_type])) + return lst diff --git a/src/Spectra/__init__.py b/src/Spectra/__init__.py new file mode 100644 index 0000000..9f039d6 --- /dev/null +++ b/src/Spectra/__init__.py @@ -0,0 +1,13 @@ +# + +# from .load import default_gene_sets, sample_data +# from .Spectra import ( +# SPECTRA, +# SPECTRA_EM, +# SPECTRA_Model, +# est_spectra, +# gene_set_graph, +# get_factor_celltypes, +# graph_network, +# graph_network_multiple, +# load_from_pickle, +# ) diff --git a/src/Spectra/initialization.py b/src/Spectra/initialization.py new file mode 100644 index 0000000..2a3a206 --- /dev/null +++ b/src/Spectra/initialization.py @@ -0,0 +1,60 @@ +from collections import OrderedDict + + +def mimno_coherence_single(w1, w2, W): + dw1 = W[:, w1] > 0 + dw2 = W[:, w2] > 0 + + dw1w2 = (dw1 & dw2).float().sum() + dw1 = dw1.float().sum() + dw2 = dw2.float().sum() + + return ((dw1w2 + 1) / (dw2)).log() + + +def mimno_coherence_2011(words, W): + score = 0 + V = len(words) + + for j1 in range(1, V): + for j2 in range(j1): + score 
+= mimno_coherence_single(words[j1], words[j2], W) + denom = V * (V - 1) / 2 + return score / denom + + +def compute_init_scores_noct(gs_dict, word2id, W): + init_scores = OrderedDict() + keys = list(gs_dict.keys()) + for key in keys: + gs = gs_dict[key] + idxs = [] + for word in gs: + if word in word2id: + idxs.append(word2id[word]) + # idxs = [word2id[word] for word in gs] + coh = mimno_coherence_2011(idxs, W) + init_scores[key] = coh.item() + return init_scores + + +def compute_init_scores(gs_dict, word2id, W): + keys = list(gs_dict.keys()) + init_scores = OrderedDict() + for key in keys: + if len(gs_dict[key]) > 0: + inner_keys = list(gs_dict[key].keys()) + init_scores[key] = OrderedDict() + for inner_key in inner_keys: + gs = gs_dict[key][inner_key] + idxs = [] + for word in gs: + if word in word2id: + idxs.append(word2id[word]) + # idxs = [word2id[word] for word in gs] + coh = mimno_coherence_2011(idxs, W) + init_scores[key][inner_key] = coh.item() + else: + init_scores[key] = {} + + return init_scores diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/utils.py b/utils.py new file mode 100644 index 0000000..96d58a5 --- /dev/null +++ b/utils.py @@ -0,0 +1,818 @@ +import itertools +import os +import pickle +import random +import re +from typing import Any, List + +import anndata as ad +import boto3 +import botocore +import numpy as np +import pandas as pd +import torch +from gprofiler import GProfiler +from scipy import sparse +from scipy.stats import false_discovery_control, hypergeom +from sklearn.metrics import auc, precision_recall_curve +from sklearn.model_selection import KFold, train_test_split + +SPECTRA_DEFAULT_DIR = os.path.join(os.path.expanduser("~"), "pertspectra_cache") +GSEA_COLUMN_DATATYPES_FOR_SERIALIZATION = { + "Name": "category", + "Term": "category", + "ES": "float32", + "NES": "float32", + "NOM p-val": "float32", + "FDR q-val": "float32", + "FWER p-val": "float32", + "Tag %": "str", + 
"Gene %": "str", + "Lead_genes": "str", + "gsea_weight": "category", + "pval": "float32", + "fdr_bh": "float32", + "fwer_bf": "float32", + "GO_ID": "category", + "Lead_g_id": "str", +} + + +def read_aws_h5ad(s3_url): + save_path = os.path.join(SPECTRA_DEFAULT_DIR, s3_url.split("/")[-1]) + + os.makedirs(os.path.dirname(save_path), exist_ok=True) + s3 = boto3.resource("s3") + + # Get the bucket name and key from the s3 url + bucket_name, key = s3_url.removeprefix("s3://").split("/", 1) + + s3_object = s3.Object(bucket_name=bucket_name, key=key) + s3_object.download_file(save_path) + + adata = ad.read_h5ad(save_path) + return adata + + +def read_aws_csv(s3_url, sep=",", zipped=False, header="infer"): + save_path = os.path.join(SPECTRA_DEFAULT_DIR, s3_url.split("/")[-1]) + + os.makedirs(os.path.dirname(save_path), exist_ok=True) + s3 = boto3.resource("s3") + + # Get the bucket name and key from the s3 url + bucket_name, key = s3_url.removeprefix("s3://").split("/", 1) + + s3_object = s3.Object(bucket_name=bucket_name, key=key) + try: + s3_object.download_file(save_path) + if not zipped: + df = pd.read_csv(save_path, sep=sep, header=header) + else: + df = pd.read_csv( + save_path, compression="gzip", delimiter="\t", header=header + ) + return df + except botocore.exceptions.ClientError as e: + if e.response["Error"]["Code"] == "404": + print("object does not exist") + return None + return None + + +def read_aws_npz(s3_url, sep=","): + save_path = os.path.join(SPECTRA_DEFAULT_DIR, s3_url.split("/")[-1]) + + os.makedirs(os.path.dirname(save_path), exist_ok=True) + s3 = boto3.resource("s3") + + # Get the bucket name and key from the s3 url + bucket_name, key = s3_url.removeprefix("s3://").split("/", 1) + + s3_object = s3.Object(bucket_name=bucket_name, key=key) + try: + s3_object.download_file(save_path) + mtx = np.load(save_path) + return mtx + except botocore.exceptions.ClientError as e: + if e.response["Error"]["Code"] == "404": + print("object does not exist") + return 
None + return None + + +def read_aws_pickle(s3_url, sep=","): + save_path = os.path.join(SPECTRA_DEFAULT_DIR, s3_url.split("/")[-1]) + + os.makedirs(os.path.dirname(save_path), exist_ok=True) + s3 = boto3.resource("s3") + + # Get the bucket name and key from the s3 url + bucket_name, key = s3_url.removeprefix("s3://").split("/", 1) + + s3_object = s3.Object(bucket_name=bucket_name, key=key) + try: + s3_object.download_file(save_path) + with open(save_path, "rb") as f: + pickle_obj = pickle.load(f) + return pickle_obj + except botocore.exceptions.ClientError as e: + if e.response["Error"]["Code"] == "404": + print("object does not exist") + return None + return None + + +def read_aws_json(s3_url): + import json + + save_path = os.path.join(SPECTRA_DEFAULT_DIR, s3_url.split("/")[-1]) + + os.makedirs(os.path.dirname(save_path), exist_ok=True) + s3 = boto3.resource("s3") + + # Get the bucket name and key from the s3 url + bucket_name, key = s3_url.removeprefix("s3://").split("/", 1) + + s3_object = s3.Object(bucket_name=bucket_name, key=key) + try: + s3_object.download_file(save_path) + f = open(save_path) + data = json.load(f) + return data + except botocore.exceptions.ClientError as e: + if e.response["Error"]["Code"] == "404": + print("object does not exist") + return None + return None + + +def write_model_pickle_to_s3(s3_url, model_name, model): + model_path = f"{model_name}.pickle" + temp_path = os.path.join(SPECTRA_DEFAULT_DIR, model_path) + + os.makedirs(os.path.dirname(temp_path), exist_ok=True) + s3 = boto3.resource("s3") + + with open(temp_path, "wb") as f: + pickle.dump(model, f, pickle.HIGHEST_PROTOCOL) + # Get the bucket name and key from the s3 url + bucket_name, key = s3_url.removeprefix("s3://").split("/", 1) + # Specify model version in key + key = key + model_path + # Write model to s3 + s3.Object(bucket_name=bucket_name, key=key).put(Body=open(temp_path, "rb")) + + +def write_adata_to_s3(s3_url, adata_name, adata): + adata_path = 
f"{adata_name}.h5ad" + temp_path = os.path.join(SPECTRA_DEFAULT_DIR, adata_path) + + os.makedirs(os.path.dirname(temp_path), exist_ok=True) + s3 = boto3.resource("s3") + + adata.write_h5ad(temp_path) + # Get the bucket name and key from the s3 url + bucket_name, key = s3_url.removeprefix("s3://").split("/", 1) + # Specify model version in key + key = key + adata_path + # Write model to s3 + s3.Object(bucket_name=bucket_name, key=key).put(Body=open(temp_path, "rb")) + + +def read_model_pickle_from_s3(s3_url, model_name): + model_path = f"{model_name}.pickle" + temp_path = os.path.join(SPECTRA_DEFAULT_DIR, model_path) + + os.makedirs(os.path.dirname(temp_path), exist_ok=True) + s3 = boto3.resource("s3") + + # Get the bucket name and key from the s3 url + bucket_name, key = s3_url.removeprefix("s3://").split("/", 1) + # Specify model version in key + key = key + model_path + + # Read model from s3 + s3_object = s3.Object(bucket_name=bucket_name, key=key) + try: + s3_object.download_file(temp_path) + except botocore.exceptions.ClientError as e: + if e.response["Error"]["Code"] == "404": + print("Model could not be found") + return None + + with open(temp_path, "rb") as f: + model = pickle.load(f) + return model + + +def load_model( + adata, + s3_dir, + experiment_name, + model_name, + use_cell_types=False, + cell_type_key="", +): + """ + Loads trained model + + adata: AnnData, data to store model results + s3_dir: str, directory where the model is stored + experiment_name: str, name of experiment (same as wandb name) + model_name: str, name of model + use_cell_types: bool, boolean if model used cell types + cell_type_key: str, cell type key is use_cell_types==True + + returns: + trained model: SPECTRA_Model instance + anndata with saved parameters: contains the following fields: + - + + """ + # load model from checkpoint + wrapper = read_model_pickle_from_s3(s3_dir + experiment_name, model_name) + + # initialize Spectra wrapper + if use_cell_types: + labels = 
np.array(adata.obs[cell_type_key]) + wrapper._SPECTRA_Model__store_parameters(labels) + else: + wrapper._SPECTRA_Model__store_parameters_no_celltypes() + + # save parameters + # vocab = adata.var_names + # id2word = dict((idx, v) for idx, v in enumerate(vocab)) + # word2id = dict((v, idx) for idx, v in enumerate(vocab)) + adata.uns["SPECTRA_factors"] = wrapper.factors + adata.uns["SPECTRA_L"] = wrapper.internal_model.L + adata.uns["SPECTRA_pert_scores"] = wrapper.cell_scores + + return wrapper, adata + + +# + +# preprocess perturbations for rna565 - leave intergenics as separate +# reformat guide gene naming conventions: ctrl for control, + delimiting the guides +def replace_ctrl_words(s): + # Define the regex pattern + pattern = r"\bCTRL00\w*\b" + + # Check if the string is 'nan' + if s == "nan": + return s + + else: + rep = re.sub(pattern, "ctrl", s) + return rep + + +def replace_intergenic_words(s): + # Define the regex pattern + pattern = r"\bINTERGENIC\w*\b" + + # Check if the string is 'nan' + if s == "nan": + return s + + else: + rep = re.sub(pattern, "intergenic", s) + return rep + + +def inhouse_preprocess(adata): + guides = np.array(adata.obs["target_gene_name"]) + # leave nans, reformat | as +, and replace CTRL as controls + # filtered_nan_guides = np.where(guides == "nan", "ctrl", guides) + filtered_delimiter_guides = np.array([x.replace("|", "+") for x in guides]) + v_replace_intergenic_words = np.vectorize(replace_intergenic_words) + filtered_delimiter_guides = v_replace_intergenic_words(filtered_delimiter_guides) + v_replace_ctrl_words = np.vectorize(replace_ctrl_words) + filtered_delimiter_guides = v_replace_ctrl_words(filtered_delimiter_guides) + adata.obs["condition"] = filtered_delimiter_guides + adata.obs["condition"] = np.where( + adata.obs["condition"] == "ctrl+ctrl", "ctrl", adata.obs["condition"] + ) + + # add control column + condition = np.array(adata.obs["condition"]) + controls = np.where(condition == "ctrl", 1, 0) + adata.obs["control"] 
= controls + + # # reformat singletons + for i, guide in enumerate(adata.obs["condition"]): + if ("ctrl" in guide) and (guide.count("+") == 1): + pert = guide.split("+") + if pert[0] == "ctrl": + adata.obs["condition"][i] = pert[1] + else: + adata.obs["condition"][i] = pert[0] + return adata + + +def filter_noisy_genes(adata): + """ + Filter noisy genes from anndata - both the expression and graph + """ + # filter noise genes + noise_prefixes = set(["RPL", "RPS", "MT-", "NEAT1", "MALAT1", "NDUF"]) + + def prefix_match(name, taglist): + taglist = tuple(taglist) + if name.startswith(taglist): + return name + return None + + relevant_gene_idx = [] + for i, x in enumerate(adata.var_names): + match = prefix_match(x, noise_prefixes) + if not match: + relevant_gene_idx.append(i) + + adata = adata[:, relevant_gene_idx] + adata.uns["sparse_gene_network"] = sparse.csr_matrix( + adata.uns["sparse_gene_network"].todense()[relevant_gene_idx][ + :, relevant_gene_idx + ] + ) + + return adata + + +def split_data_by_cell(X, D, test_size=0.2, val_size=0.2): + """ + Split data into train/val/test by cells (seeing all perturbations in training) + """ + data_idx = [i for i in range(X.shape[0])] + train_val_idx, test_idx, D_train_val, D_test = train_test_split( + data_idx, D, test_size=test_size, random_state=1, stratify=D + ) + train_idx, val_idx, D_train, D_val = train_test_split( + train_val_idx, + D_train_val, + test_size=val_size, + random_state=1, + stratify=D_train_val, + ) + + return train_idx, val_idx, test_idx + + +def split_data_by_combinations( + adata, + X, + D, + perturbation_key="condition", + intergenic="intergenic", + test_size=0.3, + val_size=0.2, +): + """ + Split data into train/val/test by perturbation (holdout some combinations) + """ + pert_list = list(adata.obs[perturbation_key].unique()) + combo_perts = [i for i in pert_list if ("+" in i) and (intergenic not in i)] + single_perts = [i for i in pert_list if ("+" not in i) or (intergenic in i)] + + single_idx = [ 
+ i for i, x in enumerate(adata.obs[perturbation_key]) if x in single_perts + ] + D_single = D[single_idx] + + train_idx_single, val_idx_single, D_train_single, D_val_single = train_test_split( + single_idx, D_single, test_size=test_size, random_state=1, stratify=D_single + ) + train_val_combos, test_combos = train_test_split( + combo_perts, test_size=val_size, random_state=1 + ) + train_val_idx = [ + i for i, x in enumerate(adata.obs[perturbation_key]) if x in train_val_combos + ] + test_idx = [ + i for i, x in enumerate(adata.obs[perturbation_key]) if x in test_combos + ] + D_train_val = D[train_val_idx] + train_idx_c, val_idx_c, _, _ = train_test_split( + train_val_idx, D_train_val, test_size=0.2, random_state=1, stratify=D_train_val + ) + train_idx = train_idx_single + train_idx_c + val_idx = val_idx_single + val_idx_c + + return train_idx, val_idx, test_idx + + +def generate_k_fold( + adata, + X, + D, + perturbation_key="condition", + intergenic="intergenic", + folds=5, + fold_idx=0, + test_size=0.2, + val_size=0.2, +): + """ + Split data into folds + """ + pert_list = list(adata.obs[perturbation_key].unique()) + combo_perts = [i for i in pert_list if ("+" in i)] + single_perts = [i for i in pert_list if ("+" not in i)] + + # singletons + single_idx = [ + i for i, x in enumerate(adata.obs[perturbation_key]) if x in single_perts + ] + D_single = D[single_idx] + train_val_single_idx, test_single_idx, D_train_val_single, D_test_single = ( + train_test_split( + single_idx, D_single, test_size=test_size, random_state=1, stratify=D_single + ) + ) + train_single_idx, val_single_idx, _, _ = train_test_split( + train_val_single_idx, + D_train_val_single, + test_size=val_size, + random_state=1, + stratify=D_train_val_single, + ) + + # combos - kfold + kf = KFold(n_splits=folds, random_state=1, shuffle=True) + kf_splits = kf.split(combo_perts) + train_val_combos_idx, test_combos_idx = next( + itertools.islice(kf_splits, fold_idx, None) + ) + train_val_combos = 
np.array(combo_perts)[train_val_combos_idx] + test_combos = np.array(combo_perts)[test_combos_idx] + + train_val_combos_idx = [ + i for i, x in enumerate(adata.obs[perturbation_key]) if x in train_val_combos + ] + test_combos_idx = [ + i for i, x in enumerate(adata.obs[perturbation_key]) if x in test_combos + ] + D_train_val = D[train_val_combos_idx] + train_idx_c, val_idx_c, _, _ = train_test_split( + train_val_combos_idx, + D_train_val, + test_size=0.2, + random_state=1, + stratify=D_train_val, + ) + + train_idx = train_single_idx + train_idx_c + val_idx = val_single_idx + val_idx_c + test_idx = test_single_idx + test_combos_idx + + return train_idx, val_idx, test_idx + + +def set_seed(seed: int) -> None: + """Sets the random seed to seed. + + Args: + seed: the random seed. + """ + + torch.manual_seed(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + random.seed(seed) + np.random.seed(seed) + + +def generate_loss_weights(adata, perturbation_key): + """ + Generate loss weights for weighting based on fraction of perturbations + + Parameters: + - adata: full anndata + - perturbation_key: key for perturbation annotations + """ + # generate weights for losses - inverse of number of cells + loss_weights = {} + for x in adata.obs[perturbation_key].unique(): + weight = 1 / adata.obs[perturbation_key].value_counts()[x] + loss_weights[x] = weight + sample_weights = np.array([loss_weights[i] for i in adata.obs[perturbation_key]]) + return sample_weights + + +#### Interpretability Analyses #### +GPROFILER_SOURCES = [ + "GO:MF", + "GO:CC", + "GO:BP", + "REAC", + "WP", + "TF", + "MIRNA", + "HPA", + "CORUM", + "HP", +] + + +def get_gprofiler( + de_genes: pd.DataFrame, + organism: str = "hsapiens", + sources: List[str] = GPROFILER_SOURCES, + no_evidences: bool = False, +) -> pd.DataFrame: + """ + Perform gene set enrichment using gprofiler. + de_genes should have a column "gene_symbol" containing gene symbols. 
+ """ + gp = GProfiler(return_dataframe=True) + result = gp.profile( + query=list(de_genes["gene_symbol"]), + organism=organism, + sources=sources, + no_evidences=no_evidences, + ) + go_to_gene = read_aws_pickle("s3://pert-spectra/references/GO_to_Gene.pickle") + filtered_goterms = list(go_to_gene.keys()) + result = result[result.native.isin(filtered_goterms)] + return result + + +def retrieve_stringdb_neighbors(genes: List = []): + """ + Use StringDB to retrieve functional neighbors for perturbations + """ + # retrieve stringdb + PERTSPECTRA_DEFAULT_DIR = os.path.join(os.path.expanduser("~"), "pertspectra_cache") + stringdb_s3_url = "s3://pert-spectra/references/StringDB.HQ.txt" + save_path = os.path.join(PERTSPECTRA_DEFAULT_DIR, stringdb_s3_url.split("/")[-1]) + os.makedirs(os.path.dirname(save_path), exist_ok=True) + s3 = boto3.resource("s3") + # Get the bucket name and key from the s3 url + bucket_name, key = stringdb_s3_url.removeprefix("s3://").split("/", 1) + s3_object = s3.Object(bucket_name=bucket_name, key=key) + s3_object.download_file(save_path) + stringdb_hq = pd.read_csv(save_path, sep="\t") + + perts = [x for x in genes if x not in ["ctrl", "intergenic", "basal"]] + pert_neighbors: dict[Any, Any] = {key: set() for key in perts} + for index, row in stringdb_hq.iterrows(): + if row["x"] > 0.8: + if row["i_genes"] in pert_neighbors.keys(): + pert_neighbors[row["i_genes"]].add(row["j_genes"]) + elif row["j_genes"] in pert_neighbors.keys(): + pert_neighbors[row["j_genes"]].add(row["i_genes"]) + return pert_neighbors + + +def run_gsea( + de_genes: pd.DataFrame, + min_size: int = 10, + max_size: int = 500, + weighted_score_type: int = 0, + permutation_num: int = 1000, + ascending: bool = False, + no_plot: bool = True, + processes: int = -1, + verbose: bool = True, + seed: int = 0, + gsea_name_col: str = "gene_symbol", +): + import gseapy as gp + from statsmodels.stats.multitest import multipletests + + gsea_inputs = de_genes[[gsea_name_col, 
"z"]].sort_values("z", ascending=False) + gene_sets = read_aws_pickle("s3://pert-spectra/references/GO_to_Gene.pickle") + + # Run prerank + pre_res = gp.prerank( + gsea_inputs, + gene_sets=gene_sets, + outdir=None, + min_size=min_size, + max_size=max_size, + weighted_score_type=weighted_score_type, + permutation_num=permutation_num, + ascending=ascending, + no_plot=no_plot, + verbose=verbose, + threads=processes, + seed=seed, + ) + result = pre_res.res2d + + # Adjust p-value + result["gsea_weight"] = weighted_score_type + + result["pval"] = (result["NOM p-val"] * permutation_num + 1) / (permutation_num + 1) + result["fdr_bh"] = multipletests(result["pval"], method="fdr_bh")[1] + result["fwer_bf"] = np.clip(result["pval"] * result.shape[0], 0, 1) + result["GO_ID"] = result["Term"] + # adjust data types to enable serialization to h5ad if gsea output is added to .uns + # attribute of anndata + columns_to_format = { + k: v + for (k, v) in GSEA_COLUMN_DATATYPES_FOR_SERIALIZATION.items() + if k in result.columns + } + result = result.astype(columns_to_format) + + return result, pre_res + + +def factor_enrichment_gsea(adata, latent, max_size=300, fdr=5e-3): + factor_to_go = {} + latent = latent + for i in range(len(latent)): + # rank by latent factor loadings + lvec = latent[i] + gsea_input = pd.DataFrame( + [adata.var_names, lvec], + index=["gene_symbol", "z"], + ).T + # run gsea + gsea_input["z"] = gsea_input["z"].astype("float") + res = run_gsea(gsea_input, max_size=max_size)[0] + # filter to BP + go_reference = read_aws_csv( + "s3://pert-spectra/references/GO_terms.txt.gz", zipped=True + ) + go_bp = go_reference[go_reference["go_category"] == "biological_process"] + go_bp_ids = set(go_bp["go_id"].values) + # filter on fdr and nes + res = res[res["fdr_bh"] <= fdr] + res = res[np.abs(res["NES"]) > 1] + res = res[res["GO_ID"].isin(go_bp_ids)] + factor_to_go[i] = res + return factor_to_go + + +def perturbation_signal_recovery( + pert_to_go, model_pert_to_go, 
filtered_go_terms, perturbations +): + """ + Returns p-value of bootstrapped hypergeoemtric test of the overlap between known processes + vs model identified processes + + Args: + pert_to_go (dict): dict of perturbations to GO terms from a prior + model_pert_to_go (dict): dict of perturbations to GO terms from the model + filtered_go_terms (list): list of all GO terms in the prior + perturbations (list): list of perturbations + + Return: + pvals (dict): dict of perturbation to corrected p-value + """ + set_seed(0) + pvals = {} + for pert in perturbations: + if pert in ["ctrl", "intergenic", "basal"]: + continue + groupA = pert_to_go[pert] + groupB = model_pert_to_go[pert] + # Total number of unique items + M = len(filtered_go_terms) + # Number of items in set1 + n = len(groupA) + # Number of items in set2 + k = len(groupB) + # Number of overlapping items (intersection of both sets) + N = len(groupA.intersection(groupB)) + + # only consider if there are >5 processes from researchdb + if n < 5: + continue + rv = hypergeom(M, n, k) + observed_p_value = rv.sf(N - 1) + + # Output the p-value + print(f"Overlap for {pert}: {N} out of {n} in researchDB") + print(f"P-value for {pert}: {observed_p_value}") + if N == 0: + pvals[pert] = 1 + else: + pvals[pert] = observed_p_value + + # pvalue correction + pval_list = list(pvals.values()) + pvals_corrected = false_discovery_control(pval_list) + for i, key in enumerate(pvals): + pvals[key] = pvals_corrected[i] + return pvals + + +### Precision-recall analysis ### +def auprc( + dist_matrix: pd.DataFrame, + benchmark_sources: list = ["StringDB", "CORUM"], + benchmark_data_dir: str = "s3://pert-spectra/references/recall_datasets/", + log_stats: bool = False, +): + """ + Return AUC and best F1 score+threshold of precision-recall curve + """ + # convert distance matrix to sim matrix + d_norm = (dist_matrix - dist_matrix.min()) / (dist_matrix.max() - dist_matrix.min()) + sim_matrix = 1 - d_norm + + # calculate pr metric + auc_metrics = 
{} + f1_metrics = {} + pr_metrics = {} + # inputs = {} + for s in benchmark_sources: + rels = get_benchmark_relationships(benchmark_data_dir, s) + rels = rels[ + rels.node_1.isin(sim_matrix.index) & rels.node_2.isin(sim_matrix.index) + ] + adj_true = np.array( + [ + [0 for _ in range(len(sim_matrix.index))] + for _ in range(len(sim_matrix.index)) + ] + ) + adj_labels = {x: i for i, x in enumerate(sim_matrix.index)} + for i in range(adj_true.shape[0]): + adj_true[i][i] = 1 + for i, row in rels.iterrows(): + n1, n2 = row["node_1"], row["node_2"] + adj_true[adj_labels[n1]][adj_labels[n2]] = 1 + adj_true[adj_labels[n2]][adj_labels[n1]] = 1 + fpr, tpr, thresholds = precision_recall_curve( + np.reshape(adj_true.flatten(), (-1, 1)), + np.reshape(sim_matrix.values.flatten(), (-1, 1)), + ) + # calculate auc + auc_metrics[s] = {auc(tpr, fpr)} + # calculate best f1 + f1_scores = 2 * tpr * fpr / (tpr + fpr) + f1_metrics[s] = [np.max(f1_scores), thresholds[np.argmax(f1_scores)]] + # record pr metrics + pr_metrics[s] = {"precision": fpr, "recall": tpr, "thresholds": thresholds} + # record inputs + # inputs[s] = {'adj_true':np.reshape(adj_true.flatten(), (-1, 1)), + #'sim_matrix':np.reshape(sim_matrix.values.flatten(), (-1, 1))} + + return ( + pd.DataFrame.from_dict(auc_metrics, orient="index", columns=["AUC"]), + pd.DataFrame.from_dict(f1_metrics, orient="index", columns=["F1", "Threshold"]), + pr_metrics, + ) + + +### Recall Analyses borrowed from EFAAR### +def get_benchmark_relationships(benchmark_data_dir: str, src: str, filter=True): + """ + Reads a CSV file containing benchmark data and returns a filtered DataFrame. + + Args: + benchmark_data_dir (str): The directory containing the benchmark data files. + src (str): The name of the source containing the benchmark data. + filter (bool, optional): Whether to filter the DataFrame. Defaults to True. + + Returns: + pd.DataFrame: A DataFrame containing the benchmark relationships. 
+ """ + df = read_aws_csv(benchmark_data_dir + src + ".txt") + return filter_relationships(df) if filter else df + + +def convert_metrics_to_df(metrics: dict, source: str) -> pd.DataFrame: + """ + Convert metrics dictionary to dataframe to be used in summary. + + Args: + metrics (dict): metrics dictionary + source (str): benchmark source name + + Returns: + pd.DataFrame: a dataframe with metrics + """ + metrics_dict_with_list = {key: [value] for key, value in metrics.items()} + metrics_dict_with_list["source"] = [source] + return pd.DataFrame.from_dict(metrics_dict_with_list) + + +def filter_relationships(df: pd.DataFrame): + """ + Filters a DataFrame of relationships between entities, removing any rows with + self-relationships + , ie. where the same entity appears in both columns, and also removing any duplicate + relationships (A-B and B-A). + + Args: + df (pd.DataFrame): DataFrame containing columns 'entity1' and 'entity2', representing the + entities involved in + each relationship. + + Returns: + pd.DataFrame: DataFrame containing columns 'entity1' and 'entity2', representing + the entities + involved in + each relationship after removing any rows where the same entity appears in both columns. + """ + df["sorted_entities"] = df.apply( + lambda row: tuple(sorted([row.node_1, row.node_2])), axis=1 + ) + df["node_1"] = df.sorted_entities.apply(lambda x: x[0]) + df["node_2"] = df.sorted_entities.apply(lambda x: x[1]) + return df[["node_1", "node_2"]].query("node_1!=node_2").drop_duplicates()