From 12f54cfbbfacafc618ac09dee819001308e8858c Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Fri, 25 Aug 2023 12:39:38 +0200 Subject: [PATCH] Update/integrate last v1 changes (#214) * update migration scripts * update batch_int * update bat_int clus_overlap * update denoising * add cp10k norm * update schema * update label_projection * [WIP] spectral_features new control_method * add diffusion map method * update dim_red spectral_feature and diffu_map * update dim_red * generalise cp normalization * CPM -> CP10k * fix failing test * fix typo * update and rename rmse to distance correlation * set CP normalization from cpm to cp10k * fix typo cpm to cp * updated changelog --------- Co-authored-by: Robrecht Cannoodt --- CHANGELOG.md | 8 +- src/common/comp_tests/check_method_config.py | 2 +- src/common/create_component/script.py | 2 +- src/common/schemas/defs_common.yaml | 2 +- .../normalization/log_cp/config.vsh.yaml | 22 ++++++ .../{log_cpm => log_cp}/script.py | 11 +-- .../normalization/log_cpm/config.vsh.yaml | 13 ---- .../{sqrt_cpm => sqrt_cp}/config.vsh.yaml | 11 ++- .../{sqrt_cpm => sqrt_cp}/script.py | 9 ++- src/datasets/processors/pca/script.py | 2 +- .../resource_test_scripts/multimodal.sh | 4 +- .../resource_test_scripts/pancreas.sh | 4 +- .../workflows/process_openproblems_v1/main.nf | 8 +- src/migration/check_migration.sh | 14 ++++ .../check_migration_status/script.py | 16 +++- .../no_integration_batch/config.vsh.yaml | 2 +- .../random_embed_cell/config.vsh.yaml | 2 +- .../random_embed_cell_jitter/config.vsh.yaml | 2 +- .../random_integration/config.vsh.yaml | 2 +- .../methods/bbknn/config.vsh.yaml | 6 +- .../methods/combat/config.vsh.yaml | 6 +- .../methods/fastmnn/config.vsh.yaml | 2 +- .../methods/mnn_correct/config.vsh.yaml | 2 +- .../methods/mnnpy/config.vsh.yaml | 4 +- .../methods/scanorama_embed/config.vsh.yaml | 6 +- .../methods/scanorama_feature/config.vsh.yaml | 6 +- .../methods/scanvi/config.vsh.yaml | 2 +- .../methods/scvi/config.vsh.yaml | 4 +- .../metrics/asw_batch/config.vsh.yaml | 2 +- .../metrics/asw_label/config.vsh.yaml | 2 +- .../cell_cycle_conservation/config.vsh.yaml | 2 +- .../clustering_overlap/config.vsh.yaml | 4 +- .../metrics/pcr/config.vsh.yaml | 2 +- .../batch_integration/workflows/run/main.nf | 2 +- .../no_denoising/config.vsh.yaml | 2 +- .../perfect_denoising/config.vsh.yaml | 2 +- .../denoising/methods/alra/config.vsh.yaml | 2 +- .../denoising/methods/dca/config.vsh.yaml | 2 +- .../methods/knn_smoothing/config.vsh.yaml | 2 +- .../denoising/methods/magic/config.vsh.yaml | 4 +- .../denoising/metrics/mse/config.vsh.yaml | 4 +- .../denoising/metrics/poisson/config.vsh.yaml | 5 +- src/tasks/denoising/workflows/run/main.nf | 2 +- src/tasks/denoising/workflows/run/run_test.sh | 2 +- .../workflows/run/run_test_on_tower.sh | 2 +- .../random_features/config.vsh.yaml | 2 +- .../spectral_features/config.vsh.yaml | 41 ++++++++++ .../true_features/config.vsh.yaml | 23 +----- .../control_methods/true_features/script.py | 12 +-- .../methods/densmap/config.vsh.yaml | 12 +-- .../methods/diffusion_map/config.vsh.yaml | 44 +++++++++++ .../methods/diffusion_map/script.py | 77 +++++++++++++++++++ .../methods/ivis/config.vsh.yaml | 4 +- .../methods/neuralee/config.vsh.yaml | 6 +- .../methods/pca/config.vsh.yaml | 8 +- .../methods/phate/config.vsh.yaml | 12 +-- .../methods/tsne/config.vsh.yaml | 8 +- .../methods/umap/config.vsh.yaml | 10 +-- .../metrics/coranking/config.vsh.yaml | 14 ++-- .../metrics/coranking/library.bib | 62 --------------- 
.../density_preservation/config.vsh.yaml | 11 ++- .../metrics/density_preservation/script.py | 25 +++--- .../distance_correlation/config.vsh.yaml | 49 ++++++++++++ .../{rmse => distance_correlation}/script.py | 23 +++--- .../metrics/rmse/config.vsh.yaml | 45 ----------- .../metrics/trustworthiness/config.vsh.yaml | 2 +- .../resources_test_scripts/pancreas.sh | 4 +- .../workflows/run/main.nf | 2 +- .../workflows/run/run_test.sh | 2 +- .../workflows/run/run_test_on_tower.sh | 2 +- .../majority_vote/config.vsh.yaml | 2 +- .../random_labels/config.vsh.yaml | 2 +- .../true_labels/config.vsh.yaml | 2 +- .../methods/knn/config.vsh.yaml | 6 +- .../logistic_regression/config.vsh.yaml | 6 +- .../methods/mlp/config.vsh.yaml | 6 +- .../methods/scanvi/config.vsh.yaml | 4 +- .../methods/scanvi_scarches/config.vsh.yaml | 4 + .../seurat_transferdata/config.vsh.yaml | 4 +- .../methods/xgboost/config.vsh.yaml | 6 +- .../metrics/accuracy/config.vsh.yaml | 2 +- .../metrics/f1/config.vsh.yaml | 6 +- .../resources_test_scripts/pancreas.sh | 4 +- .../label_projection/workflows/run/main.nf | 2 +- .../workflows/run/run_test.sh | 2 +- .../workflows/run/run_test_on_tower.sh | 2 +- 86 files changed, 455 insertions(+), 322 deletions(-) create mode 100644 src/datasets/normalization/log_cp/config.vsh.yaml rename src/datasets/normalization/{log_cpm => log_cp}/script.py (73%) delete mode 100644 src/datasets/normalization/log_cpm/config.vsh.yaml rename src/datasets/normalization/{sqrt_cpm => sqrt_cp}/config.vsh.yaml (52%) rename src/datasets/normalization/{sqrt_cpm => sqrt_cp}/script.py (80%) create mode 100644 src/migration/check_migration.sh create mode 100644 src/tasks/dimensionality_reduction/control_methods/spectral_features/config.vsh.yaml create mode 100644 src/tasks/dimensionality_reduction/methods/diffusion_map/config.vsh.yaml create mode 100644 src/tasks/dimensionality_reduction/methods/diffusion_map/script.py delete mode 100644 src/tasks/dimensionality_reduction/metrics/coranking/library.bib create mode 100644 src/tasks/dimensionality_reduction/metrics/distance_correlation/config.vsh.yaml rename src/tasks/dimensionality_reduction/metrics/{rmse => distance_correlation}/script.py (70%) delete mode 100644 src/tasks/dimensionality_reduction/metrics/rmse/config.vsh.yaml diff --git a/CHANGELOG.md b/CHANGELOG.md index c63820536b..4c84ff9790 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,10 @@ ## general +### NEW FUNCTIONALITY + +* Updated all current tasks in v2 to latest changes in OP v1 (PR #214) + ### MAJOR CHANGES * Relocate task directories to new `src/tasks/` location (PR #142). @@ -11,6 +15,8 @@ and `ghcr.io/openproblems-bio/base-r` (PR #168). * Update batch integration docker images to OpenProblems base images (PR #171). + +* Changed default normalization CPM to CP10k (PR #214) ### MINOR CHANGES @@ -274,7 +280,7 @@ * `methods/neuralee`: Migrated from v1. -* `metrics/rmse`: Migrated from v1, but will likely be removed. +* `metrics/distance_correlation`: Migrated from v1, but will likely be removed. * `metrics/trustworthiness`: Migrated from v1, but will likely be removed. 
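For orientation before the component diffs: the CPM -> CP10k switch noted in the changelog above comes down to a single parameter. The renamed `log_cp` component (further down in this patch) generalizes the old hard-coded `target_sum=1e6` into an `--n_cp` argument, so one script yields `log_cpm` (1e6) or `log_cp10k` (1e4). A minimal sketch of that logic, assuming scanpy's `sc.pp.normalize_total` and `sc.pp.log1p` APIs; the `par` dict and input path mirror the component's test block and are illustrative only:

    import anndata as ad
    import scanpy as sc

    # Illustrative parameters; 1e4 -> log_cp10k, 1e6 -> log_cpm.
    par = {
        "input": "resources_test/common/pancreas/dataset.h5ad",  # hypothetical test file
        "n_cp": 1e4,
        "norm_id": "log_cp10k",
    }

    adata = ad.read_h5ad(par["input"])

    # Scale each cell's counts to sum to n_cp, then log1p-transform.
    norm = sc.pp.normalize_total(
        adata, target_sum=par["n_cp"], layer="counts", inplace=False
    )
    adata.layers[par["norm_id"]] = sc.pp.log1p(norm["X"])
    adata.obs[par["norm_id"] + "_size_factors"] = norm["norm_factor"]
    adata.uns["normalization_id"] = par["norm_id"]

The same pattern applies to the renamed `sqrt_cp` component, which takes the square root of the scaled counts instead of `log1p`.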
diff --git a/src/common/comp_tests/check_method_config.py b/src/common/comp_tests/check_method_config.py index ecbb2dbaf2..61a2bf0f6f 100644 --- a/src/common/comp_tests/check_method_config.py +++ b/src/common/comp_tests/check_method_config.py @@ -98,7 +98,7 @@ def search_ref_bib(reference): assert arg_id in arg_names, f"Argument '{arg_id}' in `.functionality.info.variants['{paramset_id}']` is not an argument in `.functionality.arguments`." assert "preferred_normalization" in info, "preferred_normalization not an info field" -norm_methods = ["log_cpm", "counts", "log_scran_pooling", "sqrt_cpm", "l1_sqrt"] +norm_methods = ["log_cpm", "log_cp10k", "counts", "log_scran_pooling", "sqrt_cpm", "sqrt_cp10k", "l1_sqrt"] assert info["preferred_normalization"] in norm_methods, "info['preferred_normalization'] not one of '" + "', '".join(norm_methods) + "'." diff --git a/src/common/create_component/script.py b/src/common/create_component/script.py index 1bc6d97cc5..1c7de0010c 100644 --- a/src/common/create_component/script.py +++ b/src/common/create_component/script.py @@ -80,7 +80,7 @@ def generate_info(par, component_type, pretty_name) -> str: | description: | | FILL IN: A (multi-line) description of how this method works. | # Which normalisation method this component prefers to use (required). - | preferred_normalization: log_cpm + | preferred_normalization: log_cp10k |''') if component_type == "method": str += strip_margin(f'''\ diff --git a/src/common/schemas/defs_common.yaml b/src/common/schemas/defs_common.yaml index 0032c0e1c6..a069d5cc35 100644 --- a/src/common/schemas/defs_common.yaml +++ b/src/common/schemas/defs_common.yaml @@ -59,7 +59,7 @@ definitions: required: [ type ] additionalProperties: false PreferredNormalization: - enum: [l1_sqrt, log_cpm, log_scran_pooling, sqrt_cpm, counts] + enum: [l1_sqrt, log_cpm, log_cp10k, log_scran_pooling, sqrt_cpm, sqrt_cp10k, counts] description: | Which normalization method a component prefers. diff --git a/src/datasets/normalization/log_cp/config.vsh.yaml b/src/datasets/normalization/log_cp/config.vsh.yaml new file mode 100644 index 0000000000..4d1770f2c4 --- /dev/null +++ b/src/datasets/normalization/log_cp/config.vsh.yaml @@ -0,0 +1,22 @@ +__merge__: ../../api/comp_normalization.yaml +functionality: + name: "log_cp" + description: "Normalize data using Log CP" + resources: + - type: python_script + path: script.py + arguments: + - name: "--n_cp" + type: integer + default: 1e4 + description: "Number of counts per cell" + - name: "--norm_id" + type: string + default: log_cp10k + description: "normalization ID to use e.g. 
1e6 -> log_cpm, 1e4 -> log_cp10k" +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_python:1.0.1 + - type: nextflow + directives: + label: [ lowmem, lowcpu ] diff --git a/src/datasets/normalization/log_cpm/script.py b/src/datasets/normalization/log_cp/script.py similarity index 73% rename from src/datasets/normalization/log_cpm/script.py rename to src/datasets/normalization/log_cp/script.py index 6a28cbcc22..0fadc2ffe4 100644 --- a/src/datasets/normalization/log_cpm/script.py +++ b/src/datasets/normalization/log_cp/script.py @@ -4,11 +4,12 @@ par = { 'input': "resources_test/common/pancreas/dataset.h5ad", 'output': "output.h5ad", - 'layer_output': "log_cpm", - 'obs_size_factors': "log_cpm_size_factors" + 'layer_output': "log_cp10k", + 'obs_size_factors': "log_cp10k_size_factors", + 'n_cp': 1e6, } meta = { - "functionality_name": "normalize_log_cpm" + "functionality_name": "normalize_log_cp10k" } ## VIASH END @@ -18,7 +19,7 @@ print(">> Normalize data", flush=True) norm = sc.pp.normalize_total( adata, - target_sum=1e6, + target_sum=par["n_cp"], layer="counts", inplace=False ) @@ -27,7 +28,7 @@ print(">> Store output in adata", flush=True) adata.layers[par["layer_output"]] = lognorm adata.obs[par["obs_size_factors"]] = norm["norm_factor"] -adata.uns["normalization_id"] = meta["functionality_name"] +adata.uns["normalization_id"] = par["norm_id"] print(">> Write data", flush=True) adata.write_h5ad(par['output'], compression="gzip") diff --git a/src/datasets/normalization/log_cpm/config.vsh.yaml b/src/datasets/normalization/log_cpm/config.vsh.yaml deleted file mode 100644 index 631bdbae10..0000000000 --- a/src/datasets/normalization/log_cpm/config.vsh.yaml +++ /dev/null @@ -1,13 +0,0 @@ -__merge__: ../../api/comp_normalization.yaml -functionality: - name: "log_cpm" - description: "Normalize data using Log CPM" - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.1 - - type: nextflow - directives: - label: [ lowmem, lowcpu ] diff --git a/src/datasets/normalization/sqrt_cpm/config.vsh.yaml b/src/datasets/normalization/sqrt_cp/config.vsh.yaml similarity index 52% rename from src/datasets/normalization/sqrt_cpm/config.vsh.yaml rename to src/datasets/normalization/sqrt_cp/config.vsh.yaml index dcf0b36b64..a347ec01d0 100644 --- a/src/datasets/normalization/sqrt_cpm/config.vsh.yaml +++ b/src/datasets/normalization/sqrt_cp/config.vsh.yaml @@ -1,10 +1,19 @@ __merge__: ../../api/comp_normalization.yaml functionality: - name: "sqrt_cpm" + name: "sqrt_cp" description: "Normalize data using Log Sqrt" resources: - type: python_script path: script.py + arguments: + - name: "--n_cp" + type: integer + default: 1e4 + description: "Number of counts per cell" + - name: "--norm_id" + type: string + default: sqrt_cp10k + description: "normalization id to use e.g. 
1e4 -> sqrt_cp10k, 1e6 -> sqrt_cpm" platforms: - type: docker image: ghcr.io/openproblems-bio/base_python:1.0.1 diff --git a/src/datasets/normalization/sqrt_cpm/script.py b/src/datasets/normalization/sqrt_cp/script.py similarity index 80% rename from src/datasets/normalization/sqrt_cpm/script.py rename to src/datasets/normalization/sqrt_cp/script.py index f99227f3c9..af30b56083 100644 --- a/src/datasets/normalization/sqrt_cpm/script.py +++ b/src/datasets/normalization/sqrt_cp/script.py @@ -6,7 +6,8 @@ 'input': "resources_test/common/pancreas/dataset.h5ad", 'output': "output.h5ad", 'layer_output': "sqrt_cpm", - 'obs_size_factors': "size_factors_sqrt_cpm" + 'obs_size_factors': "size_factors_sqrt_cpm", + 'n_cp': 1e6, } meta = { "functionality_name": "normalize_sqrt_cpm" @@ -19,16 +20,16 @@ print(">> Normalize data", flush=True) norm = sc.pp.normalize_total( adata, - target_sum=1e6, + target_sum=par['n_cp'], layer="counts", inplace=False ) -lognorm = np.sqrt(norm["X"]) +lognorm = np.sqrt(norm['X']) print(">> Store output in adata", flush=True) adata.layers[par["layer_output"]] = lognorm adata.obs[par["obs_size_factors"]] = norm["norm_factor"] -adata.uns["normalization_id"] = meta["functionality_name"] +adata.uns["normalization_id"] = par["norm_id"] print(">> Write data", flush=True) adata.write_h5ad(par['output'], compression="gzip") diff --git a/src/datasets/processors/pca/script.py b/src/datasets/processors/pca/script.py index ffc89c34c0..0990b97374 100644 --- a/src/datasets/processors/pca/script.py +++ b/src/datasets/processors/pca/script.py @@ -4,7 +4,7 @@ ### VIASH START par = { 'input': 'resources_test/common/pancreas/dataset.h5ad', - 'layer_input': 'log_cpm', + 'layer_input': 'log_cp10k', 'output': 'dataset.h5ad', 'obsm_embedding': 'X_pca', 'varm_loadings': 'pca_loadings', diff --git a/src/datasets/resource_test_scripts/multimodal.sh b/src/datasets/resource_test_scripts/multimodal.sh index fe0e9c472b..364efbf3ad 100644 --- a/src/datasets/resource_test_scripts/multimodal.sh +++ b/src/datasets/resource_test_scripts/multimodal.sh @@ -43,12 +43,12 @@ viash run src/datasets/processors/subsample/config.vsh.yaml -- \ # run sqrt cpm normalisation on mod 1 file -viash run src/datasets/normalization/log_cpm/config.vsh.yaml -- \ +viash run src/datasets/normalization/sqrt_cp/config.vsh.yaml -- \ --input $DATASET_DIR/raw_mod1.h5ad \ --output $DATASET_DIR/normalized_mod1.h5ad # run log cpm normalisation on mod 2 file -viash run src/datasets/normalization/log_cpm/config.vsh.yaml -- \ +viash run src/datasets/normalization/log_cp/config.vsh.yaml -- \ --input $DATASET_DIR/raw_mod2.h5ad \ --output $DATASET_DIR/normalized_mod2.h5ad diff --git a/src/datasets/resource_test_scripts/pancreas.sh b/src/datasets/resource_test_scripts/pancreas.sh index e78f738649..9a49f7c7de 100755 --- a/src/datasets/resource_test_scripts/pancreas.sh +++ b/src/datasets/resource_test_scripts/pancreas.sh @@ -42,8 +42,8 @@ viash run src/datasets/processors/subsample/config.vsh.yaml -- \ --output $DATASET_DIR/raw.h5ad \ --seed 123 -# run log cpm normalisation -viash run src/datasets/normalization/log_cpm/config.vsh.yaml -- \ +# run log cp10k normalisation +viash run src/datasets/normalization/log_cp/config.vsh.yaml -- \ --input $DATASET_DIR/raw.h5ad \ --output $DATASET_DIR/normalized.h5ad diff --git a/src/datasets/workflows/process_openproblems_v1/main.nf b/src/datasets/workflows/process_openproblems_v1/main.nf index aa6e0f4243..dfca3e8b49 100644 --- a/src/datasets/workflows/process_openproblems_v1/main.nf +++ 
b/src/datasets/workflows/process_openproblems_v1/main.nf
@@ -7,9 +7,9 @@ targetDir = params.rootDir + "/target/nextflow"
 include { openproblems_v1 } from "$targetDir/datasets/loaders/openproblems_v1/main.nf"
 
 // normalization methods
-include { log_cpm } from "$targetDir/datasets/normalization/log_cpm/main.nf"
+include { log_cp } from "$targetDir/datasets/normalization/log_cp/main.nf"
 include { log_scran_pooling } from "$targetDir/datasets/normalization/log_scran_pooling/main.nf"
-include { sqrt_cpm } from "$targetDir/datasets/normalization/sqrt_cpm/main.nf"
+include { sqrt_cp } from "$targetDir/datasets/normalization/sqrt_cp/main.nf"
 include { l1_sqrt } from "$targetDir/datasets/normalization/l1_sqrt/main.nf"
 
 // dataset processors
@@ -27,8 +27,8 @@ config = readConfig("$projectDir/config.vsh.yaml")
 // add custom tracer to nextflow to capture exit codes, memory usage, cpu usage, etc.
 traces = initialize_tracer()
 
-// normalization_methods = [log_cpm, log_scran_pooling, sqrt_cpm, l1_sqrt
-normalization_methods = [log_cpm, sqrt_cpm, l1_sqrt]
+// normalization_methods = [log_cp, log_scran_pooling, sqrt_cp, l1_sqrt
+normalization_methods = [log_cp, sqrt_cp, l1_sqrt]
 
 workflow {
   helpMessage(config)
diff --git a/src/migration/check_migration.sh b/src/migration/check_migration.sh
new file mode 100644
index 0000000000..1ce39634f2
--- /dev/null
+++ b/src/migration/check_migration.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+# viash run src/common/get_git_sha/config.vsh.yaml -p native -- --input /home/kai/Documents/openroblems/openproblems --output output/op_git_sha.json
+
+TASK_IDS=`ls src/tasks`
+
+for task_id in $TASK_IDS; do
+  echo ">> Processing $task_id"
+  viash run src/common/get_method_info/config.vsh.yaml -- --input . --task_id $task_id --output output/${task_id}_method.json
+  viash run src/migration/check_migration_status/config.vsh.yaml -p native -- --git_sha resources_test/input_git_sha.json --comp_info output/${task_id}_method.json --output output/${task_id}_method_status.json
+  viash run src/common/get_metric_info/config.vsh.yaml -- --input .
--task_id $task_id --output output/${task_id}_metric.json + viash run src/migration/check_migration_status/config.vsh.yaml -p native -- --git_sha resources_test/input_git_sha.json --comp_info output/${task_id}_metric.json --output output/${task_id}_metric_status.json + +done \ No newline at end of file diff --git a/src/migration/check_migration_status/script.py b/src/migration/check_migration_status/script.py index 86d0a2ba46..6e88b2d9ed 100644 --- a/src/migration/check_migration_status/script.py +++ b/src/migration/check_migration_status/script.py @@ -3,9 +3,9 @@ ## VIASH START par = { - 'git_sha': 'temp/openproblems-v1.json', - 'comp_info': 'temp/denoising_metrics.json', - 'output': 'temp/migration_status.json' + 'git_sha': 'resources_test/input_git_sha.json', + 'comp_info': 'output/denoising_metric.json', + 'output': 'output/denoising_metric_status.json' } ## VIASH END @@ -16,10 +16,18 @@ def check_status(comp_item: List[Dict[str, str]], git_objects: List[Dict[str, st git_object["sha"].""" v1_path = comp_item.get("v1", {}).get("path") + + if "metric_id" in comp_item: + v1_path = comp_item.get("v1.path") + if not v1_path: return "v1.path missing" v1_commit = comp_item.get("v1", {}).get("commit") + + if "metric_id" in comp_item: + v1_commit = comp_item.get("v1.commit") + if not v1_commit: return "v1.commit missing" @@ -28,7 +36,7 @@ def check_status(comp_item: List[Dict[str, str]], git_objects: List[Dict[str, st return "v1.path does not exist in git repo" git_sha = git_object[0]["sha"] - if git_sha == comp_item["v1_commit"]: + if git_sha == v1_commit: return "up to date" else: return f"out of date (sha: {git_sha})" diff --git a/src/tasks/batch_integration/control_methods/no_integration_batch/config.vsh.yaml b/src/tasks/batch_integration/control_methods/no_integration_batch/config.vsh.yaml index da3013e908..b57dbb1cf9 100644 --- a/src/tasks/batch_integration/control_methods/no_integration_batch/config.vsh.yaml +++ b/src/tasks/batch_integration/control_methods/no_integration_batch/config.vsh.yaml @@ -9,7 +9,7 @@ functionality: v1: path: openproblems/tasks/_batch_integration/batch_integration_embed/methods/baseline.py commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - preferred_normalization: log_cpm + preferred_normalization: log_cp10k resources: - type: python_script path: script.py diff --git a/src/tasks/batch_integration/control_methods/random_embed_cell/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_embed_cell/config.vsh.yaml index f6f6e89a56..a4ea2c49b8 100644 --- a/src/tasks/batch_integration/control_methods/random_embed_cell/config.vsh.yaml +++ b/src/tasks/batch_integration/control_methods/random_embed_cell/config.vsh.yaml @@ -9,7 +9,7 @@ functionality: v1: path: openproblems/tasks/_batch_integration/batch_integration_embed/methods/baseline.py commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - preferred_normalization: log_cpm + preferred_normalization: log_cp10k resources: - type: python_script path: script.py diff --git a/src/tasks/batch_integration/control_methods/random_embed_cell_jitter/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_embed_cell_jitter/config.vsh.yaml index 3e4a0fc924..faf4c6f702 100644 --- a/src/tasks/batch_integration/control_methods/random_embed_cell_jitter/config.vsh.yaml +++ b/src/tasks/batch_integration/control_methods/random_embed_cell_jitter/config.vsh.yaml @@ -9,7 +9,7 @@ functionality: v1: path: openproblems/tasks/_batch_integration/batch_integration_embed/methods/baseline.py commit: 
b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - preferred_normalization: log_cpm + preferred_normalization: log_cp10k arguments: - name: "--jitter" type: double diff --git a/src/tasks/batch_integration/control_methods/random_integration/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_integration/config.vsh.yaml index a9e0884ca5..9b43f82aea 100644 --- a/src/tasks/batch_integration/control_methods/random_integration/config.vsh.yaml +++ b/src/tasks/batch_integration/control_methods/random_integration/config.vsh.yaml @@ -9,7 +9,7 @@ functionality: v1: path: openproblems/tasks/_batch_integration/batch_integration_embed/methods/baseline.py commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - preferred_normalization: log_cpm + preferred_normalization: log_cp10k resources: - type: python_script path: script.py diff --git a/src/tasks/batch_integration/methods/bbknn/config.vsh.yaml b/src/tasks/batch_integration/methods/bbknn/config.vsh.yaml index 129bac5cbf..742616c743 100644 --- a/src/tasks/batch_integration/methods/bbknn/config.vsh.yaml +++ b/src/tasks/batch_integration/methods/bbknn/config.vsh.yaml @@ -15,12 +15,12 @@ functionality: documentation_url: "https://github.com/Teichlab/bbknn#readme" v1: path: openproblems/tasks/_batch_integration/batch_integration_graph/methods/bbknn.py - commit: 29803b95c88b4ec5921df2eec7111fd5d1a95daf - preferred_normalization: log_cpm + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k variants: bbknn_full_unscaled: bbknn_full_scaled: - preferred_normalization: log_cpm_scaled + preferred_normalization: log_cp10k_scaled resources: - type: python_script path: script.py diff --git a/src/tasks/batch_integration/methods/combat/config.vsh.yaml b/src/tasks/batch_integration/methods/combat/config.vsh.yaml index 4e01dfb1ec..0314e42438 100644 --- a/src/tasks/batch_integration/methods/combat/config.vsh.yaml +++ b/src/tasks/batch_integration/methods/combat/config.vsh.yaml @@ -18,12 +18,12 @@ functionality: documentation_url: "https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.combat.html" v1: path: openproblems/tasks/_batch_integration/batch_integration_graph/methods/combat.py - commit: 29803b95c88b4ec5921df2eec7111fd5d1a95daf - preferred_normalization: log_cpm + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k variants: combat_full_unscaled: combat_full_scaled: - preferred_normalization: log_cpm_scaled + preferred_normalization: log_cp10k_scaled resources: - type: python_script path: script.py diff --git a/src/tasks/batch_integration/methods/fastmnn/config.vsh.yaml b/src/tasks/batch_integration/methods/fastmnn/config.vsh.yaml index a20640b119..b1ea4bec9e 100644 --- a/src/tasks/batch_integration/methods/fastmnn/config.vsh.yaml +++ b/src/tasks/batch_integration/methods/fastmnn/config.vsh.yaml @@ -17,7 +17,7 @@ functionality: reference: "haghverdi2018batch" repository_url: "https://code.bioconductor.org/browse/batchelor/" documentation_url: "https://bioconductor.org/packages/batchelor/" - preferred_normalization: log_cpm + preferred_normalization: log_cp10k resources: - type: r_script path: script.R diff --git a/src/tasks/batch_integration/methods/mnn_correct/config.vsh.yaml b/src/tasks/batch_integration/methods/mnn_correct/config.vsh.yaml index 12c3b5ef52..15f30ec456 100644 --- a/src/tasks/batch_integration/methods/mnn_correct/config.vsh.yaml +++ b/src/tasks/batch_integration/methods/mnn_correct/config.vsh.yaml @@ -11,7 +11,7 @@ functionality: reference: "haghverdi2018batch" repository_url: 
"https://code.bioconductor.org/browse/batchelor/" documentation_url: "https://bioconductor.org/packages/batchelor/" - preferred_normalization: log_cpm + preferred_normalization: log_cp10k resources: - type: r_script path: script.R diff --git a/src/tasks/batch_integration/methods/mnnpy/config.vsh.yaml b/src/tasks/batch_integration/methods/mnnpy/config.vsh.yaml index 47123c7372..5fdf1f0a8b 100644 --- a/src/tasks/batch_integration/methods/mnnpy/config.vsh.yaml +++ b/src/tasks/batch_integration/methods/mnnpy/config.vsh.yaml @@ -17,11 +17,11 @@ functionality: v1: path: openproblems/tasks/_batch_integration/batch_integration_graph/methods/mnn.py commit: 29803b95c88b4ec5921df2eec7111fd5d1a95daf - preferred_normalization: log_cpm + preferred_normalization: log_cp10k variants: mnn_full_unscaled: mnn_full_scaled: - preferred_normalization: log_cpm_scaled + preferred_normalization: log_cp10k_scaled resources: - type: python_script path: script.py diff --git a/src/tasks/batch_integration/methods/scanorama_embed/config.vsh.yaml b/src/tasks/batch_integration/methods/scanorama_embed/config.vsh.yaml index ae4de238f1..654e8c6e25 100644 --- a/src/tasks/batch_integration/methods/scanorama_embed/config.vsh.yaml +++ b/src/tasks/batch_integration/methods/scanorama_embed/config.vsh.yaml @@ -13,12 +13,12 @@ functionality: documentation_url: "https://github.com/brianhie/scanorama#readme" v1: path: openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanorama.py - commit: 29803b95c88b4ec5921df2eec7111fd5d1a95daf - preferred_normalization: log_cpm + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k variants: scanorama_embed_full_unscaled: scanorama_embed_full_scaled: - preferred_normalization: log_cpm_scaled + preferred_normalization: log_cp10k_scaled resources: - type: python_script path: script.py diff --git a/src/tasks/batch_integration/methods/scanorama_feature/config.vsh.yaml b/src/tasks/batch_integration/methods/scanorama_feature/config.vsh.yaml index 43b5e10062..b144b0e788 100644 --- a/src/tasks/batch_integration/methods/scanorama_feature/config.vsh.yaml +++ b/src/tasks/batch_integration/methods/scanorama_feature/config.vsh.yaml @@ -13,12 +13,12 @@ functionality: documentation_url: "https://github.com/brianhie/scanorama#readme" v1: path: openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanorama.py - commit: 29803b95c88b4ec5921df2eec7111fd5d1a95daf - preferred_normalization: log_cpm + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k variants: scanorama_feature_full_unscaled: scanorama_feature_full_scaled: - preferred_normalization: log_cpm_scaled + preferred_normalization: log_cp10k_scaled resources: - type: python_script path: script.py diff --git a/src/tasks/batch_integration/methods/scanvi/config.vsh.yaml b/src/tasks/batch_integration/methods/scanvi/config.vsh.yaml index 82f75714b8..41182a651c 100644 --- a/src/tasks/batch_integration/methods/scanvi/config.vsh.yaml +++ b/src/tasks/batch_integration/methods/scanvi/config.vsh.yaml @@ -25,7 +25,7 @@ functionality: v1: path: openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanvi.py commit: 29803b95c88b4ec5921df2eec7111fd5d1a95daf - preferred_normalization: log_cpm + preferred_normalization: log_cp10k variants: scanvi_full_unscaled: resources: diff --git a/src/tasks/batch_integration/methods/scvi/config.vsh.yaml b/src/tasks/batch_integration/methods/scvi/config.vsh.yaml index 75f1bcf6e5..d1bf368aa8 100644 --- 
a/src/tasks/batch_integration/methods/scvi/config.vsh.yaml +++ b/src/tasks/batch_integration/methods/scvi/config.vsh.yaml @@ -12,8 +12,8 @@ functionality: documentation_url: "https://github.com/YosefLab/scvi-tools#readme" v1: path: openproblems/tasks/_batch_integration/batch_integration_graph/methods/scvi.py - commit: 29803b95c88b4ec5921df2eec7111fd5d1a95daf - preferred_normalization: log_cpm + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k variants: scvi_full_unscaled: resources: diff --git a/src/tasks/batch_integration/metrics/asw_batch/config.vsh.yaml b/src/tasks/batch_integration/metrics/asw_batch/config.vsh.yaml index dbf6d97f4d..f265b058d8 100644 --- a/src/tasks/batch_integration/metrics/asw_batch/config.vsh.yaml +++ b/src/tasks/batch_integration/metrics/asw_batch/config.vsh.yaml @@ -32,7 +32,7 @@ functionality: maximize: true v1: path: openproblems/tasks/_batch_integration/batch_integration_embed/metrics/sil_batch.py - commit: 29803b95c88b4ec5921df2eec7111fd5d1a95daf + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 resources: - type: python_script path: script.py diff --git a/src/tasks/batch_integration/metrics/asw_label/config.vsh.yaml b/src/tasks/batch_integration/metrics/asw_label/config.vsh.yaml index 50435d3ce6..6a5babce30 100644 --- a/src/tasks/batch_integration/metrics/asw_label/config.vsh.yaml +++ b/src/tasks/batch_integration/metrics/asw_label/config.vsh.yaml @@ -20,7 +20,7 @@ functionality: maximize: true v1: path: openproblems/tasks/_batch_integration/batch_integration_embed/metrics/silhouette.py - commit: 29803b95c88b4ec5921df2eec7111fd5d1a95daf + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 resources: - type: python_script path: script.py diff --git a/src/tasks/batch_integration/metrics/cell_cycle_conservation/config.vsh.yaml b/src/tasks/batch_integration/metrics/cell_cycle_conservation/config.vsh.yaml index 95fb0804d4..69849dfc4b 100644 --- a/src/tasks/batch_integration/metrics/cell_cycle_conservation/config.vsh.yaml +++ b/src/tasks/batch_integration/metrics/cell_cycle_conservation/config.vsh.yaml @@ -29,7 +29,7 @@ functionality: maximize: true v1: path: openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py - commit: 29803b95c88b4ec5921df2eec7111fd5d1a95daf + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 resources: - type: python_script path: script.py diff --git a/src/tasks/batch_integration/metrics/clustering_overlap/config.vsh.yaml b/src/tasks/batch_integration/metrics/clustering_overlap/config.vsh.yaml index 9e6558df6a..98ed7e3662 100644 --- a/src/tasks/batch_integration/metrics/clustering_overlap/config.vsh.yaml +++ b/src/tasks/batch_integration/metrics/clustering_overlap/config.vsh.yaml @@ -24,7 +24,7 @@ functionality: maximize: true v1: path: openproblems/tasks/_batch_integration/batch_integration_graph/metrics/ari.py - commit: 29803b95c88b4ec5921df2eec7111fd5d1a95daf + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - name: nmi label: NMI summary: "NMI compares overlap by scaling using mean entropy terms and optimizing Louvain clustering to obtain the best match between clusters and labels." 
@@ -43,7 +43,7 @@ functionality: maximize: true v1: path: openproblems/tasks/_batch_integration/batch_integration_graph/metrics/nmi.py - commit: 29803b95c88b4ec5921df2eec7111fd5d1a95daf + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 resources: - type: python_script path: script.py diff --git a/src/tasks/batch_integration/metrics/pcr/config.vsh.yaml b/src/tasks/batch_integration/metrics/pcr/config.vsh.yaml index 68704855a0..b043c2cd47 100644 --- a/src/tasks/batch_integration/metrics/pcr/config.vsh.yaml +++ b/src/tasks/batch_integration/metrics/pcr/config.vsh.yaml @@ -23,7 +23,7 @@ functionality: reference: luecken2022benchmarking v1: path: openproblems/tasks/_batch_integration/batch_integration_embed/metrics/pcr.py - commit: 29803b95c88b4ec5921df2eec7111fd5d1a95daf + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 min: 0 max: 1 maximize: true diff --git a/src/tasks/batch_integration/workflows/run/main.nf b/src/tasks/batch_integration/workflows/run/main.nf index d79e51a705..942878fd94 100644 --- a/src/tasks/batch_integration/workflows/run/main.nf +++ b/src/tasks/batch_integration/workflows/run/main.nf @@ -117,7 +117,7 @@ workflow run_wf { def pref = config.functionality.info.preferred_normalization // if the preferred normalisation is none at all, // we can pass whichever dataset we want - (norm == "log_cpm" && pref == "counts") || norm == pref + (norm == "log_cp10k" && pref == "counts") || norm == pref }, // define a new 'id' by appending the method name to the dataset id diff --git a/src/tasks/denoising/control_methods/no_denoising/config.vsh.yaml b/src/tasks/denoising/control_methods/no_denoising/config.vsh.yaml index f5267b9a22..f03199ab17 100644 --- a/src/tasks/denoising/control_methods/no_denoising/config.vsh.yaml +++ b/src/tasks/denoising/control_methods/no_denoising/config.vsh.yaml @@ -7,7 +7,7 @@ functionality: description: "This method serves as a negative control, where the denoised data is a copy of the unaltered training data. This represents the scoring threshold if denoising was not performed on the data." v1: path: openproblems/tasks/denoising/methods/baseline.py - commit: 29803b95c88b4ec5921df2eec7111fd5d1a95daf + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 variants: no_denoising: preferred_normalization: counts diff --git a/src/tasks/denoising/control_methods/perfect_denoising/config.vsh.yaml b/src/tasks/denoising/control_methods/perfect_denoising/config.vsh.yaml index b4d7f84cfe..27fcfa6953 100644 --- a/src/tasks/denoising/control_methods/perfect_denoising/config.vsh.yaml +++ b/src/tasks/denoising/control_methods/perfect_denoising/config.vsh.yaml @@ -7,7 +7,7 @@ functionality: description: "This method serves as a positive control, where the test data is copied 1-to-1 to the denoised data. This makes it seem as if the data is perfectly denoised as it will be compared to the test data in the metrics." 
v1: path: openproblems/tasks/denoising/methods/baseline.py - commit: 29803b95c88b4ec5921df2eec7111fd5d1a95daf + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 variants: perfect_denoising: preferred_normalization: counts diff --git a/src/tasks/denoising/methods/alra/config.vsh.yaml b/src/tasks/denoising/methods/alra/config.vsh.yaml index 96bf990a7d..82398c806d 100644 --- a/src/tasks/denoising/methods/alra/config.vsh.yaml +++ b/src/tasks/denoising/methods/alra/config.vsh.yaml @@ -18,7 +18,7 @@ functionality: documentation_url: https://github.com/KlugerLab/ALRA/blob/master/README.md v1: path: openproblems/tasks/denoising/methods/alra.py - commit: 29803b95c88b4ec5921df2eec7111fd5d1a95daf + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 variants: alra: preferred_normalization: counts diff --git a/src/tasks/denoising/methods/dca/config.vsh.yaml b/src/tasks/denoising/methods/dca/config.vsh.yaml index 125ee0e4a1..29c7b244ef 100644 --- a/src/tasks/denoising/methods/dca/config.vsh.yaml +++ b/src/tasks/denoising/methods/dca/config.vsh.yaml @@ -14,7 +14,7 @@ functionality: repository_url: "https://github.com/theislab/dca" v1: path: openproblems/tasks/denoising/methods/dca.py - commit: 29803b95c88b4ec5921df2eec7111fd5d1a95daf + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 variants: dca: preferred_normalization: counts diff --git a/src/tasks/denoising/methods/knn_smoothing/config.vsh.yaml b/src/tasks/denoising/methods/knn_smoothing/config.vsh.yaml index 92f35e3240..b573412828 100644 --- a/src/tasks/denoising/methods/knn_smoothing/config.vsh.yaml +++ b/src/tasks/denoising/methods/knn_smoothing/config.vsh.yaml @@ -20,7 +20,7 @@ functionality: repository_url: "https://github.com/yanailab/knn-smoothing" v1: path: openproblems/tasks/denoising/methods/knn_smoothing.py - commit: 29803b95c88b4ec5921df2eec7111fd5d1a95daf + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 variants: knn_smoothing: preferred_normalization: counts diff --git a/src/tasks/denoising/methods/magic/config.vsh.yaml b/src/tasks/denoising/methods/magic/config.vsh.yaml index 48c6044fef..d3d7122c1a 100644 --- a/src/tasks/denoising/methods/magic/config.vsh.yaml +++ b/src/tasks/denoising/methods/magic/config.vsh.yaml @@ -3,7 +3,7 @@ functionality: name: "magic" info: label: MAGIC - summary: "MAGIC imputes and denoises scRNA-seq data using Euclidean distances and a Gaussian kernel to calculate the affinity matrix, followed by a Markov process and multiplication with the normalised data to obtain imputed values." + summary: "MAGIC imputes and denoises scRNA-seq data that is noisy or dropout-prone." description: "MAGIC (Markov Affinity-based Graph Imputation of Cells) is a method for imputation and denoising of noisy or dropout-prone single cell RNA-sequencing data. 
Given a normalised scRNA-seq expression matrix, it first calculates @@ -20,7 +20,7 @@ functionality: repository_url: "https://github.com/KrishnaswamyLab/MAGIC" v1: path: openproblems/tasks/denoising/methods/magic.py - commit: 29803b95c88b4ec5921df2eec7111fd5d1a95daf + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 variants: magic: magic_approx: diff --git a/src/tasks/denoising/metrics/mse/config.vsh.yaml b/src/tasks/denoising/metrics/mse/config.vsh.yaml index 89dc75d285..9013183fe4 100644 --- a/src/tasks/denoising/metrics/mse/config.vsh.yaml +++ b/src/tasks/denoising/metrics/mse/config.vsh.yaml @@ -10,10 +10,10 @@ functionality: reference: batson2019molecular v1: path: openproblems/tasks/denoising/metrics/mse.py - commit: 29803b95c88b4ec5921df2eec7111fd5d1a95daf + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 maximize: false min: 0 - max: +inf + max: "+.inf" resources: - type: python_script path: script.py diff --git a/src/tasks/denoising/metrics/poisson/config.vsh.yaml b/src/tasks/denoising/metrics/poisson/config.vsh.yaml index 1ef35f9d76..367570e8de 100644 --- a/src/tasks/denoising/metrics/poisson/config.vsh.yaml +++ b/src/tasks/denoising/metrics/poisson/config.vsh.yaml @@ -2,7 +2,6 @@ __merge__: ../../api/comp_metric.yaml functionality: name: "poisson" info: - reference: "batson2019molecular" metrics: - name: poisson label: Poisson Loss @@ -12,10 +11,10 @@ functionality: reference: batson2019molecular v1: path: openproblems/tasks/denoising/metrics/poisson.py - commit: 29803b95c88b4ec5921df2eec7111fd5d1a95daf + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 maximize: false min: 0 - max: +inf + max: "+.inf" resources: - type: python_script path: script.py diff --git a/src/tasks/denoising/workflows/run/main.nf b/src/tasks/denoising/workflows/run/main.nf index 4b98ec7698..ed7585aa82 100644 --- a/src/tasks/denoising/workflows/run/main.nf +++ b/src/tasks/denoising/workflows/run/main.nf @@ -72,7 +72,7 @@ workflow run_wf { def pref = config.functionality.info.preferred_normalization // if the preferred normalisation is none at all, // we can pass whichever dataset we want - (norm == "log_cpm" && pref == "counts") || norm == pref + (norm == "log_cp10k" && pref == "counts") || norm == pref }, // define a new 'id' by appending the method name to the dataset id diff --git a/src/tasks/denoising/workflows/run/run_test.sh b/src/tasks/denoising/workflows/run/run_test.sh index e671b93965..f6f0e8884c 100755 --- a/src/tasks/denoising/workflows/run/run_test.sh +++ b/src/tasks/denoising/workflows/run/run_test.sh @@ -22,7 +22,7 @@ nextflow \ -c src/wf_utils/labels_ci.config \ --id pancreas \ --dataset_id pancreas \ - --normalization_id log_cpm \ + --normalization_id log_cp10k \ --input_train $DATASET_DIR/train.h5ad \ --input_test $DATASET_DIR/test.h5ad \ --output scores.tsv \ diff --git a/src/tasks/denoising/workflows/run/run_test_on_tower.sh b/src/tasks/denoising/workflows/run/run_test_on_tower.sh index 5634670594..912cd376dc 100644 --- a/src/tasks/denoising/workflows/run/run_test_on_tower.sh +++ b/src/tasks/denoising/workflows/run/run_test_on_tower.sh @@ -8,7 +8,7 @@ id: pancreas_subsample input_train: s3://openproblems-data/$DATASET_DIR/train.h5ad input_test: s3://openproblems-data/$DATASET_DIR/test.h5ad dataset_id: pancreas -normalization_id: log_cpm +normalization_id: log_cp10k output: scores.tsv publish_dir: s3://openproblems-nextflow/output_test/v2/denoising HERE diff --git a/src/tasks/dimensionality_reduction/control_methods/random_features/config.vsh.yaml 
b/src/tasks/dimensionality_reduction/control_methods/random_features/config.vsh.yaml
index 9cbb060c57..6fe1089de7 100644
--- a/src/tasks/dimensionality_reduction/control_methods/random_features/config.vsh.yaml
+++ b/src/tasks/dimensionality_reduction/control_methods/random_features/config.vsh.yaml
@@ -7,7 +7,7 @@ functionality:
     description: "This method serves as a negative control, where the data is randomly embedded into a two-dimensional space, with no attempt to preserve the original structure."
     v1:
       path: openproblems/tasks/dimensionality_reduction/methods/baseline.py
-      commit: 14d70b330cae09527a6d4c4e552db240601e31cf
+      commit: 80b37e7a6aa27df4436f400397564c01276817e0
     preferred_normalization: counts
     variants:
       random_features:
diff --git a/src/tasks/dimensionality_reduction/control_methods/spectral_features/config.vsh.yaml b/src/tasks/dimensionality_reduction/control_methods/spectral_features/config.vsh.yaml
new file mode 100644
index 0000000000..ae926ec5d0
--- /dev/null
+++ b/src/tasks/dimensionality_reduction/control_methods/spectral_features/config.vsh.yaml
@@ -0,0 +1,41 @@
+__merge__: ../../api/comp_control_method.yaml
+functionality:
+  name: "spectral_features"
+  info:
+    label: Spectral Features
+    summary: "Positive control using 1000-dimensional diffusion maps as an embedding."
+    description: "This serves as a positive control since it uses 1000-dimensional diffusion maps as an embedding"
+    v1:
+      path: openproblems/tasks/dimensionality_reduction/methods/baseline.py
+      commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32
+    preferred_normalization: log_cp10k
+    variants:
+      spectral_features:
+  arguments:
+    - name: "--n_comps"
+      type: integer
+      default: 1000
+      description: "Number of components to use for the embedding."
+    - name: t
+      type: integer
+      default: 1
+      description: "Number to power the eigenvalues by."
+    - name: n_retries
+      type: integer
+      default: 1
+      description: "Number of times to retry if the embedding fails, each time adding noise."
+  resources:
+    - type: python_script
+      path: /src/tasks/dimensionality_reduction/methods/diffusion_map/script.py
+platforms:
+  - type: docker
+    image: ghcr.io/openproblems-bio/base_python:1.0.1
+    setup:
+      - type: python
+        pypi:
+          - umap-learn
+          - scipy
+          - numpy
+  - type: nextflow
+    directives:
+      label: [ highmem, highcpu ]
diff --git a/src/tasks/dimensionality_reduction/control_methods/true_features/config.vsh.yaml b/src/tasks/dimensionality_reduction/control_methods/true_features/config.vsh.yaml
index 37fb6bac0e..74d7f248e5 100644
--- a/src/tasks/dimensionality_reduction/control_methods/true_features/config.vsh.yaml
+++ b/src/tasks/dimensionality_reduction/control_methods/true_features/config.vsh.yaml
@@ -7,35 +7,16 @@ functionality:
     description: "This serves as a positive control since the original high-dimensional data is retained as is, without any loss of information"
     v1:
       path: openproblems/tasks/dimensionality_reduction/methods/baseline.py
-      commit: 4a0ee9b3731ff10d8cd2e584726a61b502aef613
-    preferred_normalization: counts
+      commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32
+    preferred_normalization: log_cp10k
     variants:
       true_features:
-      true_features_log_cpm:
-        preferred_normalization: log_cpm
-        use_normalized_layer: true
-      true_features_log_cpm_hvg:
-        preferred_normalization: log_cpm
-        use_normalized_layer: true
-        n_hvg: 1000
-  arguments:
-    - name: "--use_normalized_layer"
-      type: boolean
-      default: false
-      description: Whether to work with the raw counts or the normalized counts.
-    - name: "--n_hvg"
-      type: integer
-      description: Number of highly variable genes to subset to. If not specified, the input matrix will not be subset.
-      default: 1000
   resources:
     - type: python_script
      path: script.py
 platforms:
   - type: docker
     image: ghcr.io/openproblems-bio/base_python:1.0.1
-    setup:
-      - type: python
-        packages: scanpy
   - type: nextflow
     directives:
       label: [ highmem, highcpu ]
diff --git a/src/tasks/dimensionality_reduction/control_methods/true_features/script.py b/src/tasks/dimensionality_reduction/control_methods/true_features/script.py
index aa8469051c..1a58cd4984 100644
--- a/src/tasks/dimensionality_reduction/control_methods/true_features/script.py
+++ b/src/tasks/dimensionality_reduction/control_methods/true_features/script.py
@@ -4,8 +4,6 @@
 par = {
     "input": "resources_test/dimensionality_reduction/pancreas/test.h5ad",
     "output": "reduced.h5ad",
-    "n_hvg": 100,
-    "use_normalized_layer": False
 }
 meta = {
     "functionality_name": "true_features",
@@ -16,15 +14,7 @@
 input = ad.read_h5ad(par["input"])
 
 print("Create high dimensionally embedding with all features", flush=True)
-if par["use_normalized_layer"]:
-    X_emb = input.layers["counts"].toarray()
-else:
-    X_emb = input.layers["normalized"].toarray()
-
-if par["n_hvg"]:
-    print(f"Select top {par['n_hvg']} high variable genes", flush=True)
-    idx = input.var["hvg_score"].to_numpy().argsort()[::-1][:par["n_hvg"]]
-    X_emb = X_emb[:, idx]
+X_emb = input.layers["normalized"].toarray()
 
 print("Create output AnnData", flush=True)
 output = ad.AnnData(
diff --git a/src/tasks/dimensionality_reduction/methods/densmap/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/densmap/config.vsh.yaml
index cfb1ccd926..626110cd9a 100644
--- a/src/tasks/dimensionality_reduction/methods/densmap/config.vsh.yaml
+++ b/src/tasks/dimensionality_reduction/methods/densmap/config.vsh.yaml
@@ -10,15 +10,15 @@ functionality:
     documentation_url: https://github.com/lmcinnes/umap#readme
     v1:
       path: openproblems/tasks/dimensionality_reduction/methods/umap.py
-      commit: 14d70b330cae09527a6d4c4e552db240601e31cf
-    preferred_normalization: log_cpm
+      commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32
+    preferred_normalization: log_cp10k
     variants:
-      densmap_logCPM:
-      densmap_pca_logCPM:
+      densmap_logCP10k:
+      densmap_pca_logCP10k:
         n_pca_dims: 50
-      densmap_logCPM_1kHVG:
+      densmap_logCP10k_1kHVG:
         n_hvg: 1000
-      densmap_pca_logCPM_1kHVG:
+      densmap_pca_logCP10k_1kHVG:
         n_pca_dims: 50
         n_hvg: 1000
   arguments:
diff --git a/src/tasks/dimensionality_reduction/methods/diffusion_map/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/diffusion_map/config.vsh.yaml
new file mode 100644
index 0000000000..643a7b8bed
--- /dev/null
+++ b/src/tasks/dimensionality_reduction/methods/diffusion_map/config.vsh.yaml
@@ -0,0 +1,44 @@
+__merge__: ../../api/comp_method.yaml
+functionality:
+  name: "diffusion_map"
+  info:
+    label: Diffusion maps
+    summary: "Embeds cells using a diffusion map computed on the nearest-neighbor graph of the data."
+    description: "Diffusion maps compute an embedding from the top eigenvectors of a graph Laplacian built on the UMAP connectivity graph of the normalized data"
+    reference: coifman2006diffusion
+    documentation_url: https://github.com/openproblems-bio/openproblems
+    repository_url: https://github.com/openproblems-bio/openproblems
+    v1:
+      path: openproblems/tasks/dimensionality_reduction/methods/diffusion_map.py
+      commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32
+    preferred_normalization: log_cp10k
+    variants:
+      diffusion_map:
+  arguments:
+    - name: "--n_comps"
+      type: integer
+      default: 2
+      description: "Number of components to use for the embedding."
+    - name: t
+      type: integer
+      default: 1
+      description: "Number to power the eigenvalues by."
+    - name: n_retries
+      type: integer
+      default: 1
+      description: "Number of times to retry if the embedding fails, each time adding noise."
+  resources:
+    - type: python_script
+      path: script.py
+platforms:
+  - type: docker
+    image: ghcr.io/openproblems-bio/base_python:1.0.1
+    setup:
+      - type: python
+        pypi:
+          - umap-learn
+          - scipy
+          - numpy
+  - type: nextflow
+    directives:
+      label: [ highmem, highcpu ]
diff --git a/src/tasks/dimensionality_reduction/methods/diffusion_map/script.py b/src/tasks/dimensionality_reduction/methods/diffusion_map/script.py
new file mode 100644
index 0000000000..cf8633120c
--- /dev/null
+++ b/src/tasks/dimensionality_reduction/methods/diffusion_map/script.py
@@ -0,0 +1,77 @@
+import anndata as ad
+import umap
+
+## VIASH START
+par = {
+    "input": "resources_test/dimensionality_reduction/pancreas/test.h5ad",
+    "output": "reduced.h5ad",
+    "n_comps": 2,
+}
+meta = {
+    "functionality_name": "foo",
+}
+## VIASH END
+
+def diffusion_map(graph, n_comps, t, n_retries):
+    import numpy as np
+    import scipy.sparse.linalg
+
+    diag_data = np.asarray(graph.sum(axis=0))
+    identity = scipy.sparse.identity(graph.shape[0], dtype=np.float64)
+    diag = scipy.sparse.spdiags(
+        1.0 / np.sqrt(diag_data), 0, graph.shape[0], graph.shape[0]
+    )
+    laplacian = identity - diag * graph * diag
+    num_lanczos_vectors = max(2 * n_comps + 1, int(np.sqrt(graph.shape[0])))
+    try:
+        eigenvalues, eigenvectors = scipy.sparse.linalg.eigsh(
+            laplacian,
+            n_comps,
+            which="SM",
+            ncv=num_lanczos_vectors,
+            tol=1e-4,
+            v0=np.ones(laplacian.shape[0]),
+            maxiter=graph.shape[0] * 5,
+        )
+        return (eigenvalues**t) * eigenvectors
+    except scipy.sparse.linalg.ArpackNoConvergence:
+        if n_retries > 0:
+            # add some noise and try again
+            graph_rand = graph.copy().tocoo()
+            graph_rand.row = np.random.choice(
+                graph_rand.shape[0], len(graph_rand.row), replace=True
+            )
+            graph_rand.data *= 0.01
+            return diffusion_map(
+                graph + graph_rand, n_comps, t, n_retries=n_retries - 1
+            )
+        else:
+            raise
+
+print("Load input data", flush=True)
+input = ad.read_h5ad(par["input"])
+
+print("Compute diffusion map embedding", flush=True)
+
+n_comps = min(par["n_comps"], min(input.shape) - 2)
+
+graph = umap.UMAP(transform_mode="graph").fit_transform(input.layers["normalized"])
+
+X_emb = diffusion_map(graph, n_comps, t=par["t"], n_retries=par["n_retries"])
+
+
+print("Create output AnnData", flush=True)
+output = ad.AnnData(
+    obs=input.obs[[]],
+    obsm={
+        "X_emb": X_emb
+    },
+    uns={
+        "dataset_id": input.uns["dataset_id"],
+        "normalization_id": input.uns["normalization_id"],
+        "method_id": meta["functionality_name"]
+    }
+)
+
+print("Write output to file", flush=True)
+output.write_h5ad(par["output"], compression="gzip")
\ No newline at end of file
diff --git
a/src/tasks/dimensionality_reduction/methods/ivis/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/ivis/config.vsh.yaml index 4d57c4df9d..c22d2d1fd6 100644 --- a/src/tasks/dimensionality_reduction/methods/ivis/config.vsh.yaml +++ b/src/tasks/dimensionality_reduction/methods/ivis/config.vsh.yaml @@ -17,8 +17,8 @@ functionality: documentation_url: "https://github.com/beringresearch/ivis#readme" v1: path: openproblems/tasks/dimensionality_reduction/methods/ivis.py - commit: 9ebb777b3b76337e731a3b99f4bf39462a15c4cc - preferred_normalization: log_cpm + commit: 93d2161a08da3edf249abedff5111fb5ce527552 + preferred_normalization: log_cp10k variants: ivis_logCPM_1kHVG: arguments: diff --git a/src/tasks/dimensionality_reduction/methods/neuralee/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/neuralee/config.vsh.yaml index 6911b450a2..34e13c8c41 100644 --- a/src/tasks/dimensionality_reduction/methods/neuralee/config.vsh.yaml +++ b/src/tasks/dimensionality_reduction/methods/neuralee/config.vsh.yaml @@ -18,13 +18,13 @@ functionality: documentation_url: "https://github.com/HiBearME/NeuralEE#readme" v1: path: openproblems/tasks/dimensionality_reduction/methods/neuralee.py - commit: 14d70b330cae09527a6d4c4e552db240601e31cf - preferred_normalization: log_cpm + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k variants: neuralee_default: normalize: true n_hvg: 500 - neuralee_logCPM_1kHVG: + neuralee_logCP10k_1kHVG: normalize: false n_hvg: 1000 arguments: diff --git a/src/tasks/dimensionality_reduction/methods/pca/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/pca/config.vsh.yaml index 7ae19d13e9..5ca15443c4 100644 --- a/src/tasks/dimensionality_reduction/methods/pca/config.vsh.yaml +++ b/src/tasks/dimensionality_reduction/methods/pca/config.vsh.yaml @@ -16,11 +16,11 @@ functionality: documentation_url: "https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html" v1: path: openproblems/tasks/dimensionality_reduction/methods/pca.py - commit: 14d70b330cae09527a6d4c4e552db240601e31cf - preferred_normalization: log_cpm + commit: 154ccb9fd99113f3d28d9c3f139194539a0290f9 + preferred_normalization: log_cp10k variants: - pca_logCPM: - pca_logCPM_1kHVG: + pca_logCP10k: + pca_logCP10k_1kHVG: n_hvg: 1000 arguments: - name: "--n_hvg" diff --git a/src/tasks/dimensionality_reduction/methods/phate/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/phate/config.vsh.yaml index d69b8cc6f2..57b0e0eeac 100644 --- a/src/tasks/dimensionality_reduction/methods/phate/config.vsh.yaml +++ b/src/tasks/dimensionality_reduction/methods/phate/config.vsh.yaml @@ -18,17 +18,17 @@ functionality: documentation_url: "https://github.com/KrishnaswamyLab/PHATE#readme" v1: path: openproblems/tasks/dimensionality_reduction/methods/phate.py - commit: 14d70b330cae09527a6d4c4e552db240601e31cf - preferred_normalization: sqrt_cpm + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: sqrt_cp10k variants: phate_default: phate_sqrt: gamma: 0 - phate_logCPM: - preferred_normalization: log_cpm - phate_logCPM_1kHVG: + phate_logCP10k: + preferred_normalization: log_cp10k + phate_logCP10k_1kHVG: n_hvg: 1000 - preferred_normalization: log_cpm + preferred_normalization: log_cp10k arguments: - name: '--n_pca_dims' type: integer diff --git a/src/tasks/dimensionality_reduction/methods/tsne/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/tsne/config.vsh.yaml index 0da62a6a83..1b3e9ca9f4 100644 --- 
a/src/tasks/dimensionality_reduction/methods/tsne/config.vsh.yaml +++ b/src/tasks/dimensionality_reduction/methods/tsne/config.vsh.yaml @@ -16,11 +16,11 @@ functionality: documentation_url: "https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html#sklearn.manifold.TSNE" v1: path: openproblems/tasks/dimensionality_reduction/methods/tsne.py - commit: 14d70b330cae09527a6d4c4e552db240601e31cf - preferred_normalization: log_cpm + commit: 154ccb9fd99113f3d28d9c3f139194539a0290f9 + preferred_normalization: log_cp10k variants: - tsne_logCPM: - tsne_logCPM_1kHVG: + tsne_logCP10k: + tsne_logCP10k_1kHVG: n_hvg: 1000 arguments: - name: "--n_hvg" diff --git a/src/tasks/dimensionality_reduction/methods/umap/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/umap/config.vsh.yaml index 1aff2d0c2c..ddced67815 100644 --- a/src/tasks/dimensionality_reduction/methods/umap/config.vsh.yaml +++ b/src/tasks/dimensionality_reduction/methods/umap/config.vsh.yaml @@ -16,14 +16,14 @@ functionality: v1: path: openproblems/tasks/dimensionality_reduction/methods/umap.py commit: 14d70b330cae09527a6d4c4e552db240601e31cf - preferred_normalization: log_cpm + preferred_normalization: log_cp10k variants: - umap_logCPM: - umap_pca_logCPM: + umap_logCP10k: + umap_pca_logCP10k: n_pca_dims: 50 - umap_logCPM_1kHVG: + umap_logCP10k_1kHVG: n_hvg: 1000 - umap_pca_logCPM_1kHVG: + umap_pca_logCP10k_1kHVG: n_pca_dims: 50 n_hvg: 1000 arguments: diff --git a/src/tasks/dimensionality_reduction/metrics/coranking/config.vsh.yaml b/src/tasks/dimensionality_reduction/metrics/coranking/config.vsh.yaml index 552b50fd04..a4cc208ba3 100644 --- a/src/tasks/dimensionality_reduction/metrics/coranking/config.vsh.yaml +++ b/src/tasks/dimensionality_reduction/metrics/coranking/config.vsh.yaml @@ -17,7 +17,7 @@ functionality: maximize: true v1: path: openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py - commit: 14d70b330cae09527a6d4c4e552db240601e31cf + commit: e3be930c6d4bbd656ab1e656badb52bb50e6cdd6 note: | The original v1 implementations consisted of a lot of helper functions which were derived from the pyDRMetrics package. This version uses the coRanking package @@ -38,7 +38,7 @@ functionality: maximize: true v1: path: openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py - commit: 14d70b330cae09527a6d4c4e552db240601e31cf + commit: e3be930c6d4bbd656ab1e656badb52bb50e6cdd6 note: | The original v1 implementations consisted of a lot of helper functions which were derived from the pyDRMetrics package. This version uses the coRanking package @@ -59,7 +59,7 @@ functionality: maximize: true v1: path: openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py - commit: 14d70b330cae09527a6d4c4e552db240601e31cf + commit: e3be930c6d4bbd656ab1e656badb52bb50e6cdd6 note: | The original v1 implementations consisted of a lot of helper functions which were derived from the pyDRMetrics package. This version uses the coRanking package @@ -80,7 +80,7 @@ functionality: maximize: true v1: path: openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py - commit: 14d70b330cae09527a6d4c4e552db240601e31cf + commit: e3be930c6d4bbd656ab1e656badb52bb50e6cdd6 note: | The original v1 implementations consisted of a lot of helper functions which were derived from the pyDRMetrics package. 
This version uses the coRanking package @@ -101,7 +101,7 @@ functionality: maximize: true v1: path: openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py - commit: 14d70b330cae09527a6d4c4e552db240601e31cf + commit: e3be930c6d4bbd656ab1e656badb52bb50e6cdd6 note: | The original v1 implementations consisted of a lot of helper functions which were derived from the pyDRMetrics package. This version uses the coRanking package @@ -122,7 +122,7 @@ functionality: maximize: true v1: path: openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py - commit: 14d70b330cae09527a6d4c4e552db240601e31cf + commit: e3be930c6d4bbd656ab1e656badb52bb50e6cdd6 note: | The original v1 implementations consisted of a lot of helper functions which were derived from the pyDRMetrics package. This version uses the coRanking package @@ -143,7 +143,7 @@ functionality: maximize: true v1: path: openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py - commit: 14d70b330cae09527a6d4c4e552db240601e31cf + commit: e3be930c6d4bbd656ab1e656badb52bb50e6cdd6 note: | The original v1 implementations consisted of a lot of helper functions which were derived from the pyDRMetrics package. This version uses the coRanking package diff --git a/src/tasks/dimensionality_reduction/metrics/coranking/library.bib b/src/tasks/dimensionality_reduction/metrics/coranking/library.bib deleted file mode 100644 index 5ecdb67e51..0000000000 --- a/src/tasks/dimensionality_reduction/metrics/coranking/library.bib +++ /dev/null @@ -1,62 +0,0 @@ - -@misc{lueks2011evaluate, - doi = {10.48550/ARXIV.1110.3917}, - url = {https://arxiv.org/abs/1110.3917}, - author = {Lueks, Wouter and Mokbel, Bassam and Biehl, Michael and Hammer, Barbara}, - keywords = {Machine Learning (cs.LG), Information Retrieval (cs.IR), FOS: Computer and information sciences, FOS: Computer and information sciences}, - title = {How to Evaluate Dimensionality Reduction? - Improving the Co-ranking Matrix}, - publisher = {arXiv}, - year = {2011}, - copyright = {arXiv.org perpetual, non-exclusive license} -} -@article{kraemer2018dimred, - doi = {10.32614/rj-2018-039}, - url = {https://doi.org/10.32614/rj-2018-039}, - year = {2018}, - publisher = {The R Foundation}, - volume = {10}, - number = {1}, - pages = {342}, - author = {Guido Kraemer and Markus Reichstein and Miguel, D. Mahecha}, - title = {{dimRed} and {coRanking} - Unifying Dimensionality Reduction in R}, - journal = {The R Journal} -} -@article{chen2009local, - doi = {10.1198/jasa.2009.0111}, - url = {https://doi.org/10.1198/jasa.2009.0111}, - year = {2009}, - month = mar, - publisher = {Informa {UK} Limited}, - volume = {104}, - number = {485}, - pages = {209--219}, - author = {Lisha Chen and Andreas Buja}, - title = {Local Multidimensional Scaling for Nonlinear Dimension Reduction, Graph Drawing, and Proximity Analysis}, - journal = {Journal of the American Statistical Association} -} -@article{lee2009quality, - doi = {10.1016/j.neucom.2008.12.017}, - url = {https://doi.org/10.1016/j.neucom.2008.12.017}, - year = {2009}, - month = mar, - publisher = {Elsevier {BV}}, - volume = {72}, - number = {7-9}, - pages = {1431--1443}, - author = {John A. 
Lee and Michel Verleysen}, - title = {Quality assessment of dimensionality reduction: Rank-based criteria}, - journal = {Neurocomputing} -} -@article{venna2006local, - doi = {10.1016/j.neunet.2006.05.014}, - url = {https://doi.org/10.1016/j.neunet.2006.05.014}, - year = {2006}, - month = jul, - publisher = {Elsevier {BV}}, - volume = {19}, - number = {6-7}, - pages = {889--899}, - author = {Jarkko Venna and Samuel Kaski}, - title = {Local multidimensional scaling}, - journal = {Neural Networks} -} \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/metrics/density_preservation/config.vsh.yaml b/src/tasks/dimensionality_reduction/metrics/density_preservation/config.vsh.yaml index 91d10dcf43..ed671faedd 100644 --- a/src/tasks/dimensionality_reduction/metrics/density_preservation/config.vsh.yaml +++ b/src/tasks/dimensionality_reduction/metrics/density_preservation/config.vsh.yaml @@ -15,7 +15,16 @@ functionality: maximize: true v1: path: openproblems/tasks/dimensionality_reduction/metrics/density.py - commit: 29803b95c88b4ec5921df2eec7111fd5d1a95daf + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + arguments: + - name: "--n_neighbors" + type: integer + default: 30 + description: "Number of neighbors to use for density estimation." + - name: "--seed" + type: integer + default: 42 + description: "Random seed." resources: - type: python_script path: script.py diff --git a/src/tasks/dimensionality_reduction/metrics/density_preservation/script.py b/src/tasks/dimensionality_reduction/metrics/density_preservation/script.py index 9cae4d1f12..9bf44397c2 100644 --- a/src/tasks/dimensionality_reduction/metrics/density_preservation/script.py +++ b/src/tasks/dimensionality_reduction/metrics/density_preservation/script.py @@ -11,6 +11,8 @@ "input_embedding": "resources_test/dimensionality_reduction/pancreas/reduced.h5ad", "input_solution": "resources_test/dimensionality_reduction/pancreas/test.h5ad", "output": "score.h5ad", + "n_neighbors": 30, + "seed": 42, } ## VIASH END @@ -84,27 +86,22 @@ def compute_density_preservation( return 0.0 print("Compute local radii in original data", flush=True) - _, ro, _ = UMAP( - n_neighbors=_K, - random_state=_SEED, - densmap=True, - output_dens=True - ).fit_transform(high_dim) + ro = _calculate_radii( + high_dim, + n_neighbors=n_neighbors, + random_state=random_state + ) print("Compute local radii of embedding", flush=True) re = _calculate_radii( X_emb, - n_neighbors=_K, - random_state=_SEED + n_neighbors=n_neighbors, + random_state=random_state ) print("Compute pearson correlation", flush=True) return pearsonr(ro, re)[0] -# number of neighbors -_K = 30 -# Fix seed -_SEED = 42 print("Load data", flush=True) input_solution = ad.read_h5ad(par["input_solution"]) @@ -116,8 +113,8 @@ def compute_density_preservation( density_preservation = compute_density_preservation( X_emb=X_emb, high_dim=high_dim, - n_neighbors=_K, - random_state=_SEED + n_neighbors=par["n_neighbors"], + random_state=par["seed"] ) print("Create output AnnData object", flush=True) diff --git a/src/tasks/dimensionality_reduction/metrics/distance_correlation/config.vsh.yaml b/src/tasks/dimensionality_reduction/metrics/distance_correlation/config.vsh.yaml new file mode 100644 index 0000000000..7e30f9efbe --- /dev/null +++ b/src/tasks/dimensionality_reduction/metrics/distance_correlation/config.vsh.yaml @@ -0,0 +1,49 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: distance_correlation + info: + metrics: + - name: distance_correlation + label: Distance Correlation + 
summary: "Calculates the distance correlation by computing Spearman correlations between distances." + description: "Calculates the distance correlation by computing Spearman correlations between distances on the full (or processed) data matrix and the dimensionally-reduced matrix." + reference: kruskal1964mds + min: 0 + max: "+.inf" + maximize: false + v1: + path: openproblems/tasks/dimensionality_reduction/metrics/distance_correlation.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + note: This metric was ported but will probably be removed soon. + - name: distance_correlation_spectral + label: Distance Correlation Spectral + summary: "Spearman correlation between all pairwise diffusion distances in the original and dimension-reduced data." + description: "Spearman correlation between all pairwise diffusion distances in the original and dimension-reduced data." + reference: coifman2006diffusion + min: 0 + max: "+.inf" + maximize: false + v1: + path: openproblems/tasks/dimensionality_reduction/metrics/root_mean_square_error.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + note: This metric was ported but will probably be removed soon. + arguments: + - name: "--spectral" + type: boolean_true + description: Calculate the spectral root mean squared error. + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_python:1.0.1 + setup: + - type: python + packages: + - umap-learn + - scikit-learn + - numpy + - scipy + - type: nextflow + directives: + label: [ midmem, midcpu ] diff --git a/src/tasks/dimensionality_reduction/metrics/rmse/script.py b/src/tasks/dimensionality_reduction/metrics/distance_correlation/script.py similarity index 70% rename from src/tasks/dimensionality_reduction/metrics/rmse/script.py rename to src/tasks/dimensionality_reduction/metrics/distance_correlation/script.py index 4b33fe02ce..d461f271b4 100644 --- a/src/tasks/dimensionality_reduction/metrics/rmse/script.py +++ b/src/tasks/dimensionality_reduction/metrics/distance_correlation/script.py @@ -1,7 +1,7 @@ import anndata as ad import numpy as np import sklearn.decomposition -import scipy.optimize +import scipy.stats import scipy.spatial from sklearn.metrics import pairwise_distances import umap @@ -15,13 +15,13 @@ } ## VIASH END -def _rmse(X, X_emb): +def _distance_correlation(X, X_emb): high_dimensional_distance_vector = scipy.spatial.distance.pdist(X) low_dimensional_distance_vector = scipy.spatial.distance.pdist(X_emb) - _, rmse = scipy.optimize.nnls( - low_dimensional_distance_vector[:, None], high_dimensional_distance_vector + corr = scipy.stats.spearmanr( + low_dimensional_distance_vector, high_dimensional_distance_vector ) - return rmse + return corr print("Load data", flush=True) input_solution = ad.read_h5ad(par["input_solution"]) @@ -31,17 +31,18 @@ def _rmse(X, X_emb): X_emb = input_embedding.obsm["X_emb"] print("Compute NNLS residual after SVD", flush=True) -n_svd = 200 +n_svd = 500 svd_emb = sklearn.decomposition.TruncatedSVD(n_svd).fit_transform(high_dim) -rmse = _rmse(svd_emb, X_emb) +dist_corr = _distance_correlation(svd_emb, X_emb) +#! Explicitly not changing it to use diffusion map method as this will have a positive effect on the diffusion map method for this specific metric. 
print("Compute NLSS residual after spectral embedding", flush=True) -n_comps = min(200, min(input_solution.shape) - 2) +n_comps = min(1000, min(input_solution.shape) - 2) umap_graph = umap.UMAP(transform_mode="graph").fit_transform(high_dim) spectral_emb = umap.spectral.spectral_layout( high_dim, umap_graph, n_comps, random_state=np.random.default_rng() ) -rmse_spectral = _rmse(spectral_emb, X_emb) +dist_corr_spectral = _distance_correlation(spectral_emb, X_emb) print("Create output AnnData object", flush=True) output = ad.AnnData( @@ -49,8 +50,8 @@ def _rmse(X, X_emb): "dataset_id": input_solution.uns["dataset_id"], "normalization_id": input_solution.uns["normalization_id"], "method_id": input_embedding.uns["method_id"], - "metric_ids": [ "rmse", "rmse_spectral" ], - "metric_values": [ rmse, rmse_spectral ] + "metric_ids": [ "distance correlation", "distance_correlation_spectral" ], + "metric_values": [ dist_corr, dist_corr_spectral ] } ) diff --git a/src/tasks/dimensionality_reduction/metrics/rmse/config.vsh.yaml b/src/tasks/dimensionality_reduction/metrics/rmse/config.vsh.yaml deleted file mode 100644 index 5874ffb3c1..0000000000 --- a/src/tasks/dimensionality_reduction/metrics/rmse/config.vsh.yaml +++ /dev/null @@ -1,45 +0,0 @@ -__merge__: ../../api/comp_metric.yaml -functionality: - name: "rmse" - info: - metrics: - - name: rmse - label: RMSE - summary: "The residual after applying the Non-Negative Least Squares solver on the pairwise distances of an SVD." - description: "The residual after applying the Non-Negative Least Squares solver on the pairwise distances of an SVD." - reference: kruskal1964mds - min: 0 - max: "+.inf" - maximize: false - - name: rmse_spectral - label: RMSE Spectral - summary: "The residual after applying the Non-Negative Least Squares solver on the pairwise distances of a spectral embedding." - description: "The residual after applying the Non-Negative Least Squares solver on the pairwise distances of a spectral embedding." - reference: coifman2006diffusion - min: 0 - max: "+.inf" - maximize: false - v1: - path: openproblems/tasks/dimensionality_reduction/metrics/root_mean_square_error.py - commit: b353a462f6ea353e0fc43d0f9fcbbe621edc3a0b - note: This metric was ported but will probably be removed soon. - arguments: - - name: "--spectral" - type: boolean_true - description: Calculate the spectral root mean squared error. - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.1 - setup: - - type: python - packages: - - umap-learn - - scikit-learn - - numpy - - scipy - - type: nextflow - directives: - label: [ midmem, midcpu ] diff --git a/src/tasks/dimensionality_reduction/metrics/trustworthiness/config.vsh.yaml b/src/tasks/dimensionality_reduction/metrics/trustworthiness/config.vsh.yaml index ce65fc8b60..b56012ae74 100644 --- a/src/tasks/dimensionality_reduction/metrics/trustworthiness/config.vsh.yaml +++ b/src/tasks/dimensionality_reduction/metrics/trustworthiness/config.vsh.yaml @@ -13,7 +13,7 @@ functionality: maximize: true v1: path: openproblems/tasks/dimensionality_reduction/metrics/trustworthiness.py - commit: c2470ce02e6f196267cec1c554ba7ae389c0956a + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 note: This metric is already included in the 'coranking' component and can be removed. 
resources: - type: python_script diff --git a/src/tasks/dimensionality_reduction/resources_test_scripts/pancreas.sh b/src/tasks/dimensionality_reduction/resources_test_scripts/pancreas.sh index c208311399..165181ca72 100755 --- a/src/tasks/dimensionality_reduction/resources_test_scripts/pancreas.sh +++ b/src/tasks/dimensionality_reduction/resources_test_scripts/pancreas.sh @@ -8,7 +8,7 @@ REPO_ROOT=$(git rev-parse --show-toplevel) # ensure that the command below is run from the root of the repository cd "$REPO_ROOT" -RAW_DATA=resources_test/common/pancreas/dataset.h5ad +RAW_DATA=resources_test/common/pancreas/cp10k_dataset.h5ad DATASET_DIR=resources_test/dimensionality_reduction/pancreas if [ ! -f $RAW_DATA ]; then @@ -46,7 +46,7 @@ nextflow \ -profile docker \ --id pancreas \ --dataset_id pancreas \ - --normalization_id log_cpm \ + --normalization_id log_cp10k \ --input $DATASET_DIR/dataset.h5ad \ --input_solution $DATASET_DIR/solution.h5ad \ --output scores.tsv \ diff --git a/src/tasks/dimensionality_reduction/workflows/run/main.nf b/src/tasks/dimensionality_reduction/workflows/run/main.nf index 9dd5b10231..6d0913191f 100644 --- a/src/tasks/dimensionality_reduction/workflows/run/main.nf +++ b/src/tasks/dimensionality_reduction/workflows/run/main.nf @@ -80,7 +80,7 @@ workflow run_wf { def pref = config.functionality.info.preferred_normalization // if the preferred normalisation is none at all, // we can pass whichever dataset we want - (norm == "log_cpm" && pref == "counts") || norm == pref + (norm == "log_cp10k" && pref == "counts") || norm == pref }, // define a new 'id' by appending the method name to the dataset id diff --git a/src/tasks/dimensionality_reduction/workflows/run/run_test.sh b/src/tasks/dimensionality_reduction/workflows/run/run_test.sh index 3aeeb58baa..299f8accf8 100755 --- a/src/tasks/dimensionality_reduction/workflows/run/run_test.sh +++ b/src/tasks/dimensionality_reduction/workflows/run/run_test.sh @@ -21,7 +21,7 @@ nextflow \ -resume \ --id pancreas \ --dataset_id pancreas \ - --normalization_id log_cpm \ + --normalization_id log_cp10k \ --input $DATASET_DIR/dataset.h5ad \ --input_solution $DATASET_DIR/solution.h5ad \ --output scores.tsv \ diff --git a/src/tasks/dimensionality_reduction/workflows/run/run_test_on_tower.sh b/src/tasks/dimensionality_reduction/workflows/run/run_test_on_tower.sh index befcde4d49..f2ff994080 100644 --- a/src/tasks/dimensionality_reduction/workflows/run/run_test_on_tower.sh +++ b/src/tasks/dimensionality_reduction/workflows/run/run_test_on_tower.sh @@ -8,7 +8,7 @@ id: pancreas_subsample input: s3://openproblems-data/$DATASET_DIR/dataset.h5ad input_solution: s3://openproblems-data/$DATASET_DIR/solution.h5ad dataset_id: pancreas -normalization_id: log_cpm +normalization_id: log_cp10k output: scores.tsv publish_dir: s3://openproblems-nextflow/output_test/v2/dimensionality_reduction HERE diff --git a/src/tasks/label_projection/control_methods/majority_vote/config.vsh.yaml b/src/tasks/label_projection/control_methods/majority_vote/config.vsh.yaml index 6cd01534c4..53142aaf9e 100644 --- a/src/tasks/label_projection/control_methods/majority_vote/config.vsh.yaml +++ b/src/tasks/label_projection/control_methods/majority_vote/config.vsh.yaml @@ -7,7 +7,7 @@ functionality: description: "A control-type method that predicts all cells to belong to the most abundant cell type in the dataset" v1: path: openproblems/tasks/label_projection/methods/baseline.py - commit: b460ecb183328c857cbbf653488f522a4034a61c + commit: 
b3456fd73c04c28516f6df34c57e6e3e8b0dab32 variants: majority_vote: preferred_normalization: counts diff --git a/src/tasks/label_projection/control_methods/random_labels/config.vsh.yaml b/src/tasks/label_projection/control_methods/random_labels/config.vsh.yaml index 014ee5249d..dc95a42468 100644 --- a/src/tasks/label_projection/control_methods/random_labels/config.vsh.yaml +++ b/src/tasks/label_projection/control_methods/random_labels/config.vsh.yaml @@ -7,7 +7,7 @@ functionality: description: "A negative control, where the labels are randomly predicted without training the data." v1: path: openproblems/tasks/label_projection/methods/baseline.py - commit: b460ecb183328c857cbbf653488f522a4034a61c + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 preferred_normalization: counts variants: random_labels: diff --git a/src/tasks/label_projection/control_methods/true_labels/config.vsh.yaml b/src/tasks/label_projection/control_methods/true_labels/config.vsh.yaml index ef313a16ee..384c2cf92e 100644 --- a/src/tasks/label_projection/control_methods/true_labels/config.vsh.yaml +++ b/src/tasks/label_projection/control_methods/true_labels/config.vsh.yaml @@ -7,7 +7,7 @@ functionality: description: "A positive control, where the solution labels are copied 1 to 1 to the predicted data." v1: path: openproblems/tasks/label_projection/methods/baseline.py - commit: b460ecb183328c857cbbf653488f522a4034a61c + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 preferred_normalization: counts variants: true_labels: diff --git a/src/tasks/label_projection/methods/knn/config.vsh.yaml b/src/tasks/label_projection/methods/knn/config.vsh.yaml index 0841b7ebe4..12445bedd0 100644 --- a/src/tasks/label_projection/methods/knn/config.vsh.yaml +++ b/src/tasks/label_projection/methods/knn/config.vsh.yaml @@ -17,10 +17,10 @@ functionality: documentation_url: "https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html" v1: path: openproblems/tasks/label_projection/methods/knn_classifier.py - commit: c2470ce02e6f196267cec1c554ba7ae389c0956a - preferred_normalization: log_cpm + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k variants: - knn_classifier_log_cpm: + knn_classifier_log_cp10k: knn_classifier_scran: preferred_normalization: log_scran_pooling resources: diff --git a/src/tasks/label_projection/methods/logistic_regression/config.vsh.yaml b/src/tasks/label_projection/methods/logistic_regression/config.vsh.yaml index 8deac18a99..990b8cf368 100644 --- a/src/tasks/label_projection/methods/logistic_regression/config.vsh.yaml +++ b/src/tasks/label_projection/methods/logistic_regression/config.vsh.yaml @@ -14,10 +14,10 @@ functionality: documentation_url: "https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html" v1: path: openproblems/tasks/label_projection/methods/logistic_regression.py - commit: c2470ce02e6f196267cec1c554ba7ae389c0956a - preferred_normalization: log_cpm + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k variants: - logistic_regression_log_cpm: + logistic_regression_log_cp10k: logistic_regression_scran: preferred_normalization: log_scran_pooling resources: diff --git a/src/tasks/label_projection/methods/mlp/config.vsh.yaml b/src/tasks/label_projection/methods/mlp/config.vsh.yaml index 8ec1f9cbf0..8046a01e95 100644 --- a/src/tasks/label_projection/methods/mlp/config.vsh.yaml +++ b/src/tasks/label_projection/methods/mlp/config.vsh.yaml @@ -17,10 +17,10 @@ functionality: 
documentation_url: "https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html" v1: path: openproblems/tasks/label_projection/methods/mlp.py - commit: c2470ce02e6f196267cec1c554ba7ae389c0956a - preferred_normalization: log_cpm + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k variants: - mlp_log_cpm: + mlp_log_cp10k: mlp_scran: preferred_normalization: log_scran_pooling arguments: diff --git a/src/tasks/label_projection/methods/scanvi/config.vsh.yaml b/src/tasks/label_projection/methods/scanvi/config.vsh.yaml index f765b07c98..5cbc8fb3a4 100644 --- a/src/tasks/label_projection/methods/scanvi/config.vsh.yaml +++ b/src/tasks/label_projection/methods/scanvi/config.vsh.yaml @@ -17,8 +17,8 @@ functionality: documentation_url: https://scarches.readthedocs.io/en/latest/scanvi_surgery_pipeline.html v1: path: openproblems/tasks/label_projection/methods/scvi_tools.py - commit: 4bb8a7e04545a06c336d3d9364a1dd84fa2af1a4 - preferred_normalization: log_cpm + commit: e3be930c6d4bbd656ab1e656badb52bb50e6cdd6 + preferred_normalization: log_cp10k variants: scanvi_all_genes: scanvi_hvg: diff --git a/src/tasks/label_projection/methods/scanvi_scarches/config.vsh.yaml b/src/tasks/label_projection/methods/scanvi_scarches/config.vsh.yaml index 56662a542c..38df609144 100644 --- a/src/tasks/label_projection/methods/scanvi_scarches/config.vsh.yaml +++ b/src/tasks/label_projection/methods/scanvi_scarches/config.vsh.yaml @@ -17,8 +17,12 @@ functionality: documentation_url: https://docs.scvi-tools.org repository_url: https://github.com/scverse/scvi-tools preferred_normalization: counts + v1: + path: openproblems/tasks/label_projection/methods/scvi_tools.py + commit: e3be930c6d4bbd656ab1e656badb52bb50e6cdd6 variants: scanvi_scarches: + #! 
TODO: add other scanvi_scarches variants # Component-specific parameters (optional) arguments: diff --git a/src/tasks/label_projection/methods/seurat_transferdata/config.vsh.yaml b/src/tasks/label_projection/methods/seurat_transferdata/config.vsh.yaml index b30629f6b5..045819ba47 100644 --- a/src/tasks/label_projection/methods/seurat_transferdata/config.vsh.yaml +++ b/src/tasks/label_projection/methods/seurat_transferdata/config.vsh.yaml @@ -18,8 +18,8 @@ functionality: documentation_url: "https://satijalab.org/seurat/articles/integration_mapping.html" v1: path: openproblems/tasks/label_projection/methods/seurat.py - commit: 3f19f0e87a8bc8b59c7521ba01917580aff81bc8 - preferred_normalization: log_cpm + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k variants: seurat: resources: diff --git a/src/tasks/label_projection/methods/xgboost/config.vsh.yaml b/src/tasks/label_projection/methods/xgboost/config.vsh.yaml index 2234967a79..c37e7611f9 100644 --- a/src/tasks/label_projection/methods/xgboost/config.vsh.yaml +++ b/src/tasks/label_projection/methods/xgboost/config.vsh.yaml @@ -14,10 +14,10 @@ functionality: documentation_url: "https://xgboost.readthedocs.io/en/stable/index.html" v1: path: openproblems/tasks/label_projection/methods/xgboost.py - commit: 123bb7b39c51c58e19ddf0fbbc1963c3dffde14c - preferred_normalization: log_cpm + commit: e3be930c6d4bbd656ab1e656badb52bb50e6cdd6 + preferred_normalization: log_cp10k variants: - xgboost_log_cpm: + xgboost_log_cp10k: xgboost_scran: preferred_normalization: log_scran_pooling resources: diff --git a/src/tasks/label_projection/metrics/accuracy/config.vsh.yaml b/src/tasks/label_projection/metrics/accuracy/config.vsh.yaml index 9414a5eaad..11674fde5c 100644 --- a/src/tasks/label_projection/metrics/accuracy/config.vsh.yaml +++ b/src/tasks/label_projection/metrics/accuracy/config.vsh.yaml @@ -13,7 +13,7 @@ functionality: reference: grandini2020metrics v1: path: openproblems/tasks/label_projection/metrics/accuracy.py - commit: fcd5b876e7d0667da73a2858bc27c40224e19f65 + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 resources: - type: python_script path: script.py diff --git a/src/tasks/label_projection/metrics/f1/config.vsh.yaml b/src/tasks/label_projection/metrics/f1/config.vsh.yaml index f78f4c8bba..ec6eece949 100644 --- a/src/tasks/label_projection/metrics/f1/config.vsh.yaml +++ b/src/tasks/label_projection/metrics/f1/config.vsh.yaml @@ -13,7 +13,7 @@ functionality: maximize: true v1: path: openproblems/tasks/label_projection/metrics/f1.py - commit: bb16ca05ae1ce20ce59bfa7a879641b9300df6b0 + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - name: f1_macro label: F1 macro summary: "Unweighted mean of each label F1-score" @@ -24,7 +24,7 @@ functionality: maximize: true v1: path: openproblems/tasks/label_projection/metrics/f1.py - commit: bb16ca05ae1ce20ce59bfa7a879641b9300df6b0 + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - name: f1_micro label: F1 micro summary: "Calculation of TP, FN and FP." 
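# [editor's note, not part of the patch] The three F1 variants defined in this config differ
# only in scikit-learn's `average` argument; a minimal sketch, assuming label vectors `y_true`
# and `y_pred` as produced by the prediction methods:
from sklearn.metrics import f1_score
f1_weighted = f1_score(y_true, y_pred, average="weighted")  # per-class F1, weighted by class frequency
f1_macro = f1_score(y_true, y_pred, average="macro")        # unweighted mean of per-class F1
f1_micro = f1_score(y_true, y_pred, average="micro")        # F1 from global TP, FP and FN counts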
@@ -35,7 +35,7 @@ functionality: maximize: true v1: path: openproblems/tasks/label_projection/metrics/f1.py - commit: bb16ca05ae1ce20ce59bfa7a879641b9300df6b0 + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 resources: - type: python_script path: script.py diff --git a/src/tasks/label_projection/resources_test_scripts/pancreas.sh b/src/tasks/label_projection/resources_test_scripts/pancreas.sh index d9780a4425..bb3b687ba1 100755 --- a/src/tasks/label_projection/resources_test_scripts/pancreas.sh +++ b/src/tasks/label_projection/resources_test_scripts/pancreas.sh @@ -9,7 +9,7 @@ REPO_ROOT=$(git rev-parse --show-toplevel) # ensure that the command below is run from the root of the repository cd "$REPO_ROOT" -RAW_DATA=resources_test/common/pancreas/dataset.h5ad +RAW_DATA=resources_test/common/pancreas/cp10k_dataset.h5ad DATASET_DIR=resources_test/label_projection/pancreas if [ ! -f $RAW_DATA ]; then @@ -49,7 +49,7 @@ nextflow \ -resume \ --id pancreas \ --dataset_id pancreas \ - --normalization_id log_cpm \ + --normalization_id log_cp10k \ --input_train $DATASET_DIR/train.h5ad \ --input_test $DATASET_DIR/test.h5ad \ --input_solution $DATASET_DIR/solution.h5ad \ diff --git a/src/tasks/label_projection/workflows/run/main.nf b/src/tasks/label_projection/workflows/run/main.nf index bd54498e0c..d6f5146440 100644 --- a/src/tasks/label_projection/workflows/run/main.nf +++ b/src/tasks/label_projection/workflows/run/main.nf @@ -85,7 +85,7 @@ workflow run_wf { def pref = config.functionality.info.preferred_normalization // if the preferred normalisation is none at all, // we can pass whichever dataset we want - (norm == "log_cpm" && pref == "counts") || norm == pref + (norm == "log_cp10k" && pref == "counts") || norm == pref }, // define a new 'id' by appending the method name to the dataset id diff --git a/src/tasks/label_projection/workflows/run/run_test.sh b/src/tasks/label_projection/workflows/run/run_test.sh index b31c9ae4ac..a909381666 100755 --- a/src/tasks/label_projection/workflows/run/run_test.sh +++ b/src/tasks/label_projection/workflows/run/run_test.sh @@ -21,7 +21,7 @@ nextflow \ -resume \ --id pancreas \ --dataset_id pancreas \ - --normalization_id log_cpm \ + --normalization_id log_cp10k \ --input_train $DATASET_DIR/train.h5ad \ --input_test $DATASET_DIR/test.h5ad \ --input_solution $DATASET_DIR/solution.h5ad \ diff --git a/src/tasks/label_projection/workflows/run/run_test_on_tower.sh b/src/tasks/label_projection/workflows/run/run_test_on_tower.sh index 27c7ee8e3d..cce0f3d89f 100644 --- a/src/tasks/label_projection/workflows/run/run_test_on_tower.sh +++ b/src/tasks/label_projection/workflows/run/run_test_on_tower.sh @@ -9,7 +9,7 @@ input_train: s3://openproblems-data/$DATASET_DIR/train.h5ad input_test: s3://openproblems-data/$DATASET_DIR/test.h5ad input_solution: s3://openproblems-data/$DATASET_DIR/solution.h5ad dataset_id: pancreas -normalization_id: log_cpm +normalization_id: log_cp10k output: scores.tsv publish_dir: s3://openproblems-nextflow/output_test/v2/label_projection HERE