From d5a833774b1488fb7e1f0650692aab2c3f753144 Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Mon, 11 Sep 2023 09:33:54 +0200 Subject: [PATCH] feat: also adding gene-to-phen edges from HPO (#9) --- .github/workflows/main.yml | 3 ++- cada_prio/cli.py | 9 +++++++- cada_prio/train_model.py | 22 +++++++++++++------ .../train_smoke/genes_to_phenotype.head.txt | 3 +++ 4 files changed, 28 insertions(+), 9 deletions(-) create mode 100644 tests/data/train_smoke/genes_to_phenotype.head.txt diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index b989a05..3bceeae 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -94,6 +94,7 @@ jobs: pip freeze - name: Run tests - run: pytest + run: | + pytest -vvv --capture=no - uses: codecov/codecov-action@v3 diff --git a/cada_prio/cli.py b/cada_prio/cli.py index b8dff99..03103d1 100644 --- a/cada_prio/cli.py +++ b/cada_prio/cli.py @@ -30,6 +30,7 @@ def cli(ctx: click.Context, verbose: bool): required=True, ) @click.option("--path-hpo-obo", type=str, help="path HPO OBO file", required=True) +@click.option("--cpus", type=int, help="number of CPUs to use", default=1) @click.pass_context def cli_train_model( ctx: click.Context, @@ -38,11 +39,17 @@ def cli_train_model( path_gene_hpo_links: str, path_hpo_genes_to_phenotype: str, path_hpo_obo: str, + cpus: int, ): """train model""" ctx.ensure_object(dict) train_model.run( - path_out, path_hgnc_json, path_gene_hpo_links, path_hpo_genes_to_phenotype, path_hpo_obo + path_out, + path_hgnc_json, + path_gene_hpo_links, + path_hpo_genes_to_phenotype, + path_hpo_obo, + cpus, ) diff --git a/cada_prio/train_model.py b/cada_prio/train_model.py index 0975855..f3da458 100644 --- a/cada_prio/train_model.py +++ b/cada_prio/train_model.py @@ -220,16 +220,18 @@ class EmbeddingParams: min_count: int = 1 #: Set the batch_words in the fitting batch_words: int = 4 - #: Number of workers threads to use - workers: int = 4 -def build_and_fit_model(clinvar_gen2phen, hpo_ontology): +def build_and_fit_model(*, clinvar_gen2phen, hpo_gen2phen, hpo_ontology, cpus: int = 1): # create graph edges combining HPO hierarchy and training edges from ClinVar logger.info("Constructing training graph ...") logger.info("- building edges ...") training_edges = list( - itertools.chain(yield_hpo_edges(hpo_ontology), yield_gene2phen_edges(clinvar_gen2phen)) + itertools.chain( + yield_hpo_edges(hpo_ontology), + yield_gene2phen_edges(hpo_gen2phen), + yield_gene2phen_edges(clinvar_gen2phen), + ) ) logger.info("- graph construction") training_graph = nx.Graph() @@ -246,7 +248,7 @@ def build_and_fit_model(clinvar_gen2phen, hpo_ontology): num_walks=embedding_params.num_walks, p=embedding_params.p, q=embedding_params.q, - workers=embedding_params.workers, + workers=cpus, ) logger.info("- fitting model") model = embedding.fit( @@ -292,15 +294,21 @@ def run( path_gene_hpo_links: str, path_hpo_genes_to_phenotype: str, path_hpo_obo: str, + cpus: int = 1, ): # load all data ncbi_to_hgnc, hgnc_info = load_hgnc_info(path_hgnc_json) clinvar_gen2phen = load_clinvar_gen2phen(path_gene_hpo_links) hpo_gen2phen = load_hpo_gen2phen(path_hpo_genes_to_phenotype, ncbi_to_hgnc) hpo_ontology, hpo_id_from_alt, hpo_id_to_name = load_hpo_ontology(path_hpo_obo) - _, _, _ = hpo_gen2phen, hpo_id_from_alt, hpo_id_to_name + _, _ = hpo_id_from_alt, hpo_id_to_name # build and fit model - training_graph, model = build_and_fit_model(clinvar_gen2phen, hpo_ontology) + training_graph, model = build_and_fit_model( + clinvar_gen2phen=clinvar_gen2phen, + hpo_gen2phen=hpo_gen2phen, + hpo_ontology=hpo_ontology, + cpus=cpus, + ) # write out graph and model write_graph_and_model(path_out, hgnc_info, training_graph, model) diff --git a/tests/data/train_smoke/genes_to_phenotype.head.txt b/tests/data/train_smoke/genes_to_phenotype.head.txt new file mode 100644 index 0000000..0ab248b --- /dev/null +++ b/tests/data/train_smoke/genes_to_phenotype.head.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbb22923696a1e11c05c25eeb54fc627d53952a5545ab9b443b70bbae1b6e140 +size 703