From d5a833774b1488fb7e1f0650692aab2c3f753144 Mon Sep 17 00:00:00 2001
From: Manuel Holtgrewe <manuel.holtgrewe@bih-charite.de>
Date: Mon, 11 Sep 2023 09:33:54 +0200
Subject: [PATCH] feat: also add gene-to-phenotype edges from HPO (#9)

---
 .github/workflows/main.yml                    |  3 ++-
 cada_prio/cli.py                              |  9 +++++++-
 cada_prio/train_model.py                      | 22 +++++++++++++------
 .../train_smoke/genes_to_phenotype.head.txt   |  3 +++
 4 files changed, 28 insertions(+), 9 deletions(-)
 create mode 100644 tests/data/train_smoke/genes_to_phenotype.head.txt

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index b989a05..3bceeae 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -94,6 +94,7 @@ jobs:
             pip freeze
 
       - name: Run tests
-        run: pytest
+        run: |
+          pytest -vvv --capture=no
 
       - uses: codecov/codecov-action@v3
diff --git a/cada_prio/cli.py b/cada_prio/cli.py
index b8dff99..03103d1 100644
--- a/cada_prio/cli.py
+++ b/cada_prio/cli.py
@@ -30,6 +30,7 @@ def cli(ctx: click.Context, verbose: bool):
     required=True,
 )
 @click.option("--path-hpo-obo", type=str, help="path HPO OBO file", required=True)
+@click.option("--cpus", type=int, help="number of CPUs to use", default=1)
 @click.pass_context
 def cli_train_model(
     ctx: click.Context,
@@ -38,11 +39,17 @@ def cli_train_model(
     path_gene_hpo_links: str,
     path_hpo_genes_to_phenotype: str,
     path_hpo_obo: str,
+    cpus: int,
 ):
     """train model"""
     ctx.ensure_object(dict)
     train_model.run(
-        path_out, path_hgnc_json, path_gene_hpo_links, path_hpo_genes_to_phenotype, path_hpo_obo
+        path_out,
+        path_hgnc_json,
+        path_gene_hpo_links,
+        path_hpo_genes_to_phenotype,
+        path_hpo_obo,
+        cpus,
     )
 
 
diff --git a/cada_prio/train_model.py b/cada_prio/train_model.py
index 0975855..f3da458 100644
--- a/cada_prio/train_model.py
+++ b/cada_prio/train_model.py
@@ -220,16 +220,18 @@ class EmbeddingParams:
     min_count: int = 1
     #: Set the batch_words in the fitting
     batch_words: int = 4
-    #: Number of workers threads to use
-    workers: int = 4
 
 
-def build_and_fit_model(clinvar_gen2phen, hpo_ontology):
+def build_and_fit_model(*, clinvar_gen2phen, hpo_gen2phen, hpo_ontology, cpus: int = 1):
     # create graph edges combining HPO hierarchy and training edges from ClinVar
     logger.info("Constructing training graph ...")
     logger.info("- building edges ...")
     training_edges = list(
-        itertools.chain(yield_hpo_edges(hpo_ontology), yield_gene2phen_edges(clinvar_gen2phen))
+        itertools.chain(
+            yield_hpo_edges(hpo_ontology),
+            yield_gene2phen_edges(hpo_gen2phen),
+            yield_gene2phen_edges(clinvar_gen2phen),
+        )
     )
     logger.info("- graph construction")
     training_graph = nx.Graph()
@@ -246,7 +248,7 @@ def build_and_fit_model(clinvar_gen2phen, hpo_ontology):
         num_walks=embedding_params.num_walks,
         p=embedding_params.p,
         q=embedding_params.q,
-        workers=embedding_params.workers,
+        workers=cpus,
     )
     logger.info("- fitting model")
     model = embedding.fit(
@@ -292,15 +294,21 @@ def run(
     path_gene_hpo_links: str,
     path_hpo_genes_to_phenotype: str,
     path_hpo_obo: str,
+    cpus: int = 1,
 ):
     # load all data
     ncbi_to_hgnc, hgnc_info = load_hgnc_info(path_hgnc_json)
     clinvar_gen2phen = load_clinvar_gen2phen(path_gene_hpo_links)
     hpo_gen2phen = load_hpo_gen2phen(path_hpo_genes_to_phenotype, ncbi_to_hgnc)
     hpo_ontology, hpo_id_from_alt, hpo_id_to_name = load_hpo_ontology(path_hpo_obo)
-    _, _, _ = hpo_gen2phen, hpo_id_from_alt, hpo_id_to_name
+    _, _ = hpo_id_from_alt, hpo_id_to_name
 
     # build and fit model
-    training_graph, model = build_and_fit_model(clinvar_gen2phen, hpo_ontology)
+    training_graph, model = build_and_fit_model(
+        clinvar_gen2phen=clinvar_gen2phen,
+        hpo_gen2phen=hpo_gen2phen,
+        hpo_ontology=hpo_ontology,
+        cpus=cpus,
+    )
     # write out graph and model
     write_graph_and_model(path_out, hgnc_info, training_graph, model)
diff --git a/tests/data/train_smoke/genes_to_phenotype.head.txt b/tests/data/train_smoke/genes_to_phenotype.head.txt
new file mode 100644
index 0000000..0ab248b
--- /dev/null
+++ b/tests/data/train_smoke/genes_to_phenotype.head.txt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fbb22923696a1e11c05c25eeb54fc627d53952a5545ab9b443b70bbae1b6e140
+size 703