MRCIEU · mvab · Nov 29, 2021 · Dec 2, 2021 · Dec 2, 2021 · Dec 2, 2021
diff --git a/ADDING_DATA_DETAILED_GUIDE.md b/ADDING_DATA_DETAILED_GUIDE.md
@@ -56,7 +56,7 @@ There are 3 steps:
 
 #### Prep
 * The graph build has to happen on jojo (or other server)
-* (source bashrc) and conda activate neo4j_build
+* (`source ~/.bashrc`) and `conda activate neo4j_build`
 * Make a folder in `workflow/source_data/FOLDER`
 * Modify `DATA_DIR` in `.env` when running the source script to point at the local source_data folder
 
@@ -66,19 +66,20 @@ There are 3 steps:
 snakemake -r clean_all -j 1
 snakemake -r all -j 4
 ```
+If the build fails because of missing data, manually run the scripts that generate those datasets to identify the problem (e.g. a new data version has extra columns), then fix it so that the missing data is produced.
 
 #### Step 2
 
-Assuming scripts and ymls are created and locally testes, run: 
+Assuming scripts and ymls are created and locally tested, run: 
 
 ```
 # run source script
 python -m workflow.scripts.source.SOURCE_SCRIPT
 
-# run processing script
-python -m workflow.scripts.processing.rels.PROCESSING_SCRIPT -n (rel name in data_integration.yml) -d workflow/source_data/
+# run processing script (-d is optional if DATA_DIR in .env is set to a local path)
+python -m workflow.scripts.processing.rels.PROCESSING_SCRIPT -n (name in data_integration.yml) -d workflow/source_data/
 
-# check new data
+# check new data (prints only a short, uninformative message, as if nothing happened)
 snakemake -r check_new_data -j 10
 ```
 

diff --git a/workflow/scripts/processing/nodes/literature/semrep-biorxiv.py b/workflow/scripts/processing/nodes/literature/semrep-biorxiv.py
@@ -63,6 +63,7 @@ def merge_data(lit_data, sem_data):
             "license",
             "category",
             "abstract",
+            'jatsxml',
         ],
         axis=1,
         inplace=True,

diff --git a/workflow/scripts/processing/nodes/literature/semrep-medrxiv.py b/workflow/scripts/processing/nodes/literature/semrep-medrxiv.py
@@ -63,6 +63,7 @@ def merge_data(lit_data, sem_data):
             "license",
             "category",
             "abstract",
+            'jatsxml',
         ],
         axis=1,
         inplace=True,

diff --git a/workflow/scripts/processing/rels/opentargets_drug_disease.py b/workflow/scripts/processing/rels/opentargets_drug_disease.py
@@ -22,43 +22,53 @@
 
 FILE = get_source(meta_id,1)
 
-def get_disease_data():    
+def get_disease_data():
+    # Query the graph for all mondo_ids and for the mondo_id/efo_id pairs separately, then merge them on the mondo_id column.
+    # Two separate queries are needed because the efo column has to be unwound, and UNWIND
+    # keeps only the rows where the operation is performed, so every mondo_id with a missing efo value would be lost.
+
     driver = neo4j_connect()
     session = driver.session()
-    query = """
-        match (d:Disease) unwind(d.efo) as mondo_efo_id return d.id as disease_id, mondo_efo_id;
-    """
-    query_data = session.run(query).data()
-    df = pd.json_normalize(query_data)
-    logger.info(df)
-    return df
+
+    query1 = """ match (d:Disease) return d.id as mondo_id """
+    query_data1 = session.run(query1).data()
+    mondo_only = pd.json_normalize(query_data1)
+
+    query2 = """match (d:Disease) unwind(d.efo) as efo_id return d.id as mondo_id , efo_id"""
+    query_data2 = session.run(query2).data()
+    mondo_w_efo = pd.json_normalize(query_data2)
+    mondo_w_efo['efo_id'] = 'http://www.ebi.ac.uk/efo/EFO_' + mondo_w_efo['efo_id'].astype(str)
+
+    disease_df = pd.merge(mondo_only, mondo_w_efo, how='outer', on='mondo_id')
+    disease_df = disease_df.drop_duplicates()
+    return disease_df
 
 
 def process():
     data = os.path.join(dataDir, FILE)
     # not sure why double quotes weren't being handled properly, added engine param
     df = pd.read_csv(data, sep=",", engine="python")
+    df = df.rename(columns={"efo_id": "disease_id"})
     logger.info(df.shape)
     logger.info("\n {}", df.head())
+    df = df[["molecule_name", "disease_id"]]
 
-    #get disease data 
+    # get disease data from the graph
     disease_df = get_disease_data()
-    disease_df['mondo_efo_id'] = 'http://www.ebi.ac.uk/efo/EFO_'+disease_df['mondo_efo_id'].astype(str)
     logger.info(disease_df)
 
-    keep_cols = [
-        "molecule_name",
-        "efo_id",
-    ]
-    df = df[keep_cols]
-
-    mondo_match = pd.merge(df,disease_df,left_on='efo_id',right_on='disease_id')[['molecule_name','disease_id']]
-    #logger.info(mondo_match)
+    # join df (OT data) and disease_df (graph) on mondo_id
+    mondo_match = pd.merge(df, disease_df, left_on='disease_id', right_on='mondo_id')[['molecule_name', 'disease_id']]
+    mondo_match.drop_duplicates(inplace=True)
+    # logger.info(mondo_match)
 
-    efo_match = pd.merge(df,disease_df,left_on='efo_id',right_on='mondo_efo_id')[['molecule_name','disease_id']]
-    #logger.info(efo_match)
+    # join on efo_id, but keep the corresponding mondo_id, as this is what is used for mapping
+    efo_match = pd.merge(df, disease_df, left_on='disease_id', right_on='efo_id')[['molecule_name', 'mondo_id']]
+    efo_match = efo_match.rename(columns={"mondo_id": "disease_id"})
+    efo_match.drop_duplicates(inplace=True)
+    # logger.info(efo_match)
 
-    cat_df = pd.concat([mondo_match,efo_match])
+    cat_df = pd.concat([mondo_match, efo_match])
     logger.info(cat_df.shape)
 
     cat_df.drop_duplicates(inplace=True)
@@ -68,7 +78,7 @@ def process():
     cat_df.columns = col_names
     cat_df["source"] = cat_df["source"].str.upper()
 
-    create_import(df=cat_df, meta_id=meta_id)
+    create_import(df=cat_df, meta_id=args.name)
 
 if __name__ == "__main__":
     process()