Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Curate additional DOID-MESH mappings and contribute to DOID #104

Merged
merged 16 commits into from
Oct 3, 2022
141 changes: 141 additions & 0 deletions scripts/add_mesh_xrefs_to_doid_owl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
"""This script adds newly inferred cross-references for DOID.

These are directly added to the version controlled DOID OWL file.
"""
import csv

import obonet

from biomappings import load_mappings

# Hard-coded locations inside a HumanDiseaseOntology working copy: the OBO
# export that is read, the editable OWL file that is rewritten in place, and
# the TSV that receives the rows of added mappings.
OBO_PATH = "/Users/ben/src/HumanDiseaseOntology/src/ontology/HumanDO.obo"
EDITABLE_OWL_PATH = "/Users/ben/src/HumanDiseaseOntology/src/ontology/doid-edit.owl"
REVIEW_PATH = "/Users/ben/src/HumanDiseaseOntology/doid_mesh_review.tsv"

# Load the DOID ontology from the main branch of its GitHub repository
g = obonet.read_obo(
    "https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology"
    "/main/src/ontology/HumanDO.obo"
)


def _is_class_header(line, node_owl):
    """Return True if ``line`` is the ``# Class:`` header of exactly ``node_owl``."""
    header = "# Class: obo:%s" % node_owl
    if not line.startswith(header):
        return False
    # The character right after the ID must not continue the identifier,
    # otherwise DOID_123 would also match the header of DOID_1234.
    following = line[len(header) : len(header) + 1]
    return not (following.isalnum() or following == "_")


def add_xref(lines, node, xref):
    """Add an xref annotation for a class to the appropriate place in the OWL lines.

    The class block is located through its ``# Class: obo:<ID>`` header line.
    If the block already contains ``oboInOwl:hasDbXref`` annotations, the new
    one is placed among them at the position it takes when the existing
    entries plus the new one are sorted. Otherwise it is placed three lines
    below the header, i.e. after the blank line and the line that follows it
    (expected to be the definition).

    :param lines: Lines of the OWL file, each ending in a newline. The list is
        modified in place.
    :param node: Identifier of the class with a colon, e.g. ``DOID:1234``.
    :param xref: The cross-reference to add, e.g. ``MESH:D000001``.
    :returns: The same ``lines`` list. It is left unchanged if the exact xref
        line is already present in the class block.
    :raises ValueError: If no header line for ``node`` is found in ``lines``.
    """
    node_owl = node.replace(":", "_")
    xref_prefix = "AnnotationAssertion(oboInOwl:hasDbXref"
    look_for_xref = False
    seen_blank = False
    start_xref_idx = None
    def_idx = None
    xref_entries = []
    for idx, line in enumerate(lines):
        # First, find the class with the given ID and start looking for xrefs
        if not look_for_xref:
            if _is_class_header(line, node_owl):
                look_for_xref = True
                def_idx = idx + 2
            continue
        # There is always a blank line right after the header, and another one
        # after the block for the entry ends, so the second blank line we see
        # marks the end of the block.
        if not line.strip():
            if seen_blank:
                break
            seen_blank = True
            continue
        # If we find some xrefs, we keep track of those
        if line.startswith(xref_prefix + " obo"):
            if start_xref_idx is None:
                start_xref_idx = idx
            xref_entries.append(line)
        # If we found any xrefs but now there is a different line, we finish
        elif start_xref_idx is not None and not line.startswith(xref_prefix):
            break

    if def_idx is None:
        raise ValueError("No class header found for %s" % node)

    # If we never found any existing xrefs then we will put the new xref
    # after the definition
    if start_xref_idx is None:
        start_xref_idx = def_idx + 1

    xref_str = 'AnnotationAssertion(oboInOwl:hasDbXref obo:%s "%s"^^xsd:string)\n' % (
        node_owl,
        xref,
    )
    # Do not write the same assertion twice, e.g. when the script is re-run
    if xref_str in xref_entries:
        return lines

    # Sort the xrefs alphabetically to find where the new one belongs
    xref_entries.append(xref_str)
    xr_idx = sorted(xref_entries).index(xref_str)
    lines.insert(start_xref_idx + xr_idx, xref_str)
    return lines


if __name__ == "__main__":
    # There are some curations that are redundant since DOID already mapped
    # these nodes to MESH. We figure out what these are so we can avoid
    # adding them.
    doid_already_mapped = set()
    g = obonet.read_obo(OBO_PATH)
    for node, data in g.nodes(data=True):
        # Skip external entries
        if not node.startswith("DOID"):
            continue
        # Make sure we have a name
        if "name" not in data:
            continue
        # Only the prefix before the first colon matters here; partition()
        # also tolerates an xref without any colon instead of raising
        xref_prefixes = {xref.partition(":")[0] for xref in data.get("xref", [])}
        # If there are already MESH mappings, we keep track of that
        if "MESH" in xref_prefixes:
            doid_already_mapped.add(node)

    # We now load mappings curated in Biomappings. They are materialized as a
    # list because they are iterated twice below.
    mappings = list(load_mappings())
    doid_mappings = [
        (m["source identifier"], m["target identifier"], m)
        for m in mappings
        if (
            m["source prefix"] == "doid"
            and m["target prefix"] == "mesh"
            and m["source identifier"] not in doid_already_mapped
        )
    ]
    # Make sure we get and standardize the order of mappings in both directions
    doid_mappings += [
        (m["target identifier"], m["source identifier"], m)
        for m in mappings
        if (
            m["source prefix"] == "mesh"
            and m["target prefix"] == "doid"
            and m["target identifier"] not in doid_already_mapped
        )
    ]

    # Read the OWL file; the encoding is explicit so the result does not
    # depend on the locale of the machine running the script
    with open(EDITABLE_OWL_PATH, "r", encoding="utf-8") as fh:
        lines = fh.readlines()

    review_cols = [
        "source prefix",
        "source identifier",
        "source name",
        "relation",
        "target prefix",
        "target identifier",
        "target name",
        "type",
        "source",
    ]
    # The first row of the review table is the header
    review_rows = [review_cols]
    # Add all the xrefs to the OWL, simultaneously add xrefs to a review TSV
    for do_id, mesh_id, mapping in doid_mappings:
        lines = add_xref(lines, do_id, "MESH:" + mesh_id)
        review_rows.append([mapping[c] for c in review_cols])

    # Dump the new review TSV and OWL file. The csv module asks for files to
    # be opened with newline="" so that it controls the line endings itself.
    with open(REVIEW_PATH, "w", encoding="utf-8", newline="") as fh:
        writer = csv.writer(fh, delimiter="\t")
        writer.writerows(review_rows)

    with open(EDITABLE_OWL_PATH, "w", encoding="utf-8") as fh:
        fh.writelines(lines)
94 changes: 94 additions & 0 deletions scripts/generate_doid_mesh_mappings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
"""Generate mappings using Gilda from DOID to MeSH."""
from collections import Counter

import gilda
import obonet
from indra.databases import mesh_client
from indra.ontology.standardize import standardize_db_refs
from indra.tools.fix_invalidities import fix_invalidities_db_refs

from biomappings import load_false_mappings, load_mappings, load_unsure
from biomappings.resources import PredictionTuple, append_prediction_tuples

# Load the DOID ontology from the main branch of its GitHub repository
g = obonet.read_obo(
    "https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology"
    "/main/src/ontology/HumanDO.obo"
)

# Collect the DOID identifiers that already have a curated DOID-MESH mapping
# (in either direction) among the positive, unsure and false mappings
all_curations = [*load_mappings(), *load_unsure(), *load_false_mappings()]
curated_mappings = set()
for m in all_curations:
    source_is_doid = m["source prefix"] == "doid"
    if source_is_doid and m["target prefix"] == "mesh":
        curated_mappings.add(m["source identifier"])
        continue
    if m["target prefix"] == "doid" and m["source prefix"] == "mesh":
        curated_mappings.add(m["target identifier"])


# We now iterate over all DOID entries and check for possible mappings.
# mappings: DOID node -> MESH ID predicted from the node's name
# existing_refs_to_mesh: MESH IDs that some DOID node already cross-references
# already_mappable: DOID nodes that already have a MESH cross-reference
mappings = {}
existing_refs_to_mesh = set()
already_mappable = set()
for node, data in g.nodes(data=True):
    # Skip external entries
    if not node.startswith("DOID"):
        continue
    # Make sure we have a name
    if "name" not in data:
        continue
    # Skip if already curated
    if node in curated_mappings:
        continue
    # Get existing xrefs as a standardized dict; an xref without a colon has
    # no namespace and would break the dict construction, so it is skipped
    xrefs = [
        xref.split(":", maxsplit=1)
        for xref in data.get("xref", [])
        if ":" in xref
    ]
    xrefs_dict = fix_invalidities_db_refs(dict(xrefs))
    standard_refs = standardize_db_refs(xrefs_dict)
    # If there are already MESH mappings, we keep track of that
    if "MESH" in standard_refs:
        already_mappable.add(node)
        existing_refs_to_mesh.add(standard_refs["MESH"])
    # We can now ground the name and specifically look for MESH matches
    matches = gilda.ground(data["name"], namespaces=["MESH"])
    # If we got a match, we add the MESH ID as a mapping. The ID is taken from
    # the grounding whose namespace is MESH rather than from the term's
    # primary ID, which may belong to a different namespace.
    if matches:
        for db, identifier in matches[0].get_groundings():
            if db == "MESH":
                mappings[node] = identifier


print("Found %d DOID->MESH mappings." % len(mappings))

# We make sure that (i) the node is not already mappable to MESH and that
# (ii) the predicted MESH ID is not already cross-referenced by some
# DOID node
mappings = {
    doid: mesh
    for doid, mesh in mappings.items()
    if not (mesh in existing_refs_to_mesh or doid in already_mappable)
}

# A MESH ID predicted for more than one DOID node is ambiguous, so only
# MESH IDs that occur exactly once among the predictions are kept
cnt = Counter(mappings.values())
mappings = {doid: mesh for doid, mesh in mappings.items() if cnt[mesh] == 1}

print("Found %d filtered DOID->MESH mappings." % len(mappings))

# Build one lexical prediction per remaining mapping, with MESH as the source
# and DOID as the target, and append them to the predictions resource.
# doid[5:] drops the first five characters of the node ID (presumably the
# "DOID:" prefix - confirm against the identifier format in use).
predictions = [
    PredictionTuple(
        target_prefix="doid",
        target_identifier=doid[5:],
        target_name=g.nodes[doid]["name"],
        relation="skos:exactMatch",
        source_prefix="mesh",
        source_id=mesh_id,
        source_name=mesh_client.get_mesh_name(mesh_id),
        type="lexical",
        confidence=0.9,
        source="generate_doid_mesh_mappings.py",
    )
    for doid, mesh_id in mappings.items()
]

append_prediction_tuples(predictions, deduplicate=True, sort=True)
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ install_requires =
pyyaml
tqdm
pystow>=0.2.7
bioregistry>=0.4.29
bioregistry>=0.5.104

zip_safe = false
include_package_data = True
Expand Down
12 changes: 10 additions & 2 deletions src/biomappings/resources/incorrect.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -132,9 +132,12 @@ doid DOID:0050575 D-2-hydroxyglutaric aciduria skos:exactMatch mesh C535306 2-Hy
doid DOID:0050575 D-2-hydroxyglutaric aciduria skos:exactMatch umls C2746066 Combined D-2- and L-2-hydroxyglutaric aciduria manually_reviewed orcid:0000-0001-9439-5346
doid DOID:0050577 cranioectodermal dysplasia skos:exactMatch umls C0432235 CRANIOECTODERMAL DYSPLASIA 1 manually_reviewed orcid:0000-0001-9439-5346
doid DOID:0050580 hereditary lymphedema skos:exactMatch mesh D008209 Lymphedema manually_reviewed orcid:0000-0001-9439-5346
doid DOID:0050650 familial atrial fibrillation skos:exactMatch mesh D001281 Atrial Fibrillation manually_reviewed orcid:0000-0001-9439-5346
doid DOID:0050665 fetal alcohol syndrome skos:exactMatch mesh D063647 Fetal Alcohol Spectrum Disorders manually_reviewed orcid:0000-0001-9439-5346
doid DOID:0050666 partial fetal alcohol syndrome skos:exactMatch mesh D063647 Fetal Alcohol Spectrum Disorders manually_reviewed orcid:0000-0001-9439-5346
doid DOID:0050667 alcohol-related neurodevelopmental disorder skos:exactMatch mesh D063647 Fetal Alcohol Spectrum Disorders manually_reviewed orcid:0000-0001-9439-5346
doid DOID:0050672 dyskinetic cerebral palsy skos:exactMatch mesh D002547 Cerebral Palsy manually_reviewed orcid:0000-0001-9439-5346
doid DOID:0050673 mixed cerebral palsy skos:exactMatch mesh D002547 Cerebral Palsy manually_reviewed orcid:0000-0001-9439-5346
doid DOID:0050685 small cell carcinoma skos:exactMatch umls C0149925 Small cell carcinoma of lung manually_reviewed orcid:0000-0003-4423-4370
doid DOID:0050882 spinocerebellar ataxia type 5 skos:exactMatch mesh D020754 Spinocerebellar Ataxias manually_reviewed orcid:0000-0001-9439-5346
doid DOID:0050954 spinocerebellar ataxia type 1 skos:exactMatch mesh D020754 Spinocerebellar Ataxias manually_reviewed orcid:0000-0001-9439-5346
Expand All @@ -148,11 +151,15 @@ doid DOID:0060141 finger agnosia skos:exactMatch mesh D000377 Agnosia manually_r
doid DOID:0060151 tactile agnosia skos:exactMatch mesh D000377 Agnosia manually_reviewed orcid:0000-0001-9439-5346
doid DOID:0060152 time agnosia skos:exactMatch mesh D000377 Agnosia manually_reviewed orcid:0000-0001-9439-5346
doid DOID:0060153 topographical agnosia skos:exactMatch mesh D000377 Agnosia manually_reviewed orcid:0000-0001-9439-5346
doid DOID:0070197 distal myopathy 1 skos:exactMatch mesh D049310 Distal Myopathies manually_reviewed orcid:0000-0001-9439-5346
doid DOID:0070340 classic citrullinemia skos:exactMatch mesh D020159 Citrullinemia manually_reviewed orcid:0000-0001-9439-5346
doid DOID:0080046 Stickler syndrome skos:exactMatch mesh C537492 Stickler syndrome, type 1 manually_reviewed orcid:0000-0001-9439-5346
doid DOID:0080333 aortic valve disease 1 skos:exactMatch mesh D000082882 Bicuspid Aortic Valve Disease manually_reviewed orcid:0000-0001-9439-5346
doid DOID:0080546 non-alcoholic fatty liver skos:exactMatch mesh D065626 Non-alcoholic Fatty Liver Disease manually_reviewed orcid:0000-0001-9439-5346
doid DOID:0080547 non-alcoholic steatohepatitis skos:exactMatch mesh D065626 Non-alcoholic Fatty Liver Disease manually_reviewed orcid:0000-0001-9439-5346
doid DOID:0080748 chronic inducible urticaria skos:exactMatch mesh D000080223 Chronic Urticaria manually_reviewed orcid:0000-0001-9439-5346
doid DOID:0080749 chronic spontaneous urticaria skos:exactMatch mesh D000080223 Chronic Urticaria manually_reviewed orcid:0000-0001-9439-5346
doid DOID:0080939 hereditary angioedema type I skos:exactMatch mesh D056829 Hereditary Angioedema Types I and II manually_reviewed orcid:0000-0001-9439-5346
doid DOID:0111135 congenital generalized lipodystrophy type 1 skos:exactMatch mesh D052497 Lipodystrophy, Congenital Generalized manually_reviewed orcid:0000-0001-9439-5346
doid DOID:0111136 congenital generalized lipodystrophy type 2 skos:exactMatch mesh D052497 Lipodystrophy, Congenital Generalized manually_reviewed orcid:0000-0001-9439-5346
doid DOID:1024 leprosy skos:exactMatch umls C0051981 leprosy vaccine manually_reviewed orcid:0000-0003-4423-4370
Expand All @@ -170,6 +177,7 @@ doid DOID:1498 cholera skos:exactMatch umls C0008359 cholera vaccine manually_re
doid DOID:1586 rheumatic fever skos:exactMatch umls C1548484 rheumatic fever vaccine manually_reviewed orcid:0000-0003-4423-4370
doid DOID:1909 melanoma skos:exactMatch umls C0796561 Melanoma vaccine manually_reviewed orcid:0000-0003-4423-4370
doid DOID:2043 hepatitis B skos:exactMatch umls C0062527 hepatitis B surface antigen vaccine manually_reviewed orcid:0000-0003-4423-4370
doid DOID:3479 uveal cancer skos:exactMatch mesh D014604 Uveal Neoplasms manually_reviewed orcid:0000-0001-9439-5346
doid DOID:3482 plague skos:exactMatch umls C0032066 Plague Vaccine manually_reviewed orcid:0000-0003-4423-4370
doid DOID:3910 lung adenocarcinoma skos:exactMatch efo 0005288 non-small cell lung adenocarcinoma manually_reviewed orcid:0000-0003-4423-4370
doid DOID:4552 large cell carcinoma skos:exactMatch efo 0003050 large cell lung carcinoma manually_reviewed orcid:0000-0003-4423-4370
Expand Down Expand Up @@ -358,6 +366,7 @@ mesh C535442 Bile acid synthesis defect, congenital, 1 skos:exactMatch doid DOID
mesh C535536 Iridogoniodysgenesis, dominant type skos:exactMatch doid DOID:0050786 iridogoniodysgenesis syndrome manually_reviewed orcid:0000-0003-1307-2508
mesh C535837 Pancreatic carcinoma, familial skos:exactMatch doid DOID:4905 pancreatic carcinoma manually_reviewed orcid:0000-0003-1307-2508
mesh C535847 Hay-Wells syndrome skos:exactMatch doid DOID:0090119 ankyloblepharon-ectodermal defects-cleft lip/palate syndrome manually_reviewed orcid:0000-0003-1307-2508
mesh C536133 Meckel syndrome type 1 skos:exactMatch doid DOID:0050778 Meckel syndrome manually_reviewed orcid:0000-0001-9439-5346
mesh C536192 Brittle cornea syndrome 1 skos:exactMatch doid DOID:14775 brittle cornea syndrome manually_reviewed orcid:0000-0003-1307-2508
mesh C536277 Idiopathic dilation cardiomyopathy skos:exactMatch doid DOID:0110429 dilated cardiomyopathy 1H manually_reviewed orcid:0000-0003-1307-2508
mesh C536300 Partington X-linked mental retardation syndrome skos:exactMatch doid DOID:14744 Partington syndrome manually_reviewed orcid:0000-0003-1307-2508
Expand All @@ -379,7 +388,6 @@ mesh C567200 Immunodeficiency, Hypogammaglobulinemia, and Reduced B Cells skos:e
mesh C567484 Mental Retardation, X-Linked, Syndromic, Christianson Type skos:exactMatch doid DOID:0060825 Christianson syndrome manually_reviewed orcid:0000-0003-1307-2508
mesh C567738 Neuropathy, Hereditary Sensory And Autonomic, Type IIB skos:exactMatch doid DOID:0070150 hereditary sensory and autonomic neuropathy type 2B manually_reviewed orcid:0000-0003-1307-2508
mesh C579934 Autosomal Recessive Cerebellar Ataxia Type 1 skos:exactMatch doid DOID:0111618 autosomal recessive spinocerebellar ataxia 8 manually_reviewed orcid:0000-0003-1307-2508
mesh C580012 Congenital Fibrosis of the Extraocular Muscles skos:exactMatch doid DOID:0080143 congenital fibrosis of the extraocular muscles manually_reviewed orcid:0000-0003-1307-2508
mesh C580473 Succinate-Coa Ligase Deficiency skos:exactMatch doid DOID:0080124 mitochondrial DNA depletion syndrome 5 manually_reviewed orcid:0000-0003-1307-2508
mesh C584135 endotrophin skos:exactMatch ncit C165979 Pro-C6 Measurement manually_reviewed orcid:0000-0001-9439-5346
mesh D000071636 Protein Phosphatase 2C skos:exactMatch hgnc 9279 PDP1 manually_reviewed orcid:0000-0001-9439-5346
Expand Down Expand Up @@ -413,6 +421,7 @@ mesh D003924 Diabetes Mellitus, Type 2 skos:exactMatch doid DOID:0050524 maturit
mesh D003925 Diabetic Angiopathies skos:exactMatch doid DOID:10182 diabetic peripheral angiopathy manually_reviewed orcid:0000-0003-1307-2508
mesh D004407 Dysgerminoma skos:exactMatch doid DOID:5511 dysgerminoma of ovary manually_reviewed orcid:0000-0003-1307-2508
mesh D004410 Dyslexia skos:exactMatch doid DOID:13365 reading disorder manually_reviewed orcid:0000-0003-1307-2508
mesh D004697 Endocarditis, Bacterial skos:exactMatch doid DOID:0060000 infective endocarditis manually_reviewed orcid:0000-0001-9439-5346
mesh D004932 Esophageal and Gastric Varices skos:exactMatch doid DOID:112 esophageal varix manually_reviewed orcid:0000-0003-1307-2508
mesh D005319 Fetal Hemoglobin skos:exactMatch efo 0004576 fetal hemoglobin measurement manually_reviewed orcid:0000-0003-4423-4370
mesh D005351 Fibromatosis, Gingival skos:exactMatch doid DOID:0060466 gingival fibromatosis manually_reviewed orcid:0000-0003-1307-2508
Expand All @@ -424,7 +433,6 @@ mesh D006441 Hemoglobin A skos:exactMatch efo 0009208 Hemoglobin A Measurement m
mesh D006443 Hemoglobin A2 skos:exactMatch efo 0005845 hemoglobin A2 measurement manually_reviewed orcid:0000-0003-4423-4370
mesh D006451 Hemoglobin, Sickle skos:exactMatch efo 0009223 Hemoglobin S Measurement manually_reviewed orcid:0000-0003-4423-4370
mesh D006480 Hemorrhagic Fever with Renal Syndrome skos:exactMatch doid DOID:0050200 Korean hemorrhagic fever manually_reviewed orcid:0000-0003-1307-2508
mesh D006519 Hepatitis, Alcoholic skos:exactMatch doid DOID:12351 alcoholic hepatitis manually_reviewed orcid:0000-0003-1307-2508
mesh D006593 Hexokinase skos:exactMatch go GO:0004396 hexokinase activity manually_reviewed orcid:0000-0001-9439-5346
mesh D006646 Histiocytosis, Langerhans-Cell skos:exactMatch doid DOID:2571 Langerhans-cell histiocytosis manually_reviewed orcid:0000-0003-1307-2508
mesh D007014 Hypophosphatasia skos:exactMatch doid DOID:0110915 childhood hypophosphatasia manually_reviewed orcid:0000-0003-1307-2508
Expand Down
Loading