Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Import DrugBank-DrugCentral mappings #112

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 77 additions & 0 deletions scripts/import_drugcentral_mappings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
"""This script imports Harry Caufield's mappings from DrugCentral to various resources."""

import bioregistry
import click
import pandas as pd
import pyobo
from tqdm import tqdm

from biomappings.resources import append_true_mappings

# Location of the tab-separated SSSOM file (version 0.2, per its file name) that
# ``main`` downloads and parses with pandas.
URL = "http://kg-hub-public-data.s3.amazonaws.com/frozen_incoming_data/drug-id-maps-0.2.sssom.tsv"


@click.command()
def main():
    """Import DrugCentral mappings from Harry Caufield into Biomappings."""
    # NOTE: click shows the docstring above as the command's help text.

    # The first 11 lines of the download are skipped so that the next line is
    # read as the column header.
    df = pd.read_csv(URL, sep="\t", skiprows=11)

    # Drop the columns that are not carried over, then switch the remaining
    # SSSOM column names to the ones used for the appended mappings.
    for unused_column in ("author_id", "comment", "mapping_justification"):
        del df[unused_column]
    renamed_columns = {
        "predicate_id": "relation",
        "subject_label": "source name",
        "object_label": "target name",
        "reviewer_id": "source",
    }
    df = df.rename(columns=renamed_columns)
    df["type"] = "manually_reviewed"

    # DrugCentral identifiers found in here are treated as already mapped to
    # DrugBank and are therefore skipped below.
    drugcentral_drugbank_mappings = pyobo.get_filtered_xrefs("drugcentral", "drugbank")

    novel_rows = []
    progress = tqdm(
        df.iterrows(),
        unit="mapping",
        unit_scale=True,
        total=len(df.index),
        desc="Pre-Filtering",
    )
    for _, record in progress:
        # Only DrugBank -> DrugCentral rows are of interest.
        if not record["subject_id"].startswith("DRUGBANK:"):
            continue
        drugcentral_curie = record["object_id"]
        if not drugcentral_curie.startswith("DrugCentral:"):
            continue
        if drugcentral_curie.removeprefix("DrugCentral:") in drugcentral_drugbank_mappings:
            continue
        novel_rows.append(record)

    # Both steps below stay lazy: rows are converted one at a time while
    # ``append_true_mappings`` consumes the iterator.
    prepared = map(
        _prepare_mapping,
        tqdm(novel_rows, unit="mapping", unit_scale=True, desc="Mapping"),
    )
    # Filter mappings with missing values (``_prepare_mapping`` returns None for those)
    append_true_mappings(filter(None, prepared))


def _prepare_mapping(row):
    """Convert one row of the SSSOM data frame into a mapping dictionary.

    :param row: A row object offering ``to_dict()`` (e.g. a :class:`pandas.Series`)
        that holds ``subject_id`` and ``object_id`` CURIEs as well as
        ``source name`` and ``target name`` entries.
    :returns: The row as a dictionary in which the two CURIEs are replaced by the
        ``source prefix``/``source identifier`` and ``target prefix``/``target identifier``
        entries, or ``None`` if any value is still empty or missing after the
        names have been looked up.
    """
    row = row.to_dict()
    subject_prefix, subject_id = bioregistry.parse_curie(row.pop("subject_id"))
    target_prefix, target_id = bioregistry.parse_curie(row.pop("object_id"))

    row["source prefix"] = subject_prefix
    # Fall back to the parsed identifier when standardization returns nothing.
    row["source identifier"] = (
        bioregistry.miriam_standardize_identifier(subject_prefix, subject_id) or subject_id
    )
    row["target prefix"] = target_prefix
    # The fallback must be the target's own identifier; the original fell back to
    # ``subject_id`` here, which would have stored the source identifier as the target.
    row["target identifier"] = (
        bioregistry.miriam_standardize_identifier(target_prefix, target_id) or target_id
    )

    # Fill in labels that are missing from the input file.
    if pd.isna(row["source name"]):
        row["source name"] = pyobo.get_name(subject_prefix, subject_id)
    if pd.isna(row["target name"]):
        row["target name"] = pyobo.get_name(target_prefix, target_id)

    # Every field has to be filled; report and skip incomplete rows. ``tqdm.write``
    # is used so the message does not break the progress bar.
    if not all(value and pd.notna(value) for value in row.values()):
        tqdm.write(str(row))
        return None
    return row


# Run the click command only when this file is executed as a script.
if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions src/biomappings/resources/curators.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ hoytfamily 0000-0003-1307-2508 Amelia Hoyt
ddomingof 0000-0002-2046-6145 Daniel Domingo-Fernandez
cmungall 0000-0002-6601-2165 Christopher Mungall
KrishnaTO 0000-0002-2627-0696 Krishna Udaiwal
caufieldjh 0000-0001-5705-7831 Harry Caufield
Loading