Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Re-implement reconciliation code with curies #426

Merged
merged 14 commits into from
Sep 26, 2023
3 changes: 3 additions & 0 deletions .github/workflows/qc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ jobs:
with:
python-version: ${{ matrix.python-version }}

- name: Install poetry dependencies
run: python -m pip install -U pip setuptools
hrshdhgd marked this conversation as resolved.
Show resolved Hide resolved

#----------------------------------------------
# install & configure poetry
#----------------------------------------------
Expand Down
355 changes: 180 additions & 175 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ readme = "README.md"
[tool.poetry.dependencies]
python = "^3.8"
click = ">=8.1.6"
curies = ">=0.6.2"
curies = ">=0.6.4"
linkml-runtime = ">=1.5.5"
importlib-metadata = ">=6.8.0"
pandas = ">=2.0.3"
Expand Down
79 changes: 24 additions & 55 deletions src/sssom/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -1104,65 +1104,34 @@ def reconcile_prefix_and_data(
:param msdf: Mapping Set DataFrame.
:param prefix_reconciliation: Prefix reconcilation dictionary from a YAML file
:return: Mapping Set DataFrame with reconciled prefix_map and data.

This method is build on :func:`curies.remap_curie_prefixes` and
:func:`curies.rewire`. Note that if you want to overwrite a CURIE prefix in the Bioregistry
extended prefix map, you need to provide a place for the old one to go as in
``{"geo": "ncbi.geo", "geogeo": "geo"}``.
Just doing ``{"geogeo": "geo"}`` would not work since `geo` already exists.
"""
# Discussion about this found here:
# https://github.com/mapping-commons/sssom-py/issues/216#issue-1171701052
converter = msdf.converter
converter = curies.remap_curie_prefixes(converter, prefix_reconciliation["prefix_synonyms"])
converter = curies.rewire(converter, prefix_reconciliation["prefix_expansion_reconciliation"])
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use constants instead of strings to satisfy SRP?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know what SRP means, but let's save updating this data structure for a subsequent PR

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Single responsibility principle: classes/methods/functions should only have one reason to change. If this string changes for whatever reason, many different classes and places in the code will have to change as well.


# TODO make this standardization code directly part of msdf after
# switching to native converter
def _upgrade(curie_or_iri: str) -> str:
if not is_iri(curie_or_iri) and is_curie(curie_or_iri):
return converter.standardize_curie(curie_or_iri) or curie_or_iri
return curie_or_iri

for column, values in _get_sssom_schema_object().dict["slots"].items():
if values["range"] != "EntityReference":
continue
if column not in msdf.df.columns:
continue
msdf.df[column] = msdf.df[column].map(_upgrade)

prefix_map = msdf.prefix_map
df: pd.DataFrame = msdf.df
data_switch_dict = dict()

prefix_synonyms = prefix_reconciliation["prefix_synonyms"]
prefix_expansion = prefix_reconciliation["prefix_expansion_reconciliation"]

# The prefix exists but the expansion needs to be updated.
expansion_replace = {
k: v for k, v in prefix_expansion.items() if k in prefix_map.keys() and v != prefix_map[k]
}

# Updates expansions in prefix_map
prefix_map.update(expansion_replace)

# Prefixes that need to be replaced
# IF condition:
# 1. Key OR Value in prefix_synonyms are keys in prefix_map
# e.g.: ICD10: ICD10CM - either should be present within
# the prefix_map.
# AND
# 2. Value in prefix_synonyms is NOT a value in expansion_replace.
# In other words, the existing expansion do not match the YAML.

prefix_replace = [
k
for k, v in prefix_synonyms.items()
if (k in prefix_map.keys() or v in prefix_map.keys()) and v not in expansion_replace.keys()
]

if len(prefix_replace) > 0:
for pr in prefix_replace:
correct_prefix = prefix_synonyms[pr]
correct_expansion = prefix_expansion[correct_prefix]
prefix_map[correct_prefix] = correct_expansion
logging.info(f"Adding prefix_map {correct_prefix}: {correct_expansion}")
if pr in prefix_map.keys():
prefix_map.pop(pr, None)
data_switch_dict[pr] = correct_prefix

logging.warning(f"Replacing prefix {pr} with {correct_prefix}")

# Data editing
if len(data_switch_dict) > 0:
# Read schema file
slots = _get_sssom_schema_object().dict["slots"]
entity_reference_columns = [k for k, v in slots.items() if v["range"] == "EntityReference"]
update_columns = [c for c in df.columns if c in entity_reference_columns]
for k, v in data_switch_dict.items():
df[update_columns] = df[update_columns].replace(k + ":", v + ":", regex=True)

msdf.df = df
msdf.prefix_map = prefix_map

# TODO: When expansion of 2 prefixes in the prefix_map are the same.
msdf.prefix_map = dict(converter.bimap)
return msdf


Expand Down
2 changes: 0 additions & 2 deletions tests/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,3 @@

test_out_dir = cwd / "tmp"
test_out_dir.mkdir(parents=True, exist_ok=True)

prefix_recon_yaml = data_dir / "prefix_reconciliation.yaml"
59 changes: 48 additions & 11 deletions tests/test_collapse.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@

import unittest

import yaml

from sssom.parsers import parse_sssom_table
from sssom.util import (
collapse,
Expand All @@ -14,7 +12,7 @@
parse,
reconcile_prefix_and_data,
)
from tests.constants import data_dir, prefix_recon_yaml
from tests.constants import data_dir


class TestCollapse(unittest.TestCase):
Expand Down Expand Up @@ -76,13 +74,52 @@ def test_reconcile_prefix(self):
"""Test curie reconciliation is performing as expected."""
msdf = parse_sssom_table(data_dir / "basic3.tsv")

with open(prefix_recon_yaml) as pref_rec:
prefix_reconciliation = yaml.safe_load(pref_rec)
self.assertEqual(
{
"a": "http://example.org/a/",
"b": "http://example.org/b/",
"c": "http://example.org/c/",
"d": "http://example.org/d/",
"rdfs": "http://www.w3.org/2000/01/rdf-schema#",
"owl": "http://www.w3.org/2002/07/owl#",
"orcid": "https://orcid.org/",
"semapv": "https://w3id.org/semapv/vocab/",
"rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
"skos": "http://www.w3.org/2004/02/skos/core#",
"sssom": "https://w3id.org/sssom/",
},
msdf.prefix_map,
)
prefix_reconciliation = {
"prefix_synonyms": {
"a": "c",
"c": "c2",
"b": "bravo",
"r": "rdfs", # does not do anything, since "r" is not already in the prefix map
"o": "owl", # does not do anything, since "o" is not already in the prefix map
},
"prefix_expansion_reconciliation": {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rename expansion_reconciliation to rewiring to maintain consistent terminology?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about this?

"c": "http://test.owl/c/",
"bravo": "http://test.owl/bravo",
"rdfs": "http://www.w3.org/2000/01/rdf-schema#", # matches what's already there
"owl": "http://www.w3.org/2002/07/owl#", # matches what's already there
},
}

recon_msdf = reconcile_prefix_and_data(msdf, prefix_reconciliation)
matentzn marked this conversation as resolved.
Show resolved Hide resolved

prefix_expansion = prefix_reconciliation["prefix_expansion_reconciliation"]

for pfx, exp in prefix_expansion.items():
if pfx in recon_msdf.prefix_map.keys():
self.assertEqual(recon_msdf.prefix_map[pfx], exp)
self.assertEqual(
{
"bravo": "http://test.owl/bravo",
"c": "http://test.owl/c/",
"c2": "http://example.org/c/",
"d": "http://example.org/d/",
"rdfs": "http://www.w3.org/2000/01/rdf-schema#",
"owl": "http://www.w3.org/2002/07/owl#",
"orcid": "https://orcid.org/",
"semapv": "https://w3id.org/semapv/vocab/",
"skos": "http://www.w3.org/2004/02/skos/core#",
"rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
"sssom": "https://w3id.org/sssom/",
},
recon_msdf.prefix_map,
)