Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
9ee0af2
Use `curies` in `sparql_util.py`
cthoyt Jul 22, 2023
6bdf3b6
Update sparql_util.py
cthoyt Jul 22, 2023
8ebea27
Additional cleanup of sparql endpoint
cthoyt Jul 22, 2023
e3383a4
Additional updates to RDF
cthoyt Jul 22, 2023
e9f4747
Fix bug where endpoint config is built up over time
cthoyt Jul 22, 2023
da7ad44
Deprecate old compression function
cthoyt Jul 22, 2023
483d10a
Update pyproject.toml
cthoyt Jul 22, 2023
7e19639
Update util.py
cthoyt Jul 24, 2023
ec3452f
Update util.py
cthoyt Jul 24, 2023
118017d
Add typing.deprecated to curie_from_uri
cthoyt Jul 24, 2023
1849d60
Update lock
cthoyt Jul 24, 2023
032be37
Add typing extensions
cthoyt Jul 24, 2023
8868d08
Update util.py
cthoyt Jul 24, 2023
29c1802
testing latest version
hrshdhgd Jul 24, 2023
fcc6a78
anchor to 1.4.2 like other projects
hrshdhgd Jul 24, 2023
2c75910
using snok poetry from marketplace
hrshdhgd Jul 24, 2023
faeca94
remove pip update
hrshdhgd Jul 24, 2023
ebcab13
lock file updated
hrshdhgd Jul 24, 2023
1cc5ea9
added --no-interaction
hrshdhgd Jul 24, 2023
13d9cad
virtualenv causing the errors
hrshdhgd Jul 24, 2023
ba21248
anchor versions
hrshdhgd Jul 24, 2023
651d278
poetry == 1.4.2
hrshdhgd Jul 24, 2023
6713ad0
remove poetry.lock from source control
hrshdhgd Jul 24, 2023
f4f9106
remove poetry.lock from source control
hrshdhgd Jul 24, 2023
d5624dc
remove version anchor for poetry
hrshdhgd Jul 24, 2023
752e320
reanchor poetry to 1.3.2 as before
hrshdhgd Jul 24, 2023
0e028c2
Clean up
cthoyt Jul 25, 2023
172463a
Fix delete
cthoyt Jul 25, 2023
2610322
Update context.py
cthoyt Jul 25, 2023
6abda6f
Update parsers.py
cthoyt Jul 25, 2023
fe21fc7
Update util.py
cthoyt Jul 25, 2023
25d74ae
Clean DC and update tests
cthoyt Jul 25, 2023
1c25c94
Update sparql_util.py
cthoyt Jul 25, 2023
afe95a8
Update context.py
cthoyt Jul 25, 2023
439f561
Add implicit prefix map validity checker
cthoyt Jul 25, 2023
f0cd478
Add text explanation.
cthoyt Jul 25, 2023
a9b0f27
Update test_parsers.py
cthoyt Jul 25, 2023
e24fec2
Remove DC cleanup
cthoyt Jul 25, 2023
e5afffb
removing poetry.lock from gitignore and committing the lock file from …
hrshdhgd Jul 25, 2023
3ddf061
Update .gitignore
cthoyt Jul 25, 2023
db44005
Merge branch 'master' into improve-sparql-util
cthoyt Jul 25, 2023
36180e1
Remove test_broken_obographs test and reinstate the correct one
matentzn Jul 27, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ authors = [
"Chris Mungall <[email protected]>",
"Nicolas Matentzoglu <[email protected]>",
"Harshad Hegde <[email protected]>"
]
]
license = "MIT"
readme = "README.md"

Expand All @@ -19,6 +19,7 @@ bioregistry = ">=0.9.43"
deprecation = ">=2.1.0"
linkml-runtime = ">=1.5.3"
networkx = ">=3.1"
curies = ">=0.5.7"
pandas = ">=2.0.2"
pansql = "^0.0.1"
pyyaml = ">=6.0"
Expand Down
7 changes: 2 additions & 5 deletions src/sssom/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,11 +356,8 @@ def sparql(
endpoint.limit = limit
if object_labels is not None:
endpoint.include_object_labels = object_labels
if prefix is not None:
if endpoint.prefix_map is None:
endpoint.prefix_map = {}
for k, v in prefix:
endpoint.prefix_map[k] = v
for k, v in prefix or []:
endpoint.prefix_map[k] = v
msdf = query_mappings(endpoint)
write_table(msdf, output)

Expand Down
21 changes: 8 additions & 13 deletions src/sssom/rdf_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,9 @@
import logging
from typing import Any, Dict, List, Optional

from curies import Converter
from linkml_runtime.utils.metamodelcore import URIorCURIE
from rdflib import Graph, URIRef

# from .sssom_datamodel import EntityReference, Mapping
from sssom_schema import EntityReference, Mapping

from .parsers import to_mapping_set_document
Expand All @@ -24,17 +23,12 @@ def rewire_graph(
precedence: Optional[List[str]] = None,
) -> int:
"""Rewire an RDF Graph replacing using equivalence mappings."""
pm = mset.prefix_map
mdoc = to_mapping_set_document(mset)
rewire_map: Dict[URIorCURIE, URIorCURIE] = {}

def expand_curie(curie: str) -> URIRef:
"""Expand CURIE into URIRef."""
pfx, local = curie.split(":")
return URIRef(f"{pm[pfx]}{local}")

if mdoc.mapping_set.mappings is None:
raise TypeError

converter = Converter.from_prefix_map(mdoc.prefix_map)
rewire_map: Dict[URIorCURIE, URIorCURIE] = {}
for m in mdoc.mapping_set.mappings:
if not isinstance(m, Mapping):
continue
Expand All @@ -49,8 +43,8 @@ def expand_curie(curie: str) -> URIRef:
curr_tgt = rewire_map[src]
logging.info(f"Ambiguous: {src} -> {tgt} vs {curr_tgt}")
if precedence:
curr_pfx, _ = curr_tgt.split(":")
tgt_pfx, _ = tgt.split(":")
curr_pfx, _ = converter.parse_curie(curr_tgt)
tgt_pfx, _ = converter.parse_curie(tgt)
if tgt_pfx in precedence:
if curr_pfx not in precedence or precedence.index(
tgt_pfx
Expand All @@ -63,7 +57,8 @@ def expand_curie(curie: str) -> URIRef:
rewire_map[src] = tgt

uri_ref_rewire_map: Dict[URIRef, URIRef] = {
expand_curie(k): expand_curie(v) for k, v in rewire_map.items()
URIRef(converter.expand_strict(k)): URIRef(converter.expand_strict(v))
for k, v in rewire_map.items()
}

def rewire_node(n: Any):
Expand Down
84 changes: 25 additions & 59 deletions src/sssom/sparql_util.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
"""Utilities for querying mappings with SPARQL."""

import logging
from dataclasses import dataclass
from typing import Dict, List, Mapping, Optional
from dataclasses import dataclass, field
from textwrap import dedent
from typing import Dict, List, Optional

import pandas as pd
from curies import Converter
from rdflib import URIRef
from rdflib.namespace import RDFS, SKOS
from SPARQLWrapper import JSON, SPARQLWrapper

from .util import MappingSetDataFrame
from .util import MappingSetDataFrame, safe_compress

__all__ = [
"EndpointConfig",
Expand All @@ -26,13 +28,16 @@ class EndpointConfig:
predmap: Dict[str, str]
predicates: Optional[List[str]]
limit: Optional[int]
prefix_map: Optional[Dict[str, str]]
include_object_labels: bool = False
prefix_map: Dict[str, str] = field(default_factory=dict)


def query_mappings(config: EndpointConfig) -> MappingSetDataFrame:
"""Query a SPARQL endpoint to obtain a set of mappings."""
sparql = SPARQLWrapper(config.url)
if not config.prefix_map:
raise TypeError
converter = Converter.from_prefix_map(config.prefix_map)

if config.graph is None:
g = "?g"
elif isinstance(config.graph, str):
Expand All @@ -42,7 +47,7 @@ def query_mappings(config: EndpointConfig) -> MappingSetDataFrame:
if config.predicates is None:
predicates = [SKOS.exactMatch, SKOS.closeMatch]
else:
predicates = [expand_curie(predicate, config) for predicate in config.predicates]
predicates = [URIRef(converter.expand_strict(predicate)) for predicate in config.predicates]
predstr = " ".join(URIRef(predicate).n3() for predicate in predicates)
if config.limit is not None:
limitstr = f"LIMIT {config.limit}"
Expand All @@ -59,7 +64,8 @@ def query_mappings(config: EndpointConfig) -> MappingSetDataFrame:
cols.insert(-1, "object_label")
colstr = " ".join([f"?{c}" for c in cols])
olq = "OPTIONAL { ?object_id rdfs:label ?object_label }" if config.include_object_labels else ""
q = f"""\
sparql = dedent(
f"""\
PREFIX rdfs: {RDFS.uri.n3()}
SELECT {colstr}
WHERE {{
Expand All @@ -72,56 +78,16 @@ def query_mappings(config: EndpointConfig) -> MappingSetDataFrame:
BIND({g} as ?mapping_provider)
}} {limitstr}
"""
logging.info(q)
sparql.setQuery(q)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
rows = []
for result in results["results"]["bindings"]:
row = {k: v["value"] for k, v in result.items()}
rows.append(curiefy_values(row, config))
df = pd.DataFrame(rows)
if config.prefix_map is None:
raise TypeError
)
logging.info(sparql)

sparql_wrapper = SPARQLWrapper(config.url, returnFormat=JSON)
sparql_wrapper.setQuery(sparql)
results = sparql_wrapper.query().convert()
df = pd.DataFrame(
[
{key: safe_compress(v["value"], converter) for key, v in result.items()}
for result in results["results"]["bindings"]
]
)
return MappingSetDataFrame(df=df, prefix_map=config.prefix_map)


def curiefy_values(row: Mapping[str, str], config: EndpointConfig) -> Dict[str, str]:
    """Run every value of a row through :func:`contract_uri`, keeping the keys as they are.

    :param row: A dictionary of string keys to URIs
    :param config: Configuration forwarded to :func:`contract_uri` for each value
    :return: A new dictionary with the same keys and the contracted values
    """
    contracted: Dict[str, str] = {}
    for column, uri in row.items():
        contracted[column] = contract_uri(uri, config)
    return contracted


def contract_uri(uri: str, config: EndpointConfig) -> str:
    """Replace the URI with a CURIE based on the prefix map in the given configuration.

    The first entry of ``config.prefix_map`` whose URI prefix the URI starts with
    is used; entries are tried in the map's iteration order.

    :param uri: A uniform resource identifier
    :param config: Configuration whose ``prefix_map`` maps CURIE prefixes to URI prefixes
    :return: A CURIE if it's able to contract, otherwise return the original URI
        (also returned unchanged when the configuration has no prefix map)
    """
    if config.prefix_map is None:
        return uri
    for prefix, uri_prefix in config.prefix_map.items():
        if uri.startswith(uri_prefix):
            # Strip only the leading namespace. ``str.replace`` would also rewrite
            # any later occurrence of the namespace inside the local part
            # (e.g. a URI embedded in a query string).
            return f"{prefix}:{uri[len(uri_prefix):]}"
    return uri


def expand_curie(curie: str, config: EndpointConfig) -> URIRef:
    """Expand a CURIE to a URI.

    The first entry of ``config.prefix_map`` whose ``"<prefix>:"`` the CURIE
    starts with is used; entries are tried in the map's iteration order.

    :param curie: CURIE
    :param config: Configuration whose ``prefix_map`` maps CURIE prefixes to URI prefixes
    :return: URI of CURIE; the input wrapped unchanged in a ``URIRef`` when the
        configuration has no prefix map or no prefix matches
    """
    if config.prefix_map is None:
        return URIRef(curie)
    for k, v in config.prefix_map.items():
        prefix = f"{k}:"
        if curie.startswith(prefix):
            # Swap only the leading "prefix:". ``str.replace`` would also rewrite
            # any later occurrence of the same text inside the local identifier.
            return URIRef(v + curie[len(prefix):])
    return URIRef(curie)
43 changes: 23 additions & 20 deletions src/sssom/util.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
"""Utility functions."""

import hashlib
import json
import logging
import os
import re
import warnings
from collections import defaultdict
from dataclasses import dataclass, field
from functools import reduce
Expand All @@ -30,12 +32,10 @@
import pandas as pd
import validators
import yaml
from curies import Converter
from jsonschema import ValidationError
from linkml_runtime.linkml_model.types import Uriorcurie
from pandas.errors import EmptyDataError

# from .sssom_datamodel import Mapping as SSSOM_Mapping
# from .sssom_datamodel import slots
from sssom_schema import Mapping as SSSOM_Mapping
from sssom_schema import slots

Expand Down Expand Up @@ -1107,13 +1107,12 @@ def get_prefix_from_curie(curie: str) -> str:
return ""


def curie_from_uri(uri: str, prefix_map: Mapping[str, str]) -> str:
def curie_from_uri(uri: str, prefix_map: Union[Mapping[str, str], Converter]) -> str:
"""Parse a CURIE from an IRI.

:param uri: The URI to parse. If this is already a CURIE, return directly.
:param prefix_map: The prefix map against which the IRI is checked
:return: A CURIE
:raises NoCURIEException: if a CURIE can not be parsed

Example parsing:
>>> m = {"hgnc.genegroup": "https://example.org/hgnc.genegroup:"}
Expand All @@ -1126,21 +1125,10 @@ def curie_from_uri(uri: str, prefix_map: Mapping[str, str]) -> str:
>>> curie_from_uri("hgnc.genegroup:1234", {})
'hgnc.genegroup:1234'
"""
# TODO consider replacing with :func:`bioregistry.curie_from_iri`
# FIXME what if the curie has a subspace in it? RE will fail
if is_curie(uri):
return uri
for prefix in prefix_map:
uri_prefix = prefix_map[prefix]
if uri.startswith(uri_prefix):
remainder = uri.replace(uri_prefix, "")
curie = f"{prefix}:{remainder}"
if is_curie(curie):
return f"{prefix}:{remainder}"
else:
logging.warning(f"{prefix}:{remainder} is not a CURIE ... skipping")
continue
raise NoCURIEException(f"{uri} does not follow any known prefixes")
warnings.warn("Use safe_compress() instead", DeprecationWarning)
if not isinstance(prefix_map, Converter):
converter = Converter.from_prefix_map(prefix_map)
return safe_compress(uri, converter)


def get_prefixes_used_in_table(df: pd.DataFrame) -> List[str]:
Expand Down Expand Up @@ -1601,3 +1589,18 @@ def invert_mappings(
def _invert_column_names(column_names: list, columns_invert_map: dict) -> dict:
    """Build the rename mapping for a pandas DataFrame from the given column names.

    Each name in ``column_names`` is looked up in ``columns_invert_map``; a name
    missing from the map raises ``KeyError``.
    """
    renames = {}
    for name in column_names:
        renames[name] = columns_invert_map[name]
    return renames


def safe_compress(uri: str, converter: Converter) -> str:
    """Compress a URI into a CURIE, or standardize it if it already is a CURIE.

    :param uri: The URI to compress. If :func:`is_curie` recognises it as a CURIE,
        it is passed through ``converter.standardize_curie`` rather than compressed.
    :param converter: Converter used for compression and standardization
    :return: A CURIE
    :raises ValueError: if ``uri`` is recognised as a CURIE but the converter's
        ``standardize_curie`` returns ``None`` for it
    """
    if not is_curie(uri):
        # NOTE(review): compress_strict is expected to raise when no URI prefix
        # in the converter matches — confirm against the curies documentation.
        return converter.compress_strict(uri)
    rv = converter.standardize_curie(uri)
    if rv is None:
        # Say which CURIE failed so the caller can tell what is missing from
        # the converter, instead of raising a bare ValueError.
        raise ValueError(f"CURIE could not be standardized with the given converter: {uri!r}")
    return rv