diff --git a/src/sssom/context.py b/src/sssom/context.py index 343ef613..02ff9d96 100644 --- a/src/sssom/context.py +++ b/src/sssom/context.py @@ -52,10 +52,32 @@ def _get_built_in_prefix_map() -> Converter: HINT = Union[None, PrefixMap, Converter] -def ensure_converter(prefix_map: HINT = None) -> Converter: - """Ensure a converter is available.""" +def ensure_converter(prefix_map: HINT = None, *, use_defaults: bool = True) -> Converter: + """Ensure a converter is available. + + :param prefix_map: One of the following: + + 1. An empty dictionary or ``None``. This results in using the default + extended prefix map (currently based on a variant of the Bioregistry) + if ``use_defaults`` is set to true, otherwise just the builtin prefix + map including the prefixes in :data:`SSSOM_BUILT_IN_PREFIXES` + 2. A non-empty dictionary representing a prefix map. This is loaded as a + converter with :meth:`Converter.from_prefix_map`. It is chained + behind the builtin prefix map to ensure none of the + :data:`SSSOM_BUILT_IN_PREFIXES` are overwritten with non-default values + 3. A pre-instantiated :class:`curies.Converter`. Similarly to a prefix + map passed into this function, this is chained behind the builtin prefix + map + :param use_defaults: If an empty dictionary or None is passed to this function, + this parameter chooses if the extended prefix map (currently based on a + variant of the Bioregistry) gets loaded. 
+ :returns: A re-usable converter + """ if not prefix_map: - return get_converter() + if use_defaults: + return get_converter() + else: + return _get_built_in_prefix_map() if not isinstance(prefix_map, Converter): prefix_map = Converter.from_prefix_map(prefix_map) return curies.chain([_get_built_in_prefix_map(), prefix_map]) diff --git a/src/sssom/parsers.py b/src/sssom/parsers.py index b2d7cca7..2760a8f2 100644 --- a/src/sssom/parsers.py +++ b/src/sssom/parsers.py @@ -6,12 +6,13 @@ import logging as _logging import re import typing -from collections import Counter +from collections import ChainMap, Counter from pathlib import Path from typing import Any, Callable, Dict, Iterable, List, Optional, TextIO, Tuple, Union, cast from xml.dom import Node, minidom from xml.dom.minidom import Document +import curies import numpy as np import pandas as pd import requests @@ -53,9 +54,9 @@ SSSOMSchemaView, ) -from .context import HINT, ensure_converter +from .context import HINT, _get_built_in_prefix_map, ensure_converter from .sssom_document import MappingSetDocument -from .typehints import Metadata, MetadataType, generate_mapping_set_id +from .typehints import Metadata, MetadataType, generate_mapping_set_id, get_default_metadata from .util import ( PREFIX_MAP_KEY, SSSOM_DEFAULT_RDF_SERIALISATION, @@ -193,44 +194,36 @@ def parse_sssom_table( stream: io.StringIO = _open_input(file_path) sep_new = _get_seperator_symbol_from_file_path(file_path) df, sssom_metadata = _read_pandas_and_metadata(stream, sep_new) - # if mapping_predicates: - # # Filter rows based on presence of predicate_id list provided. - # df = df[df["predicate_id"].isin(mapping_predicates)] - - # If SSSOM external metadata is provided, merge it with the internal metadata - - if sssom_metadata: - if meta: - for k, v in meta.items(): - if k in sssom_metadata: - if sssom_metadata[k] != v: - logging.warning( - f"SSSOM internal metadata {k} ({sssom_metadata[k]}) " - f"conflicts with provided ({meta[k]})." 
- ) - else: - logging.info(f"Externally provided metadata {k}:{v} is added to metadata set.") - sssom_metadata[k] = v - meta = sssom_metadata - - if PREFIX_MAP_KEY in sssom_metadata: - if prefix_map: - for k, v in prefix_map.items(): - if k in sssom_metadata[CURIE_MAP]: - if sssom_metadata[CURIE_MAP][k] != v: - logging.warning( - f"SSSOM prefix map {k} ({sssom_metadata[CURIE_MAP][k]}) " - f"conflicts with provided ({prefix_map[k]})." - ) - else: - logging.info( - f"Externally provided metadata {k}:{v} is added to metadata set." - ) - sssom_metadata[CURIE_MAP][k] = v - prefix_map = sssom_metadata[CURIE_MAP] + if meta is None: + meta = {} + + # The priority order for combining prefix maps is: + # 1. Built-in prefix map + # 2. Internal prefix map inside the document + # 3. Prefix map passed through this function inside the ``meta`` + # 4. Prefix map passed through this function to ``prefix_map`` (handled with ensure_converter) + converter = curies.chain( + [ + _get_built_in_prefix_map(), + Converter.from_prefix_map(sssom_metadata.pop(CURIE_MAP, {})), + Converter.from_prefix_map(meta.pop(CURIE_MAP, {})), + ensure_converter(prefix_map, use_defaults=False), + ] + ) + + # The priority order for combining metadata is: + # 1. Metadata appearing in the SSSOM document + # 2. Metadata passed through ``meta`` to this function + # 3. 
Default metadata + combine_meta = dict( + ChainMap( + sssom_metadata, + meta, + get_default_metadata(), + ) + ) - meta_all = _get_prefix_map_and_metadata(prefix_map=prefix_map, meta=meta) - msdf = from_sssom_dataframe(df, prefix_map=meta_all.prefix_map, meta=meta_all.metadata) + msdf = from_sssom_dataframe(df, prefix_map=converter, meta=combine_meta) return msdf diff --git a/src/sssom/writers.py b/src/sssom/writers.py index a7afb0ef..b50c56d9 100644 --- a/src/sssom/writers.py +++ b/src/sssom/writers.py @@ -441,7 +441,7 @@ def to_fhir_json(msdf: MappingSetDataFrame) -> Dict: def to_json(msdf: MappingSetDataFrame) -> JsonObj: """Convert a mapping set dataframe to a JSON object.""" doc = to_mapping_set_document(msdf) - data = JSONDumper().dumps(doc.mapping_set, contexts=doc.prefix_map) + data = JSONDumper().dumps(doc.mapping_set, contexts={"@context": doc.prefix_map}) json_obj = json.loads(data) return json_obj diff --git a/tests/test_conversion.py b/tests/test_conversion.py index 60b5942b..faeae644 100644 --- a/tests/test_conversion.py +++ b/tests/test_conversion.py @@ -146,9 +146,10 @@ def _test_to_dataframe(self, mdoc, test): test.ct_data_frame_rows, f"The pandas data frame has less elements than the orginal one for {test.filename}", ) - df.to_csv(test.get_out_file("roundtrip.tsv"), sep="\t") - # data = pd.read_csv(test.get_out_file("roundtrip.tsv"), sep="\t") - data = parse_sssom_table(test.get_out_file("roundtrip.tsv")).df + path = test.get_out_file("roundtrip.tsv") + with open(path, "w") as file: + write_table(msdf, file=file) + data = parse_sssom_table(path).df self.assertEqual( len(data), test.ct_data_frame_rows, diff --git a/tests/test_parsers.py b/tests/test_parsers.py index 63dd505f..b2d6914e 100644 --- a/tests/test_parsers.py +++ b/tests/test_parsers.py @@ -4,7 +4,9 @@ import json import math import os +import tempfile import unittest +from pathlib import Path from xml.dom import minidom import numpy as np @@ -12,8 +14,12 @@ import yaml from rdflib import 
Graph +from sssom.constants import CURIE_MAP, DEFAULT_LICENSE, SSSOM_URI_PREFIX +from sssom.context import SSSOM_BUILT_IN_PREFIXES, ensure_converter from sssom.io import parse_file from sssom.parsers import ( + _open_input, + _read_pandas_and_metadata, from_alignment_minidom, from_obographs, from_sssom_dataframe, @@ -22,7 +28,7 @@ parse_sssom_table, ) from sssom.typehints import Metadata -from sssom.util import PREFIX_MAP_KEY, sort_df_rows_columns +from sssom.util import PREFIX_MAP_KEY, MappingSetDataFrame, sort_df_rows_columns from sssom.writers import write_table from tests.test_data import data_dir as test_data_dir from tests.test_data import test_out_dir @@ -245,3 +251,75 @@ def test_parse_obographs_merged(self): ) msdf = parse_sssom_table(outfile) self.assertTrue(custom_curie_map.items() <= msdf.prefix_map.items()) + + +class TestParseExplicit(unittest.TestCase): + """This test case contains explicit tests for parsing.""" + + def test_round_trip(self): + """Explicitly test round tripping.""" + rows = [ + ( + "DOID:0050601", + "ADULT syndrome", + "skos:exactMatch", + "UMLS:C1863204", + "ADULT SYNDROME", + "semapv:ManualMappingCuration", + "orcid:0000-0003-4423-4370", + ) + ] + columns = [ + "subject_id", + "subject_label", + "predicate_id", + "object_id", + "object_label", + "mapping_justification", + "creator_id", + ] + df = pd.DataFrame(rows, columns=columns) + msdf = MappingSetDataFrame(df=df, converter=ensure_converter()) + msdf.clean_prefix_map(strict=True) + #: This is a set of the prefixes that explicitly are used in this + #: example. 
SSSOM-py also adds the remaining builtin prefixes from + #: :data:`sssom.context.SSSOM_BUILT_IN_PREFIXES`, which is reflected + #: in the formulation of the test expectation below + explicit_prefixes = {"DOID", "semapv", "orcid", "skos", "UMLS"} + self.assertEqual( + explicit_prefixes.union(SSSOM_BUILT_IN_PREFIXES), + set(msdf.prefix_map), + ) + + with tempfile.TemporaryDirectory() as directory: + directory = Path(directory) + path = directory.joinpath("test.sssom.tsv") + with path.open("w") as file: + write_table(msdf, file) + + _, read_metadata = _read_pandas_and_metadata(_open_input(path)) + reconsitited_msdf = parse_sssom_table(path) + + # This tests what's actually in the file after it's written out + self.assertEqual({CURIE_MAP, "license", "mapping_set_id"}, set(read_metadata)) + self.assertEqual(DEFAULT_LICENSE, read_metadata["license"]) + self.assertTrue(read_metadata["mapping_set_id"].startswith(f"{SSSOM_URI_PREFIX}mappings/")) + + expected_prefix_map = { + "DOID": "http://purl.obolibrary.org/obo/DOID_", + "UMLS": "http://linkedlifedata.com/resource/umls/id/", + "orcid": "https://orcid.org/", + "owl": "http://www.w3.org/2002/07/owl#", + "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#", + "rdfs": "http://www.w3.org/2000/01/rdf-schema#", + "semapv": "https://w3id.org/semapv/vocab/", + "skos": "http://www.w3.org/2004/02/skos/core#", + "sssom": "https://w3id.org/sssom/", + } + self.assertEqual( + expected_prefix_map, + read_metadata[CURIE_MAP], + ) + + # This checks that nothing funny gets added unexpectedly + self.assertEqual(expected_prefix_map, reconsitited_msdf.prefix_map) diff --git a/tests/test_validate.py b/tests/test_validate.py index e267586a..871edead 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -29,6 +29,22 @@ def test_validate_json(self): """ self.assertIsNone(validate(self.correct_msdf1, self.validation_types)) + @unittest.skip( + reason="""\ + + This test did not previously do what was expected. 
It was raising a validation error + not because of the text below suggesting the validator was able to identify an issue + with the `mapping_justification` slot, but because `orcid` was missing from the prefix map. + The error actually thrown was:: + + jsonschema.exceptions.ValidationError: The prefixes in {'orcid'} are missing from 'curie_map'. + + With updates in https://github.com/mapping-commons/sssom-py/pull/431, the default prefix map + which includes `orcid` is added on parse, and this error goes away. Therefore, this test + now fails, but again, this is a sporadic failure since the test was not correct in the first + place. Therefore, this test is now skipped and marked for FIXME. + """ + ) def test_validate_json_fail(self): """ Test if JSONSchemaValidation fail is as expected.