def _map_priority_sort_key(el: dict) -> tuple:
    """Build the sort key used to order refset entries by 'mapPriority'.

    Integer-like strings are converted to ``int`` so that e.g. '10' ranks
    above '9' (a plain string comparison would rank '10' below '9').
    Strings that are not integers keep their string ordering and rank
    below every numeric priority, so such input does not raise.

    Args:
        el (dict): A refset entry with (at least) a 'mapPriority' key.

    Returns:
        tuple: A key of the form (is_numeric, numeric_value, text_value).

    Raises:
        KeyError: If the entry has no 'mapPriority' key.
    """
    priority = el['mapPriority']
    if isinstance(priority, str):
        try:
            priority = int(priority)
        except ValueError:
            # not an integer-like string: fall back to string ordering
            return (0, 0, priority)
    # non-string values (e.g. ints) are compared as they are
    return (1, priority, '')


def get_direct_refset_mapping(in_dict: dict) -> dict:
    """Map each SNOMED CUI to its prioritised list of target ontology CUIs.

    This method uses the output from Snomed.map_snomed2icd10 or
    Snomed.map_snomed2opcs4, removes the metadata and maps each
    SNOMED CUI to the prioritised list of the target ontology CUIs.

    The input dict is expected to be in the following format:
        - Keys are SnomedCT CUIs
        - The values are lists of dictionaries, each list item (at least)
            - Has a key 'code' that specifies the target ontology CUI
            - Has a key 'mapPriority' that specifies the priority

    Entries are ordered from the largest 'mapPriority' to the smallest.
    Priorities given as integer-like strings are compared numerically
    (so '10' comes before '9'); entries with equal priority keep their
    input order.

    NOTE(review): this treats a larger 'mapPriority' as the higher
    priority - confirm this matches the convention of the source refset.

    Args:
        in_dict (dict): The input dict.

    Returns:
        dict: The map from Snomed CUI to the prioritised list of target
            ontology CUIs.

    Raises:
        KeyError: If a list item lacks the 'mapPriority' or 'code' key.
    """
    return {
        # sort such that highest priority values are first,
        # then only keep the code / CUI
        cui: [entry['code'] for entry in
              sorted(entries, key=_map_priority_sort_key, reverse=True)]
        for cui, entries in in_dict.items()
    }
@@ -53,6 +79,15 @@ def __init__(self, data_path, uk_ext=False, uk_drug_ext=False): self.release = data_path[-16:-8] self.uk_ext = uk_ext self.uk_drug_ext = uk_drug_ext + self.opcs_refset_id = "1126441000000105" + if ((self.uk_ext or self.uk_drug_ext) and + # using lexicographical comparison below + # e.g "20240101" > "20231122" results in True + # yet "20231121" > "20231122" reults in False + len(self.release) == len("20231122") and self.release >= "20231122"): + # NOTE for UK extensions starting from 20231122 the + # OPCS4 refset ID seems to be different + self.opcs_refset_id = '1382401000000109' def to_concept_df(self): """ @@ -398,7 +433,7 @@ def _map_snomed2refset(self): mapping_df = pd.concat(dfs2merge) del dfs2merge if self.uk_ext or self.uk_drug_ext: - opcs_df = mapping_df[mapping_df['refsetId'] == '1126441000000105'] + opcs_df = mapping_df[mapping_df['refsetId'] == self.opcs_refset_id] icd10_df = mapping_df[mapping_df['refsetId'] == '999002271000000101'] return icd10_df, opcs_df diff --git a/tests/utils/test_preprocess_snomed.py b/tests/utils/test_preprocess_snomed.py new file mode 100644 index 000000000..59a00f6fc --- /dev/null +++ b/tests/utils/test_preprocess_snomed.py @@ -0,0 +1,64 @@ +from typing import Dict +from medcat.utils import preprocess_snomed + +import unittest + + +EXAMPLE_REFSET_DICT: Dict = { + 'SCUI1': [ + {'code': 'TCUI1', 'mapPriority': '1'}, + {'code': 'TCUI2', 'mapPriority': '2'}, + {'code': 'TCUI3', 'mapPriority': '3'}, + ] +} + +# in order from highest priority to lowest +EXPECTED_DIRECT_MAPPINGS = {"SCUI1": ['TCUI3', 'TCUI2', 'TCUI1']} + +EXAMPLE_REFSET_DICT_WITH_EXTRAS = dict( + (k, [dict(v, otherKey=f"val-{k}") for v in vals]) for k, vals in EXAMPLE_REFSET_DICT.items()) + +EXAMPLE_REFSET_DICT_NO_PRIORITY = dict( + (k, [{ik: iv for ik, iv in v.items() if ik != 'mapPriority'} for v in vals]) for k, vals in EXAMPLE_REFSET_DICT.items() +) + +EXAMPLE_REFSET_DICT_NO_CODE = dict( + (k, [{ik: iv for ik, iv in v.items() if ik != 'code'} 
for v in vals]) for k, vals in EXAMPLE_REFSET_DICT.items() +) + + +class DirectMappingTest(unittest.TestCase): + + def test_example_gets_direct_mappings(self): + res = preprocess_snomed.get_direct_refset_mapping(EXAMPLE_REFSET_DICT) + self.assertEqual(res, EXPECTED_DIRECT_MAPPINGS) + + def test_example_w_extras_gets_direct_mappings(self): + res = preprocess_snomed.get_direct_refset_mapping(EXAMPLE_REFSET_DICT_WITH_EXTRAS) + self.assertEqual(res, EXPECTED_DIRECT_MAPPINGS) + + def test_example_no_priority_fails(self): + with self.assertRaises(KeyError): + preprocess_snomed.get_direct_refset_mapping(EXAMPLE_REFSET_DICT_NO_PRIORITY) + + def test_example_no_codfe_fails(self): + with self.assertRaises(KeyError): + preprocess_snomed.get_direct_refset_mapping(EXAMPLE_REFSET_DICT_NO_CODE) + +EXAMPLE_SNOMED_PATH_OLD = "SnomedCT_InternationalRF2_PRODUCTION_20220831T120000Z" +EXAMPLE_SNOMED_PATH_NEW = "SnomedCT_UKClinicalRF2_PRODUCTION_20231122T000001Z" + + +class TestSnomedVersionsOPCS4(unittest.TestCase): + + def test_old_gets_old_OPCS4_mapping_nonuk_ext(self): + snomed = preprocess_snomed.Snomed(EXAMPLE_SNOMED_PATH_OLD, uk_ext=False) + self.assertEqual(snomed.opcs_refset_id, "1126441000000105") + + def test_old_gets_old_OPCS4_mapping_uk_ext(self): + snomed = preprocess_snomed.Snomed(EXAMPLE_SNOMED_PATH_OLD, uk_ext=True) + self.assertEqual(snomed.opcs_refset_id, "1126441000000105") + + def test_new_gets_new_OCPS4_mapping_uk_ext(self): + snomed = preprocess_snomed.Snomed(EXAMPLE_SNOMED_PATH_NEW, uk_ext=True) + self.assertEqual(snomed.opcs_refset_id, "1382401000000109")