From ab88b2281ecbeea91a5f5b0440c24abf76902871 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 10 Oct 2024 11:22:45 +0100 Subject: [PATCH 1/5] CU-86964zm4d: Use ignore tag correctly to ignore certain parts of UK release --- medcat/utils/preprocess_snomed.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/medcat/utils/preprocess_snomed.py b/medcat/utils/preprocess_snomed.py index 60bbb699..380f2c3f 100644 --- a/medcat/utils/preprocess_snomed.py +++ b/medcat/utils/preprocess_snomed.py @@ -329,7 +329,7 @@ def to_concept_df(self): contents_path = os.path.join(self.paths[i], PER_FILE_TYPE_PATHS[RefSetFileType.concept]) concept_snapshot = self._extension.value.exp_files.get_concept() description_snapshot = self._extension.value.exp_files.get_description() - if concept_snapshot in (None, _IGNORE_TAG) or ( + if concept_snapshot is None or _IGNORE_TAG in concept_snapshot or ( self.bundle and self.bundle.value.has_invalid( self._extension, [RefSetFileType.concept, RefSetFileType.description])): continue @@ -404,7 +404,7 @@ def list_all_relationships(self): contents_path = os.path.join(self.paths[i], PER_FILE_TYPE_PATHS[RefSetFileType.concept]) concept_snapshot = self._extension.value.exp_files.get_concept() relationship_snapshot = self._extension.value.exp_files.get_relationship() - if concept_snapshot in (None, _IGNORE_TAG) or ( + if concept_snapshot is None or _IGNORE_TAG in concept_snapshot or ( self.bundle and self.bundle.value.has_invalid( self._extension, [RefSetFileType.concept, RefSetFileType.description])): continue @@ -440,7 +440,7 @@ def relationship2json(self, relationshipcode, output_jsonfile): contents_path = os.path.join(self.paths[i], PER_FILE_TYPE_PATHS[RefSetFileType.concept]) concept_snapshot = self._extension.value.exp_files.get_concept() relationship_snapshot = self._extension.value.exp_files.get_relationship() - if concept_snapshot in (None, _IGNORE_TAG) or ( + if concept_snapshot is None or _IGNORE_TAG in concept_snapshot or ( 
self.bundle and self.bundle.value.has_invalid( self._extension, [RefSetFileType.concept, RefSetFileType.description])): continue @@ -566,7 +566,7 @@ def _map_snomed2refset(self): self._set_extension(snomed_release, self.exts[i]) refset_terminology = os.path.join(self.paths[i], PER_FILE_TYPE_PATHS[RefSetFileType.refset]) icd10_ref_set = self._extension.value.exp_files.get_refset() - if icd10_ref_set in (None, _IGNORE_TAG) or ( + if icd10_ref_set is None or _IGNORE_TAG in icd10_ref_set or ( self.bundle and self.bundle.value.has_invalid( self._extension, [RefSetFileType.concept, RefSetFileType.description])): continue From ee9dda13e1200746213eb012bc9c71314b14d919 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 10 Oct 2024 11:28:53 +0100 Subject: [PATCH 2/5] CU-86964zm4d: Use OPCS4 later refset ID by default (and switch to older if needed) --- medcat/utils/preprocess_snomed.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/medcat/utils/preprocess_snomed.py b/medcat/utils/preprocess_snomed.py index 380f2c3f..d045824f 100644 --- a/medcat/utils/preprocess_snomed.py +++ b/medcat/utils/preprocess_snomed.py @@ -265,15 +265,16 @@ def _determine_bundle(cls, data_path) -> Optional[SupportedBundles]: return None def _set_extension(self, release: str, extension: SupportedExtension) -> None: - self.opcs_refset_id = "1126441000000105" + # NOTE: now using the later refset IF by default + self.opcs_refset_id = '1382401000000109' if (extension in (SupportedExtension.UK_CLINICAL, SupportedExtension.UK_DRUG) and # using lexicographical comparison below # e.g "20240101" > "20231122" results in True # yet "20231121" > "20231122" results in False - len(release) == len("20231122") and release >= "20231122"): + len(release) == len("20231122") and release < "20231122"): # NOTE for UK extensions starting from 20231122 the # OPCS4 refset ID seems to be different - self.opcs_refset_id = '1382401000000109' + self.opcs_refset_id = "1126441000000105" self._extension = 
extension @classmethod From 7c8d5f262d6788dd3849fd8bce57e47577049d75 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 10 Oct 2024 16:05:19 +0100 Subject: [PATCH 3/5] CU-86964zm4d: Fix OPCS4 refset ID tests. Fix the default value being tested for (i.e in case of international release that'll be shown). Add a test for old UK extension. --- tests/utils/test_preprocess_snomed.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/utils/test_preprocess_snomed.py b/tests/utils/test_preprocess_snomed.py index a133acdf..d7c2f662 100644 --- a/tests/utils/test_preprocess_snomed.py +++ b/tests/utils/test_preprocess_snomed.py @@ -51,6 +51,7 @@ def test_example_no_codfe_fails(self): EXAMPLE_SNOMED_PATH_OLD = "SnomedCT_InternationalRF2_PRODUCTION_20220831T120000Z" +EXAMPLE_SNOMED_PATH_OLD_UK = "SnomedCT_UKClinicalRF2_PRODUCTION_20220831T120000Z" EXAMPLE_SNOMED_PATH_NEW = "SnomedCT_UKClinicalRF2_PRODUCTION_20231122T000001Z" @@ -87,6 +88,13 @@ def test_old_gets_old_OPCS4_mapping(self): snomed = preprocess_snomed.Snomed(EXAMPLE_SNOMED_PATH_OLD) snomed._set_extension(snomed._determine_release(EXAMPLE_SNOMED_PATH_OLD), snomed._determine_extension(EXAMPLE_SNOMED_PATH_OLD)) + self.assertEqual(snomed.opcs_refset_id, "1382401000000109") # defaults to this now + + def test_old_gets_old_OPCS4_mapping_UK(self): + with patch_fake_files(EXAMPLE_SNOMED_PATH_OLD_UK): + snomed = preprocess_snomed.Snomed(EXAMPLE_SNOMED_PATH_OLD_UK) + snomed._set_extension(snomed._determine_release(EXAMPLE_SNOMED_PATH_OLD_UK), + snomed._determine_extension(EXAMPLE_SNOMED_PATH_OLD_UK)) self.assertEqual(snomed.opcs_refset_id, "1126441000000105") def test_new_gets_new_OCPS4_mapping(self): From a126c8127d185e9f0aee9aaee5d177e15b8e4096 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 10 Oct 2024 16:07:18 +0100 Subject: [PATCH 4/5] CU-86964zm4d: Add note regarding OPCS refset ID relevance only for UK extensions. 
--- medcat/utils/preprocess_snomed.py | 1 + 1 file changed, 1 insertion(+) diff --git a/medcat/utils/preprocess_snomed.py b/medcat/utils/preprocess_snomed.py index d045824f..1310b182 100644 --- a/medcat/utils/preprocess_snomed.py +++ b/medcat/utils/preprocess_snomed.py @@ -266,6 +266,7 @@ def _determine_bundle(cls, data_path) -> Optional[SupportedBundles]: def _set_extension(self, release: str, extension: SupportedExtension) -> None: # NOTE: now using the later refset IF by default + # NOTE: the OPCS4 refset ID is only relevant for UK releases self.opcs_refset_id = '1382401000000109' if (extension in (SupportedExtension.UK_CLINICAL, SupportedExtension.UK_DRUG) and # using lexicographical comparison below From de2e4cfa85e7c65616646691e47a3fab5a79610c Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 10 Oct 2024 17:11:00 +0100 Subject: [PATCH 5/5] CU-86964zm4d: Fix checking of extension outside loops. I.e. determine if a UK release/bundle is used for OPCS4/ICD10 mappings splitting. Always returning separate refsets for ICD10 and OPCS4 internally, even if the latter is None. --- medcat/utils/preprocess_snomed.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/medcat/utils/preprocess_snomed.py b/medcat/utils/preprocess_snomed.py index 1310b182..dc6f4c3a 100644 --- a/medcat/utils/preprocess_snomed.py +++ b/medcat/utils/preprocess_snomed.py @@ -478,10 +478,7 @@ def map_snomed2icd10(self): dict: A dictionary containing the SNOMED CT to ICD-10 mappings including metadata. """ snomed2icd10df = self._map_snomed2refset() - if self._extension in (SupportedExtension.UK_CLINICAL, SupportedExtension.UK_DRUG): - return self._refset_df2dict(snomed2icd10df[0]) - else: - return self._refset_df2dict(snomed2icd10df) + return self._refset_df2dict(snomed2icd10df[0]) def map_snomed2opcs4(self) -> dict: """ @@ -496,7 +493,8 @@ def map_snomed2opcs4(self) -> dict: Returns: dict: A dictionary containing the SNOMED CT to OPCS-4 mappings including metadata.
""" - if self._extension not in (SupportedExtension.UK_CLINICAL, SupportedExtension.UK_DRUG): + if all(ext not in (SupportedExtension.UK_CLINICAL, SupportedExtension.UK_DRUG) + for ext in self.exts): raise AttributeError( "OPCS-4 mapping does not exist in this edition") snomed2opcs4df = self._map_snomed2refset()[1] @@ -584,13 +582,14 @@ def _map_snomed2refset(self): dfs2merge.append(icd_mappings) mapping_df = pd.concat(dfs2merge) del dfs2merge - if self._extension in (SupportedExtension.UK_CLINICAL, SupportedExtension.UK_DRUG): + if any(ext in (SupportedExtension.UK_CLINICAL, SupportedExtension.UK_DRUG) + for ext in self.exts): opcs_df = mapping_df[mapping_df['refsetId'] == self.opcs_refset_id] icd10_df = mapping_df[mapping_df['refsetId'] == '999002271000000101'] return icd10_df, opcs_df else: - return mapping_df + return mapping_df, None class UnkownSnomedReleaseException(ValueError):