Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CU-86964zm4d fix preprocessing #496

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 15 additions & 14 deletions medcat/utils/preprocess_snomed.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,15 +265,17 @@ def _determine_bundle(cls, data_path) -> Optional[SupportedBundles]:
return None

def _set_extension(self, release: str, extension: SupportedExtension) -> None:
self.opcs_refset_id = "1126441000000105"
# NOTE: now using the later refset ID by default
# NOTE: the OPCS4 refset ID is only relevant for UK releases
self.opcs_refset_id = '1382401000000109'
if (extension in (SupportedExtension.UK_CLINICAL, SupportedExtension.UK_DRUG) and
# using lexicographical comparison below
# e.g "20240101" > "20231122" results in True
# yet "20231121" > "20231122" results in False
len(release) == len("20231122") and release >= "20231122"):
len(release) == len("20231122") and release < "20231122"):
# NOTE: UK extensions released before 20231122 use the older
# OPCS4 refset ID; releases from 20231122 onwards use the newer one
self.opcs_refset_id = '1382401000000109'
self.opcs_refset_id = "1126441000000105"
self._extension = extension

@classmethod
Expand Down Expand Up @@ -329,7 +331,7 @@ def to_concept_df(self):
contents_path = os.path.join(self.paths[i], PER_FILE_TYPE_PATHS[RefSetFileType.concept])
concept_snapshot = self._extension.value.exp_files.get_concept()
description_snapshot = self._extension.value.exp_files.get_description()
if concept_snapshot in (None, _IGNORE_TAG) or (
if concept_snapshot is None or _IGNORE_TAG in concept_snapshot or (
self.bundle and self.bundle.value.has_invalid(
self._extension, [RefSetFileType.concept, RefSetFileType.description])):
continue
Expand Down Expand Up @@ -404,7 +406,7 @@ def list_all_relationships(self):
contents_path = os.path.join(self.paths[i], PER_FILE_TYPE_PATHS[RefSetFileType.concept])
concept_snapshot = self._extension.value.exp_files.get_concept()
relationship_snapshot = self._extension.value.exp_files.get_relationship()
if concept_snapshot in (None, _IGNORE_TAG) or (
if concept_snapshot is None or _IGNORE_TAG in concept_snapshot or (
self.bundle and self.bundle.value.has_invalid(
self._extension, [RefSetFileType.concept, RefSetFileType.description])):
continue
Expand Down Expand Up @@ -440,7 +442,7 @@ def relationship2json(self, relationshipcode, output_jsonfile):
contents_path = os.path.join(self.paths[i], PER_FILE_TYPE_PATHS[RefSetFileType.concept])
concept_snapshot = self._extension.value.exp_files.get_concept()
relationship_snapshot = self._extension.value.exp_files.get_relationship()
if concept_snapshot in (None, _IGNORE_TAG) or (
if concept_snapshot is None or _IGNORE_TAG in concept_snapshot or (
self.bundle and self.bundle.value.has_invalid(
self._extension, [RefSetFileType.concept, RefSetFileType.description])):
continue
Expand Down Expand Up @@ -476,10 +478,7 @@ def map_snomed2icd10(self):
dict: A dictionary containing the SNOMED CT to ICD-10 mappings including metadata.
"""
snomed2icd10df = self._map_snomed2refset()
if self._extension in (SupportedExtension.UK_CLINICAL, SupportedExtension.UK_DRUG):
return self._refset_df2dict(snomed2icd10df[0])
else:
return self._refset_df2dict(snomed2icd10df)
return self._refset_df2dict(snomed2icd10df[0])

def map_snomed2opcs4(self) -> dict:
"""
Expand All @@ -494,7 +493,8 @@ def map_snomed2opcs4(self) -> dict:
Returns:
dict: A dictionary containing the SNOMED CT to OPCS-4 mappings including metadata.
"""
if self._extension not in (SupportedExtension.UK_CLINICAL, SupportedExtension.UK_DRUG):
if all(ext not in (SupportedExtension.UK_CLINICAL, SupportedExtension.UK_DRUG)
for ext in self.exts):
raise AttributeError(
"OPCS-4 mapping does not exist in this edition")
snomed2opcs4df = self._map_snomed2refset()[1]
Expand Down Expand Up @@ -566,7 +566,7 @@ def _map_snomed2refset(self):
self._set_extension(snomed_release, self.exts[i])
refset_terminology = os.path.join(self.paths[i], PER_FILE_TYPE_PATHS[RefSetFileType.refset])
icd10_ref_set = self._extension.value.exp_files.get_refset()
if icd10_ref_set in (None, _IGNORE_TAG) or (
if icd10_ref_set is None or _IGNORE_TAG in icd10_ref_set or (
self.bundle and self.bundle.value.has_invalid(
self._extension, [RefSetFileType.concept, RefSetFileType.description])):
continue
Expand All @@ -582,13 +582,14 @@ def _map_snomed2refset(self):
dfs2merge.append(icd_mappings)
mapping_df = pd.concat(dfs2merge)
del dfs2merge
if self._extension in (SupportedExtension.UK_CLINICAL, SupportedExtension.UK_DRUG):
if any(ext in (SupportedExtension.UK_CLINICAL, SupportedExtension.UK_DRUG)
for ext in self.exts):
opcs_df = mapping_df[mapping_df['refsetId'] == self.opcs_refset_id]
icd10_df = mapping_df[mapping_df['refsetId']
== '999002271000000101']
return icd10_df, opcs_df
else:
return mapping_df
return mapping_df, None


class UnkownSnomedReleaseException(ValueError):
Expand Down
8 changes: 8 additions & 0 deletions tests/utils/test_preprocess_snomed.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def test_example_no_codfe_fails(self):


EXAMPLE_SNOMED_PATH_OLD = "SnomedCT_InternationalRF2_PRODUCTION_20220831T120000Z"
EXAMPLE_SNOMED_PATH_OLD_UK = "SnomedCT_UKClinicalRF2_PRODUCTION_20220831T120000Z"
EXAMPLE_SNOMED_PATH_NEW = "SnomedCT_UKClinicalRF2_PRODUCTION_20231122T000001Z"


Expand Down Expand Up @@ -87,6 +88,13 @@ def test_old_gets_old_OPCS4_mapping(self):
snomed = preprocess_snomed.Snomed(EXAMPLE_SNOMED_PATH_OLD)
snomed._set_extension(snomed._determine_release(EXAMPLE_SNOMED_PATH_OLD),
snomed._determine_extension(EXAMPLE_SNOMED_PATH_OLD))
self.assertEqual(snomed.opcs_refset_id, "1382401000000109") # defaults to this now

def test_old_gets_old_OPCS4_mapping_UK(self):
with patch_fake_files(EXAMPLE_SNOMED_PATH_OLD_UK):
snomed = preprocess_snomed.Snomed(EXAMPLE_SNOMED_PATH_OLD_UK)
snomed._set_extension(snomed._determine_release(EXAMPLE_SNOMED_PATH_OLD_UK),
snomed._determine_extension(EXAMPLE_SNOMED_PATH_OLD_UK))
self.assertEqual(snomed.opcs_refset_id, "1126441000000105")

def test_new_gets_new_OCPS4_mapping(self):
Expand Down
Loading