From 1c3628d4a87a733f185f6fe75a53329d2752fdea Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Wed, 19 Jun 2024 14:38:00 +0100 Subject: [PATCH] CU-8694u3yd2 cleanup name removal (#450) * CU-8694u3yd2: Add logged warning for when using full-unlink * CU-8694u3yd2: Make CDB.remove_names simply expect an iterable of names * CU-8694u3yd2: Improve CDB.remove_names doc string * CU-8694u3yd2: Explicitly pass the keys to CDB.remove_names in CAT.unlink_concept_name * CU-8694u3yd2: Add note regarding state (and order) dependent tests to some CDB maker tests * CU-8694u3yd2: Rename/make protected CDB.remove_names method * CU-8694u3yd2: Create deprecated CDB.remove_names method --- medcat/cat.py | 7 +++++-- medcat/cdb.py | 15 ++++++++++----- tests/test_cdb_maker.py | 20 +++++++++++++++++++- 3 files changed, 34 insertions(+), 8 deletions(-) diff --git a/medcat/cat.py b/medcat/cat.py index 268f9cd8b..d11a534e6 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -689,13 +689,16 @@ def unlink_concept_name(self, cui: str, name: str, preprocessed_name: bool = Fal names = prepare_name(name, self.pipe.spacy_nlp, {}, self.config) # If full unlink find all CUIs - if self.config.general.get('full_unlink', False): + if self.config.general.full_unlink: + logger.warning("In the config `full_unlink` is set to `True`. " + "Thus removing all CUIs linked to the specified name" + " (%s)", name) for n in names: cuis.extend(self.cdb.name2cuis.get(n, [])) # Remove name from all CUIs for c in cuis: - self.cdb.remove_names(cui=c, names=names) + self.cdb._remove_names(cui=c, names=names.keys()) def add_and_train_concept(self, cui: str, diff --git a/medcat/cdb.py b/medcat/cdb.py index 3f482a67c..2047681f4 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -5,7 +5,7 @@ import logging import aiofiles import numpy as np -from typing import Dict, Set, Optional, List, Union, cast +from typing import Dict, Set, Optional, List, Union, cast, Iterable import os from medcat import __version__ @@ -148,7 +148,12 @@ def update_cui2average_confidence(self, cui: str, new_sim: float) -> None: (self.cui2count_train.get(cui, 0) + 1) self.is_dirty = True - def remove_names(self, cui: str, names: Dict[str, Dict]) -> None: + @deprecated("Deprecated. For internal use only. Use CAT.unlink_concept_name instead", + depr_version=(1, 12, 0), removal_version=(1, 13, 0)) + def remove_names(self, cui: str, names: Iterable[str]) -> None: + self._remove_names(cui, names) + + def _remove_names(self, cui: str, names: Iterable[str]) -> None: """Remove names from an existing concept - effect is this name will never again be used to link to this concept. This will only remove the name from the linker (namely name2cuis and name2cuis2status), the name will still be present everywhere else. Why? Because it is bothersome to remove it from everywhere, but @@ -157,10 +162,10 @@ def remove_names(self, cui: str, names: Dict[str, Dict]) -> None: Args: cui (str): Concept ID or unique identifer in this database. - names (Dict[str, Dict]): - Names to be removed, should look like: `{'name': {'tokens': tokens, 'snames': snames, 'raw_name': raw_name}, ...}` + names (Iterable[str]): + Names to be removed (e.g list, set, or even a dict (in which case keys will be used)). """ - for name in names.keys(): + for name in names: if name in self.name2cuis: if cui in self.name2cuis[name]: self.name2cuis[name].remove(cui) diff --git a/tests/test_cdb_maker.py b/tests/test_cdb_maker.py index f84e47b15..f454ebe7d 100644 --- a/tests/test_cdb_maker.py +++ b/tests/test_cdb_maker.py @@ -132,6 +132,24 @@ def setUpClass(cls): def tearDownClass(cls) -> None: cls.maker.destroy_pipe() + # NOTE: The following tests are state-dependent. That is to say, + # if the order in which they're executed changes, they may fail. + # They currently rely on the fact that test methods are executed + # in anlphabetic order. But this is overall not good test design + # since failure of one unit could lead to the failure of another + # in unsexpected ways (since there's an expectation on the state). + # + # e.g, if I run: + # python -m unittest\ + # tests.test_cdb_maker.B_CDBMakerEditTests.test_bd_addition_of_context_vector_positive\ + # tests.test_cdb_maker.B_CDBMakerEditTests.test_bc_filter_by_cui\ + # tests.test_cdb_maker.B_CDBMakerEditTests.test_bb_removal_of_name\ + # tests.test_cdb_maker.B_CDBMakerEditTests.test_ba_addition_of_new_name + # Then there will be failures in `test_ba_addition_of_new_name` and `test_bb_removal_of_name` + # due to the changes in state. + # + # Though to make it clear, in the standard configuration the tests run in the + # "correct" order and are successful. def test_ba_addition_of_new_name(self): self.cdb.add_names(cui='C0000239', names=prepare_name('MY: new,-_! Name.', self.maker.pipe.spacy_nlp, {}, self.config), name_status='P', full_build=True) self.assertEqual(len(self.cdb.name2cuis), 6, "Should equal 6") @@ -142,7 +160,7 @@ def test_ba_addition_of_new_name(self): self.assertIn('my~:~new~name~.', self.cdb.name2cuis2status) def test_bb_removal_of_name(self): - self.cdb.remove_names(cui='C0000239', names=prepare_name('MY: new,-_! Name.', self.maker.pipe.spacy_nlp, {}, self.config)) + self.cdb._remove_names(cui='C0000239', names=prepare_name('MY: new,-_! Name.', self.maker.pipe.spacy_nlp, {}, self.config)) self.assertEqual(len(self.cdb.name2cuis), 5, "Should equal 5") self.assertNotIn('my:newname.', self.cdb.name2cuis2status)