From b8255020c39e9aa0bac80d1908be0b3bc7b24b39 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 1 Nov 2023 16:27:27 +0000 Subject: [PATCH 1/9] CU-8692zguyq: Slight simplification of minimum-name-length logic --- medcat/preprocessing/cleaners.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/medcat/preprocessing/cleaners.py b/medcat/preprocessing/cleaners.py index 18314d562..43e8098e2 100644 --- a/medcat/preprocessing/cleaners.py +++ b/medcat/preprocessing/cleaners.py @@ -48,7 +48,8 @@ def prepare_name(raw_name: str, nlp: Language, names: Dict, config: Config) -> D snames = set() name = config.general['separator'].join(tokens) - if not config.cdb_maker.get('min_letters_required', 0) or len(re.sub("[^A-Za-z]*", '', name)) >= config.cdb_maker.get('min_letters_required', 0): + min_letters = config.cdb_maker.get('min_letters_required', 0) + if not min_letters or len(re.sub("[^A-Za-z]*", '', name)) >= min_letters: if name not in names: sname = "" for token in tokens: From 4ad40a5ae8d5ba9ee9686ce4cae70735e5614298 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 1 Nov 2023 16:28:27 +0000 Subject: [PATCH 2/9] CU-8692zguyq: Add some tests for prepare_name preprocessor --- tests/preprocessing/__init__.py | 0 tests/preprocessing/test_cleaners.py | 104 +++++++++++++++++++++++++++ 2 files changed, 104 insertions(+) create mode 100644 tests/preprocessing/__init__.py create mode 100644 tests/preprocessing/test_cleaners.py diff --git a/tests/preprocessing/__init__.py b/tests/preprocessing/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/preprocessing/test_cleaners.py b/tests/preprocessing/test_cleaners.py new file mode 100644 index 000000000..b879d9ee6 --- /dev/null +++ b/tests/preprocessing/test_cleaners.py @@ -0,0 +1,104 @@ +from medcat.preprocessing.cleaners import prepare_name +from medcat.config import Config +from medcat.cdb_maker import CDBMaker + +import logging, os + +import unittest + + +class BaseCDBMakerTests(unittest.TestCase): + + @classmethod + def setUpClass(cls) -> None: + config = Config() + config.general['log_level'] = logging.DEBUG + config.general["spacy_model"] = "en_core_web_md" + cls.maker = CDBMaker(config) + csvs = [ + os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'examples', 'cdb.csv'), + os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'examples', 'cdb_2.csv') + ] + cls.cdb = cls.maker.prepare_csvs(csvs, full_build=True) + + +class BasePrepareNameTest(BaseCDBMakerTests): + raw_name = 'raw' + + @classmethod + def setUpClass(cls) -> None: + super().setUpClass() + cls.do_prepare_name() + + # method called after setup, when raw_name has been specified + @classmethod + def do_prepare_name(cls) -> None: + cls.name = cls.cdb.config.general.separator.join(cls.raw_name.split()) + cls.names = prepare_name(cls.raw_name, cls.maker.pipe.spacy_nlp, {}, cls.cdb.config) + + def _dict_has_key_val_type(self, d: dict, key, val_type): + self.assertIn(key, d) + self.assertIsInstance(d[key], val_type) + + def _names_has_key_val_type(self, key, val_type): + self._dict_has_key_val_type(self.names, key, val_type) + + def test_result_has_name(self): + self._names_has_key_val_type(self.name, dict) + + def test_name_info_has_tokens(self): + self._dict_has_key_val_type(self.names[self.name], 'tokens', list) + + def test_name_info_has_words_as_tokens(self): + name_info = self.names[self.name] + tokens = name_info['tokens'] + for word in self.raw_name.split(): + with self.subTest(word): + self.assertIn(word, tokens) + + +class NamePreparationTests_OneLetter(BasePrepareNameTest): + + @classmethod + def setUpClass(cls) -> None: + super().setUpClass() + cls.raw_name = "a" + # the minimum name length is defined by the following config option + # if I don't set this to 1 here, I would see the tests fail + # that would be because the result from `prepare_names` would be empty + cls.cdb.config.cdb_maker.min_letters_required = 1 + super().do_prepare_name() + + +class NamePreparationTests_TwoLetters(BasePrepareNameTest): + + @classmethod + def setUpClass(cls) -> None: + super().setUpClass() + cls.raw_name = "an" + super().do_prepare_name() + + +class NamePreparationTests_MultiToken(BasePrepareNameTest): + + @classmethod + def setUpClass(cls) -> None: + super().setUpClass() + cls.raw_name = "this raw name" + super().do_prepare_name() + + +class NamePreparationTests_Empty(BaseCDBMakerTests): + """In case of an empty name, I would expect the names dict + returned by `prepare_name` to be empty. + """ + empty_raw_name = '' + + @classmethod + def setUpClass(cls) -> None: + super().setUpClass() + cls.names = prepare_name(cls.empty_raw_name, cls.maker.pipe.spacy_nlp, {}, cls.cdb.config) + + def test_names_dict_is_empty(self): + self.assertEqual(len(self.names), 0) + self.assertEqual(self.names, {}) From 279dd9068a8abb6f908aa49f0bb6d9d3435579e4 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 1 Nov 2023 16:37:52 +0000 Subject: [PATCH 3/9] CU-8692zguyq: Add warning if no preferred name was added along a new CUI --- medcat/cdb.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/medcat/cdb.py b/medcat/cdb.py index 5a648f4af..73199f0df 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -318,6 +318,20 @@ def add_concept(self, if name_status == 'P' and cui not in self.cui2preferred_name: # Do not overwrite old preferred names self.cui2preferred_name[cui] = name_info['raw_name'] + elif names: + # if no name_info and names is NOT an empty dict + # this shouldn't really happen in the current setup + raise ValueError("Unknown state where there is no name_info, " + "yet the `names` dict is not empty (%s)", names) + elif name_status == 'P' and cui not in self.cui2preferred_name: + # this means names is an empty `names` dict + logger.warning("Did not manage to add a preferred name in `add_cui`. " + "This means that the `names` dict passed was empty. " + "This is _usually_ caused by either no name or too short " + "a name passed to the `prepare_name` method. " + "The minimum length is defined in: " + "'config.cdb_maker.min_letters_required' and " + "is currently set at %s", self.config.cdb_maker['min_letters_required']) # Add other fields if full_build if full_build: From 647075a93fabd47b960ec33e80a392a9280acd6f Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 1 Nov 2023 16:54:21 +0000 Subject: [PATCH 4/9] CU-8692zguyq: Add additional warning messages when adding/training a new CUI with no preferred name --- medcat/cat.py | 4 ++++ medcat/cdb.py | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/medcat/cat.py b/medcat/cat.py index a86584c3a..49d230404 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -840,6 +840,10 @@ def add_and_train_concept(self, Refer to medcat.cat.cdb.CDB.add_concept """ names = prepare_name(name, self.pipe.spacy_nlp, {}, self.config) + if not names and cui not in self.cdb.cui2preferred_name: + logger.warning("No names were able to be prepared in CAT.add_and_train_concept " + "method. As such no preferred name will be able to be specifeid. " + "The CUI: '%s' and raw name: '%s'", cui, name) # Only if not negative, otherwise do not add the new name if in fact it should not be detected if do_add_concept and not negative: self.cdb.add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, type_ids=type_ids, description=description, diff --git a/medcat/cdb.py b/medcat/cdb.py index 73199f0df..3f3158fa0 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -326,12 +326,13 @@ def add_concept(self, elif name_status == 'P' and cui not in self.cui2preferred_name: # this means names is an empty `names` dict logger.warning("Did not manage to add a preferred name in `add_cui`. " + "Was trying to do so for cui: '%s'" "This means that the `names` dict passed was empty. " "This is _usually_ caused by either no name or too short " "a name passed to the `prepare_name` method. " "The minimum length is defined in: " "'config.cdb_maker.min_letters_required' and " - "is currently set at %s", self.config.cdb_maker['min_letters_required']) + "is currently set at %s", cui, self.config.cdb_maker['min_letters_required']) # Add other fields if full_build if full_build: From 305a0342e0db4f63b147063d30e8cfa3e5835247 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 1 Nov 2023 17:51:40 +0000 Subject: [PATCH 5/9] CU-8692zguyq: Make no preferred name warnings only run if name status is preferred --- medcat/cat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/medcat/cat.py b/medcat/cat.py index 49d230404..eebb7e639 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -840,7 +840,7 @@ def add_and_train_concept(self, Refer to medcat.cat.cdb.CDB.add_concept """ names = prepare_name(name, self.pipe.spacy_nlp, {}, self.config) - if not names and cui not in self.cdb.cui2preferred_name: + if not names and cui not in self.cdb.cui2preferred_name and name_status == 'P': logger.warning("No names were able to be prepared in CAT.add_and_train_concept " "method. As such no preferred name will be able to be specifeid. " "The CUI: '%s' and raw name: '%s'", cui, name) From eedca6936223db7a15beefaa7a469ad1f37585f9 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 1 Nov 2023 17:52:14 +0000 Subject: [PATCH 6/9] CU-8692zguyq: Add tests for no-preferred name warnings --- tests/test_cat.py | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tests/test_cat.py b/tests/test_cat.py index 62db4d44d..20f674a67 100644 --- a/tests/test_cat.py +++ b/tests/test_cat.py @@ -2,6 +2,7 @@ import os import sys import unittest +import unittest.mock import tempfile import shutil from transformers import AutoTokenizer @@ -388,6 +389,46 @@ def test_hashing(self): cat = CAT.load_model_pack(os.path.join(save_dir_path.name, f"{full_model_pack_name}.zip")) self.assertEqual(cat.get_hash(), cat.config.version.id) + def assert_patch_called(self, method, name: str, name_status: str, nr_of_calls: int): + cui = 'CUI-%d'%(hash(name) + id(name)) + self.undertest.add_and_train_concept(cui, name, name_status=name_status) + if nr_of_calls == 0: + method.assert_not_called() + elif nr_of_calls == 1: + method.assert_called_once() + else: + raise ValueError("Undefined") + + @unittest.mock.patch('medcat.cat.logger') + def test_add_and_train_concept_cat_nowarn_long_name(self, cat_logger): + long_name = 'a very long name' + self.assert_patch_called(cat_logger.warning, name=long_name, name_status='', nr_of_calls=0) + + @unittest.mock.patch('medcat.cdb.logger') + def test_add_and_train_concept_cdb_nowarn_long_name(self, cdb_logger): + long_name = 'a very long name' + self.assert_patch_called(cdb_logger.warning, name=long_name, name_status='', nr_of_calls=0) + + @unittest.mock.patch('medcat.cat.logger') + def test_add_and_train_concept_cat_nowarn_short_name_not_pref(self, cat_logger): + short_name = 'a' + self.assert_patch_called(cat_logger.warning, name=short_name, name_status='', nr_of_calls=0) + + @unittest.mock.patch('medcat.cdb.logger') + def test_add_and_train_concept_cdb_nowarn_short_name_not_pref(self, cdb_logger): + short_name = 'a' + self.assert_patch_called(cdb_logger.warning, name=short_name, name_status='', nr_of_calls=0) + + @unittest.mock.patch('medcat.cat.logger') + def test_add_and_train_concept_cat_warns_short_name(self, cat_logger): + short_name = 'a' + self.assert_patch_called(cat_logger.warning, name=short_name, name_status='P', nr_of_calls=1) + + @unittest.mock.patch('medcat.cdb.logger') + def test_add_and_train_concept_cdb_warns_short_name(self, cdb_logger): + short_name = 'a' + self.assert_patch_called(cdb_logger.warning, name=short_name, name_status='P', nr_of_calls=1) + class ModelWithTwoConfigsLoadTests(unittest.TestCase): From e7de5e26784eede36dac4b78486029ac1efc7364 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 1 Nov 2023 19:36:07 +0000 Subject: [PATCH 7/9] CU-8692zguyq: Add Vocab.make_unigram_table to CAT tests --- tests/test_cat.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_cat.py b/tests/test_cat.py index 20f674a67..e8514c11d 100644 --- a/tests/test_cat.py +++ b/tests/test_cat.py @@ -21,6 +21,7 @@ class CATTests(unittest.TestCase): def setUpClass(cls) -> None: cls.cdb = CDB.load(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "cdb.dat")) cls.vocab = Vocab.load(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "vocab.dat")) + cls.vocab.make_unigram_table() cls.cdb.config.general.spacy_model = "en_core_web_md" cls.cdb.config.ner.min_name_len = 2 cls.cdb.config.ner.upper_case_limit_len = 3 From c4a43d25ce2f9b62a7af92625abab3825e416bc1 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 28 Nov 2023 11:05:13 +0000 Subject: [PATCH 8/9] CU-8692zguyq: Move to built in asserting for logging instead of patching the method --- tests/test_cat.py | 52 ++++++++++++++++++++--------------------------- 1 file changed, 22 insertions(+), 30 deletions(-) diff --git a/tests/test_cat.py b/tests/test_cat.py index e8514c11d..c368fac31 100644 --- a/tests/test_cat.py +++ b/tests/test_cat.py @@ -2,13 +2,13 @@ import os import sys import unittest -import unittest.mock import tempfile import shutil +import logging from transformers import AutoTokenizer from medcat.vocab import Vocab -from medcat.cdb import CDB -from medcat.cat import CAT +from medcat.cdb import CDB, logger as cdb_logger +from medcat.cat import CAT, logger as cat_logger from medcat.utils.checkpoint import Checkpoint from medcat.meta_cat import MetaCAT from medcat.config_meta_cat import ConfigMetaCAT @@ -390,45 +390,37 @@ def test_hashing(self): cat = CAT.load_model_pack(os.path.join(save_dir_path.name, f"{full_model_pack_name}.zip")) self.assertEqual(cat.get_hash(), cat.config.version.id) - def assert_patch_called(self, method, name: str, name_status: str, nr_of_calls: int): + def assertLogsDuringAddAndTrainConcept(self, logger: logging.Logger, log_level, + name: str, name_status: str, nr_of_calls: int): cui = 'CUI-%d'%(hash(name) + id(name)) - self.undertest.add_and_train_concept(cui, name, name_status=name_status) - if nr_of_calls == 0: - method.assert_not_called() - elif nr_of_calls == 1: - method.assert_called_once() - else: - raise ValueError("Undefined") - - @unittest.mock.patch('medcat.cat.logger') - def test_add_and_train_concept_cat_nowarn_long_name(self, cat_logger): + with (self.assertLogs(logger=logger, level=log_level) + if nr_of_calls == 1 + else self.assertNoLogs(logger=logger, level=log_level)): + self.undertest.add_and_train_concept(cui, name, name_status=name_status) + + def test_add_and_train_concept_cat_nowarn_long_name(self): long_name = 'a very long name' - self.assert_patch_called(cat_logger.warning, name=long_name, name_status='', nr_of_calls=0) + self.assertLogsDuringAddAndTrainConcept(cat_logger, logging.WARNING, name=long_name, name_status='', nr_of_calls=0) - @unittest.mock.patch('medcat.cdb.logger') - def test_add_and_train_concept_cdb_nowarn_long_name(self, cdb_logger): + def test_add_and_train_concept_cdb_nowarn_long_name(self): long_name = 'a very long name' - self.assert_patch_called(cdb_logger.warning, name=long_name, name_status='', nr_of_calls=0) + self.assertLogsDuringAddAndTrainConcept(cdb_logger, logging.WARNING, name=long_name, name_status='', nr_of_calls=0) - @unittest.mock.patch('medcat.cat.logger') - def test_add_and_train_concept_cat_nowarn_short_name_not_pref(self, cat_logger): + def test_add_and_train_concept_cat_nowarn_short_name_not_pref(self): short_name = 'a' - self.assert_patch_called(cat_logger.warning, name=short_name, name_status='', nr_of_calls=0) + self.assertLogsDuringAddAndTrainConcept(cat_logger, logging.WARNING, name=short_name, name_status='', nr_of_calls=0) - @unittest.mock.patch('medcat.cdb.logger') - def test_add_and_train_concept_cdb_nowarn_short_name_not_pref(self, cdb_logger): + def test_add_and_train_concept_cdb_nowarn_short_name_not_pref(self): short_name = 'a' - self.assert_patch_called(cdb_logger.warning, name=short_name, name_status='', nr_of_calls=0) + self.assertLogsDuringAddAndTrainConcept(cdb_logger, logging.WARNING, name=short_name, name_status='', nr_of_calls=0) - @unittest.mock.patch('medcat.cat.logger') - def test_add_and_train_concept_cat_warns_short_name(self, cat_logger): + def test_add_and_train_concept_cat_warns_short_name(self): short_name = 'a' - self.assert_patch_called(cat_logger.warning, name=short_name, name_status='P', nr_of_calls=1) + self.assertLogsDuringAddAndTrainConcept(cat_logger, logging.WARNING, name=short_name, name_status='P', nr_of_calls=1) - @unittest.mock.patch('medcat.cdb.logger') - def test_add_and_train_concept_cdb_warns_short_name(self, cdb_logger): + def test_add_and_train_concept_cdb_warns_short_name(self): short_name = 'a' - self.assert_patch_called(cdb_logger.warning, name=short_name, name_status='P', nr_of_calls=1) + self.assertLogsDuringAddAndTrainConcept(cdb_logger, logging.WARNING, name=short_name, name_status='P', nr_of_calls=1) class ModelWithTwoConfigsLoadTests(unittest.TestCase): From 6123a42b4c1081cf2f03cebdda4ca44981e5caaf Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 28 Nov 2023 11:58:13 +0000 Subject: [PATCH 9/9] CU-8692zguyq: Add workaround for assertNoLogs on python 3.8 and 3.9 --- tests/test_cat.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/tests/test_cat.py b/tests/test_cat.py index c368fac31..86f1c89e3 100644 --- a/tests/test_cat.py +++ b/tests/test_cat.py @@ -5,6 +5,7 @@ import tempfile import shutil import logging +import contextlib from transformers import AutoTokenizer from medcat.vocab import Vocab from medcat.cdb import CDB, logger as cdb_logger @@ -390,12 +391,28 @@ def test_hashing(self): cat = CAT.load_model_pack(os.path.join(save_dir_path.name, f"{full_model_pack_name}.zip")) self.assertEqual(cat.get_hash(), cat.config.version.id) + def _assertNoLogs(self, logger: logging.Logger, level: int): + if hasattr(self, 'assertNoLogs'): + return self.assertNoLogs(logger=logger, level=level) + else: + return self.__assertNoLogs(logger=logger, level=level) + + @contextlib.contextmanager + def __assertNoLogs(self, logger: logging.Logger, level: int): + try: + with self.assertLogs(logger, level) as captured_logs: + yield + except AssertionError: + return + if captured_logs: + raise AssertionError("Logs were found: {}".format(captured_logs)) + def assertLogsDuringAddAndTrainConcept(self, logger: logging.Logger, log_level, name: str, name_status: str, nr_of_calls: int): cui = 'CUI-%d'%(hash(name) + id(name)) with (self.assertLogs(logger=logger, level=log_level) if nr_of_calls == 1 - else self.assertNoLogs(logger=logger, level=log_level)): + else self._assertNoLogs(logger=logger, level=log_level)): self.undertest.add_and_train_concept(cui, name, name_status=name_status) def test_add_and_train_concept_cat_nowarn_long_name(self):