From b73f91b55ce5cf846b30a5303d73d0e89d739f56 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 20 Dec 2023 14:32:03 +0000 Subject: [PATCH 1/3] CU-8693bpq82: Add fallback spacy model along with test --- medcat/cdb.py | 2 +- medcat/pipe.py | 24 +++++++++++++++++++++++- tests/test_cat.py | 29 +++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 2 deletions(-) diff --git a/medcat/cdb.py b/medcat/cdb.py index 2ca8382a7..76cb7327e 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -526,7 +526,7 @@ def load_config(self, config_path: str) -> None: if not os.path.exists(config_path): if not self._config_from_file: # if there's no config defined anywhere - raise ValueError("Could not find a config in the CDB nor ", + raise ValueError("Could not find a config in the CDB nor " "in the config.json for this model " f"({os.path.dirname(config_path)})", ) diff --git a/medcat/pipe.py b/medcat/pipe.py index 3861267df..4b4aab945 100644 --- a/medcat/pipe.py +++ b/medcat/pipe.py @@ -22,6 +22,9 @@ logger = logging.getLogger(__name__) # different logger from the package-level one +DEFAULT_SPACY_MODEL = 'en_core_web_md' + + class Pipe(object): """A wrapper around the standard spacy pipeline. @@ -38,7 +41,23 @@ class Pipe(object): """ def __init__(self, tokenizer: Tokenizer, config: Config) -> None: - self._nlp = spacy.load(config.general.spacy_model, disable=config.general.spacy_disabled_components) + try: + self._nlp = self._init_nlp(config) + except Exception as e: + if config.general.spacy_model == DEFAULT_SPACY_MODEL: + raise e + logger.warning("Could not load spacy model from '%s'. " + "Falling back to installed en_core_web_md. " + "For best compatibility, we'd recommend " + "packaging and using your model pack with " + "the spacy model it was designed for", + config.general.spacy_model) + # we're changing the config value so that this propages + # to other places that try to load the model. 
E.g: + # medcat.utils.normalizers.TokenNormalizer.__init__ + config.general.spacy_model = DEFAULT_SPACY_MODEL + self._nlp = self._init_nlp(config) + print("TYPE", type(self._nlp)) if config.preprocessing.stopwords is not None: self._nlp.Defaults.stop_words = set(config.preprocessing.stopwords) self._nlp.tokenizer = tokenizer(self._nlp, config) @@ -48,6 +67,9 @@ def __init__(self, tokenizer: Tokenizer, config: Config) -> None: # Set log level logger.setLevel(self.config.general.log_level) + def _init_nlp(self, config: Config) -> Language: + return spacy.load(config.general.spacy_model, disable=config.general.spacy_disabled_components) + def add_tagger(self, tagger: Callable, name: Optional[str] = None, additional_fields: List[str] = []) -> None: """Add any kind of a tagger for tokens. diff --git a/tests/test_cat.py b/tests/test_cat.py index 368b1e885..7012e24c7 100644 --- a/tests/test_cat.py +++ b/tests/test_cat.py @@ -10,6 +10,7 @@ from medcat.vocab import Vocab from medcat.cdb import CDB, logger as cdb_logger from medcat.cat import CAT, logger as cat_logger +from medcat.pipe import logger as pipe_logger from medcat.utils.checkpoint import Checkpoint from medcat.meta_cat import MetaCAT from medcat.config_meta_cat import ConfigMetaCAT @@ -499,6 +500,34 @@ def test_loading_model_pack_with_cdb_config_and_config_json_raises_exception(sel CAT.load_model_pack(self.model_path) +class ModelLoadsUnreadableSpacy(unittest.TestCase): + + @classmethod + def setUpClass(cls) -> None: + cls.temp_dir = tempfile.TemporaryDirectory() + model_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples") + cdb = CDB.load(os.path.join(model_path, 'cdb.dat')) + cdb.config.general.spacy_model = os.path.join(cls.temp_dir.name, "en_core_web_md") + # save CDB in new location + cdb.save(os.path.join(cls.temp_dir.name, 'cdb.dat')) + # save config in new location + cdb.config.save(os.path.join(cls.temp_dir.name, 'config.json')) + # copy vocab into new location + vocab_path
= os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "vocab.dat") + cls.vocab_path = os.path.join(cls.temp_dir.name, 'vocab.dat') + shutil.copyfile(vocab_path, cls.vocab_path) + + @classmethod + def tearDownClass(cls) -> None: + # REMOVE temp dir + cls.temp_dir.cleanup() + + def test_loads_without_specified_spacy_model(self): + with self.assertLogs(logger=pipe_logger, level=logging.WARNING): + cat = CAT.load_model_pack(self.temp_dir.name) + self.assertTrue(isinstance(cat, CAT)) + + class ModelWithZeroConfigsLoadTests(unittest.TestCase): @classmethod From 99dc1ccbcdea47555b78611a049b8665ee3e2bd9 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 20 Dec 2023 14:39:03 +0000 Subject: [PATCH 2/3] CU-8693bpq82: Remove debug output --- medcat/pipe.py | 1 - 1 file changed, 1 deletion(-) diff --git a/medcat/pipe.py b/medcat/pipe.py index 4b4aab945..3985fe045 100644 --- a/medcat/pipe.py +++ b/medcat/pipe.py @@ -57,7 +57,6 @@ def __init__(self, tokenizer: Tokenizer, config: Config) -> None: # medcat.utils.normalizers.TokenNormalizer.__init__ config.general.spacy_model = DEFAULT_SPACY_MODEL self._nlp = self._init_nlp(config) - print("TYPE", type(self._nlp)) if config.preprocessing.stopwords is not None: self._nlp.Defaults.stop_words = set(config.preprocessing.stopwords) self._nlp.tokenizer = tokenizer(self._nlp, config) From 3ed244a13adde213b48a92e84d2156012c534e23 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 20 Dec 2023 15:22:52 +0000 Subject: [PATCH 3/3] CU-8693bpq82: Add exception info to warning upon spacy model load failure and fallback --- medcat/pipe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/medcat/pipe.py b/medcat/pipe.py index 3985fe045..1ad9e6766 100644 --- a/medcat/pipe.py +++ b/medcat/pipe.py @@ -51,7 +51,7 @@ def __init__(self, tokenizer: Tokenizer, config: Config) -> None: "For best compatibility, we'd recommend " "packaging and using your model pack with " "the spacy model it was designed for", - 
config.general.spacy_model) + config.general.spacy_model, exc_info=e) # we're changing the config value so that this propages # to other places that try to load the model. E.g: # medcat.utils.normalizers.TokenNormalizer.__init__