Merge branch 'master' into CU-8695hghww-backwards-compatibility-workflow

CogStack · Aug 30, 2024 · 1a94d2d · 1a94d2d
2 parents 731693b + b8bb4e3
commit 1a94d2d
Show file tree

Hide file tree

Showing 38 changed files with 90 additions and 96 deletions.
diff --git a/medcat/cat.py b/medcat/cat.py
@@ -54,7 +54,7 @@
 
 class CAT(object):
     """The main MedCAT class used to annotate documents, it is built on top of spaCy
-    and works as a spaCy pipline. Creates an instance of a spaCy pipline that can
+    and works as a spaCy pipeline. Creates an instance of a spaCy pipeline that can
     be used as a spacy nlp model.
 
     Args:
@@ -264,7 +264,7 @@ def create_model_pack(self, save_dir_path: str, model_pack_name: str = DEFAULT_M
         if cdb_format.lower() == 'json':
             json_path = save_dir_path # in the same folder!
         else:
-            json_path = None # use dill formating
+            json_path = None # use dill formatting
         logger.info('Saving model pack with CDB in %s format', cdb_format)
 
         # expand user path to make this work with '~'
@@ -345,7 +345,7 @@ def attempt_unpack(cls, zip_path: str) -> str:
 
         model_pack_path = os.path.join(base_dir, foldername)
         if os.path.exists(model_pack_path):
-            logger.info("Found an existing unziped model pack at: {}, the provided zip will not be touched.".format(model_pack_path))
+            logger.info("Found an existing unzipped model pack at: {}, the provided zip will not be touched.".format(model_pack_path))
         else:
             logger.info("Unziping the model pack and loading models.")
             shutil.unpack_archive(zip_path, extract_dir=model_pack_path)
@@ -554,7 +554,7 @@ def _print_stats(self,
                 Each project in MedCATtrainer can have filters, do we want to respect those filters
                 when calculating metrics.
             use_overlaps (bool):
-                Allow overlapping entities, nearly always False as it is very difficult to annotate overlapping entites.
+                Allow overlapping entities, nearly always False as it is very difficult to annotate overlapping entities.
             use_cui_doc_limit (bool):
                 If True the metrics for a CUI will be only calculated if that CUI appears in a document, in other words
                 if the document was annotated for that CUI. Useful in very specific situations when during the annotation
@@ -670,7 +670,7 @@ def add_cui_to_group(self, cui: str, group_name: str) -> None:
             cui (str):
                 The concept to be added.
             group_name (str):
-                The group to whcih the concept will be added.
+                The group to which the concept will be added.
 
         Examples:
 
@@ -1222,7 +1222,7 @@ def _run_nn_components(self, docs: Dict, nn_components: List, id2text: Dict) ->
         for name, component in nn_components:
             component.config.general['disable_component_lock'] = True
 
-        # For meta_cat compoments 
+        # For meta_cat components
         for name, component in [c for c in nn_components if isinstance(c[1], MetaCAT)]:
             spacy_docs = component.pipe(spacy_docs)
         for spacy_doc in spacy_docs:
@@ -1370,7 +1370,7 @@ def multiprocessing_batch_char_size(self,
 
         docs = {}
         _start_time = time.time()
-        _batch_counter = 0 # Used for splitting the output, counts batches inbetween saves
+        _batch_counter = 0 # Used for splitting the output, counts batches between saves
         for batch in self._batch_generator(iterator, batch_size_chars, skip_ids=set(annotated_ids)):
             logger.info("Annotated until now: %s docs; Current BS: %s docs; Elapsed time: %.2f minutes",
                           len(annotated_ids),

diff --git a/medcat/cdb.py b/medcat/cdb.py
@@ -13,7 +13,6 @@
 from medcat.utils.matutils import unitvec
 from medcat.utils.ml_utils import get_lr_linking
 from medcat.config import Config, workers
-from medcat.utils.decorators import deprecated
 from medcat.utils.saving.serializer import CDBSerializer
 from medcat.utils.config_utils import get_and_del_weighted_average_from_config
 from medcat.utils.config_utils import default_weighted_average
@@ -29,7 +28,7 @@ class CDB(object):
 
     Properties:
         name2cuis (Dict[str, List[str]]):
-            Map fro concept name to CUIs - one name can map to multiple CUIs.
+            Map from concept name to CUIs - one name can map to multiple CUIs.
         name2cuis2status (Dict[str, Dict[str, str]]):
             What is the status for a given name and cui pair - each name can be:
                 P - Preferred, A - Automatic (e.g. let medcat decide), N - Not common.
@@ -58,7 +57,7 @@ class CDB(object):
             Any additional maps that are not part of the core CDB. These are usually not needed
             for the base NER+L use-case, but can be useful for Debugging or some special stuff.
         vocab (Dict[str, int]):
-            Stores all the words tha appear in this CDB and the count for each one.
+            Stores all the words that appear in this CDB and the count for each one.
         is_dirty (bool):
             Whether or not the CDB has been changed since it was loaded or created
     """
@@ -129,7 +128,7 @@ def get_name(self, cui: str) -> str:
 
         Args:
             cui (str):
-                Concept ID or unique identifer in this database.
+                Concept ID or unique identifier in this database.
 
         Returns:
             str: The name of the concept.
@@ -148,11 +147,6 @@ def update_cui2average_confidence(self, cui: str, new_sim: float) -> None:
                                             (self.cui2count_train.get(cui, 0) + 1)
         self.is_dirty = True
 
-    @deprecated("Deprecated. For internal use only. Use CAT.unlink_concept_name instead",
-                depr_version=(1, 12, 0), removal_version=(1, 13, 0))
-    def remove_names(self, cui: str, names: Iterable[str]) -> None:
-        self._remove_names(cui, names)
-
     def _remove_names(self, cui: str, names: Iterable[str]) -> None:
         """Remove names from an existing concept - effect is this name will never again be used to link to this concept.
         This will only remove the name from the linker (namely name2cuis and name2cuis2status), the name will still be present everywhere else.
@@ -161,7 +155,7 @@ def _remove_names(self, cui: str, names: Iterable[str]) -> None:
 
         Args:
             cui (str):
-                Concept ID or unique identifer in this database.
+                Concept ID or unique identifier in this database.
             names (Iterable[str]):
                 Names to be removed (e.g list, set, or even a dict (in which case keys will be used)).
         """
@@ -194,7 +188,7 @@ def remove_cui(self, cui: str) -> None:
 
         Args:
             cui (str):
-                Concept ID or unique identifer in this database.
+                Concept ID or unique identifier in this database.
         """
         if cui in self.cui2names:
             del self.cui2names[cui]
@@ -233,7 +227,7 @@ def add_names(self, cui: str, names: Dict[str, Dict], name_status: str = 'A', fu
 
         Args:
             cui (str):
-                Concept ID or unique identifer in this database, all concepts that have
+                Concept ID or unique identifier in this database, all concepts that have
                 the same CUI will be merged internally.
             names (Dict[str, Dict]):
                 Names for this concept, or the value that if found in free text can be linked to this concept.
@@ -318,7 +312,7 @@ def _add_concept(self,
             self.name_isupper[name] = names[name]['is_upper']
 
             if name in self.name2cuis:
-                # Means we have alrady seen this name
+                # Means we have already seen this name
                 if cui not in self.name2cuis[name]:
                     # If CUI is not already linked do it
                     self.name2cuis[name].append(cui)
@@ -421,7 +415,7 @@ def update_context_vector(self,
             cui (str):
                 The concept in question.
             vectors (Dict[str, np.ndarray]):
-                Vector represenation of the context, must have the format: {'context_type': np.array(<vector>), ...}
+                Vector representation of the context, must have the format: {'context_type': np.array(<vector>), ...}
                 context_type - is usually one of: ['long', 'medium', 'short']
             negative (bool):
                 Is this negative context or positive (Default Value `False`).
@@ -601,7 +595,7 @@ def import_training(self, cdb: "CDB", overwrite: bool = True) -> None:
 
         Examples:
 
-            >>> new_cdb.import_traininig(cdb=old_cdb, owerwrite=True)
+            >>> new_cdb.import_training(cdb=old_cdb, overwrite=True)
         """
         # Import vectors and counts
         for cui in cdb.cui2context_vectors:

diff --git a/medcat/cdb_maker.py b/medcat/cdb_maker.py
@@ -20,13 +20,13 @@
 
 class CDBMaker(object):
     """Given a CSV as shown in https://github.com/CogStack/MedCAT/tree/master/examples/<example> it creates a CDB or
-    updates an exisitng one.
+    updates an existing one.
 
     Args:
         config (medcat.config.Config):
             Global config for MedCAT.
         cdb (medcat.cdb.CDB):
-            If set the `CDBMaker` will updat the existing `CDB` with
+            If set the `CDBMaker` will update the existing `CDB` with
             new concepts in the CSV (Default value `None`).
     """
 

diff --git a/medcat/config.py b/medcat/config.py
@@ -132,7 +132,7 @@ def merge_config(self, config_dict: Dict) -> None:
                 try:
                     setattr(self, key, value)
                 except AttributeError as err:
-                    logger.warning('Issue with setting attribtue "%s":', key, exc_info=err)
+                    logger.warning('Issue with setting attribute "%s":', key, exc_info=err)
         self.rebuild_re()
 
     def parse_config_file(self, path: str, extractor: ValueExtractor = _DEFAULT_EXTRACTOR) -> None:
@@ -281,7 +281,7 @@ class CDBMaker(MixingConfig, BaseModel):
     name_versions: list = ['LOWER', 'CLEAN']
     """Name versions to be generated."""
     multi_separator: str = '|'
-    """If multiple names or type_ids for a concept present in one row of a CSV, they are separted
+    """If multiple names or type_ids for a concept present in one row of a CSV, they are separated
     by the character below."""
     remove_parenthesis: int = 5
     """Should preferred names with parenthesis be cleaned 0 means no, else it means if longer than or equal
@@ -387,7 +387,7 @@ class General(MixingConfig, BaseModel):
     should not be used when annotating millions of documents. If `None` it will be the string "concept", if `short` it will be CUI,
     if `long` it will be CUI | Name | Confidence"""
     map_cui_to_group: bool = False
-    """If the cdb.addl_info['cui2group'] is provided and this option enabled, each CUI will be maped to the group"""
+    """If the cdb.addl_info['cui2group'] is provided and this option enabled, each CUI will be mapped to the group"""
     simple_hash: bool = False
     """Whether to use a simple hash.
 
@@ -402,7 +402,7 @@ class Config:
 class Preprocessing(MixingConfig, BaseModel):
     """The preprocessing part of the config"""
     words_to_skip: set = {'nos'}
-    """This words will be completly ignored from concepts and from the text (must be a Set)"""
+    """These words will be completely ignored from concepts and from the text (must be a Set)"""
     keep_punct: set = {'.', ':'}
     """All punct will be skipped by default, here you can set what will be kept"""
     do_not_normalize: set = {'VBD', 'VBG', 'VBN', 'VBP', 'JJS', 'JJR'}
@@ -411,7 +411,7 @@ class Preprocessing(MixingConfig, BaseModel):
     - https://spacy.io/usage/linguistic-features#pos-tagging
     - Label scheme section per model at https://spacy.io/models/en"""
     skip_stopwords: bool = False
-    """Should stopwords be skipped/ingored when processing input"""
+    """Should stopwords be skipped/ignored when processing input"""
     min_len_normalize: int = 5
     """Nothing below this length will ever be normalized (input tokens or concept names), normalized means lemmatized in this case"""
     stopwords: Optional[set] = None
@@ -433,7 +433,7 @@ class Ner(MixingConfig, BaseModel):
     min_name_len: int = 3
     """Do not detect names below this limit, skip them"""
     max_skip_tokens: int = 2
-    """When checkng tokens for concepts you can have skipped tokens inbetween
+    """When checking tokens for concepts you can have skipped tokens between
     used ones (usually spaces, new lines etc). This number tells you how many skipped can you have."""
     check_upper_case_names: bool = False
     """Check uppercase to distinguish uppercase and lowercase words that have a different meaning."""
@@ -467,13 +467,13 @@ def __eq__(self, other):
 class LinkingFilters(MixingConfig, BaseModel):
     """These describe the linking filters used alongside the model.
 
-    When no CUIs nor exlcuded CUIs are specified (the sets are empty),
+    When no CUIs nor excluded CUIs are specified (the sets are empty),
     all CUIs are accepted.
     If there are CUIs specified then only those will be accepted.
     If there are excluded CUIs specified, they are excluded.
 
     In some cases, there are extra filters as well as MedCATtrainer (MCT) export filters.
-    These are expcted to follow the following:
+    These are expected to follow the following:
     extra_cui_filter ⊆ MCT filter ⊆ Model/config filter
 
     While any other CUIs can be included in the extra CUI filter or the MCT filter,
@@ -555,10 +555,10 @@ class Linking(MixingConfig, BaseModel):
     """Concepts that have seen less training examples than this will not be used for
     similarity calculation and will have a similarity of -1."""
     always_calculate_similarity: bool = False
-    """Do we want to calculate context similarity even for concepts that are not ambigous."""
+    """Do we want to calculate context similarity even for concepts that are not ambiguous."""
     calculate_dynamic_threshold: bool = False
     """Concepts below this similarity will be ignored. Type can be static/dynamic - if dynamic each CUI has a different TH
-    and it is calcualted as the average confidence for that CUI * similarity_threshold. Take care that dynamic works only
+    and it is calculated as the average confidence for that CUI * similarity_threshold. Take care that dynamic works only
     if the cdb was trained with calculate_dynamic_threshold = True."""
     similarity_threshold_type: str = 'static'
     similarity_threshold: float = 0.25
@@ -569,14 +569,14 @@ class Linking(MixingConfig, BaseModel):
     prefer_primary_name: float = 0.35
     """If >0 concepts for which a detection is its primary name will be preferred by that amount (0 to 1)"""
     prefer_frequent_concepts: float = 0.35
-    """If >0 concepts that are more frequent will be prefered by a multiply of this amount"""
+    """If >0 concepts that are more frequent will be preferred by a multiply of this amount"""
     subsample_after: int = 30000
     """DISABLED in code permanently: Subsample during unsupervised training if a concept has received more than"""
     devalue_linked_concepts: bool = False
     """When adding a positive example, should it also be treated as Negative for concepts
-    which link to the postive one via names (ambigous names)."""
+    which link to the positive one via names (ambiguous names)."""
     context_ignore_center_tokens: bool = False
-    """If true when the context of a concept is calculated (embedding) the words making that concept are not taken into accout"""
+    """If true when the context of a concept is calculated (embedding) the words making that concept are not taken into account"""
 
     class Config:
         extra = Extra.allow
@@ -612,7 +612,7 @@ def rebuild_re(self) -> None:
         # Some regex that we will need
         self.word_skipper = re.compile('^({})$'.format(
             '|'.join(self.preprocessing.words_to_skip)))
-        # Very agressive punct checker, input will be lowercased
+        # Very aggressive punct checker, input will be lowercased
         self.punct_checker = re.compile(r'[^a-z0-9]+')
 
     # Override

diff --git a/medcat/config_meta_cat.py b/medcat/config_meta_cat.py
@@ -42,7 +42,7 @@ class General(MixingConfig, BaseModel):
     batch_size_eval: int = 5000
     """Number of annotations to be meta-annotated at once in eval"""
     annotate_overlapping: bool = False
-    """If set meta_anns will be calcualted for doc._.ents, otherwise for doc.ents"""
+    """If set meta_anns will be calculated for doc._.ents, otherwise for doc.ents"""
     tokenizer_name: str = 'bbpe'
     """
     Tokenizer name used with MetaCAT.

diff --git a/medcat/config_rel_cat.py b/medcat/config_rel_cat.py
@@ -86,7 +86,7 @@ class Model(MixingConfig, BaseModel):
     emb_grad: bool = True
     """If True the embeddings will also be trained"""
     ignore_cpos: bool = False
-    """If set to True center positions will be ignored when calculating represenation"""
+    """If set to True center positions will be ignored when calculating representation"""
 
     class Config:
         extra = Extra.allow

diff --git a/medcat/linking/context_based_linker.py b/medcat/linking/context_based_linker.py
@@ -70,7 +70,7 @@ def __call__(self, doc: Doc) -> Doc:
 
                     if len(name) >= cnf_l.disamb_length_limit:
                         if len(cuis) == 1:
-                            # N - means name must be disambiguated, is not the prefered
+                            # N - means name must be disambiguated, is not the preferred
                             #name of the concept, links to other concepts also.
                             if self.cdb.name2cuis2status[name][cuis[0]] != 'N':
                                 self._train(cui=cuis[0], entity=entity, doc=doc)

diff --git a/medcat/linking/vector_context_model.py b/medcat/linking/vector_context_model.py
@@ -102,7 +102,7 @@ def similarity(self, cui: str, entity: Span, doc: Doc) -> float:
             doc (Doc): The document to look in.
 
         Returns:
-            float: The simularity.
+            float: The similarity.
         """
         vectors = self.get_context_vectors(entity, doc)
         sim = self._similarity(cui, vectors)

diff --git a/medcat/meta_cat.py b/medcat/meta_cat.py
@@ -456,7 +456,7 @@ def prepare_document(self, doc: Doc, input_ids: List, offset_mapping: List, lowe
             input_ids (List):
                 Input ids
             offset_mapping (List):
-                Offset mapings
+                Offset mappings
             lowercase (bool):
                 Whether to use lower case replace center
 
@@ -475,7 +475,7 @@ def prepare_document(self, doc: Doc, input_ids: List, offset_mapping: List, lowe
 
         samples = []
         last_ind = 0
-        ent_id2ind = {}  # Map form entitiy ID to where is it in the samples array
+        ent_id2ind = {}  # Map from entity ID to where it is in the samples array
         for ent in sorted(ents, key=lambda ent: ent.start_char):
             start = ent.start_char
             end = ent.end_char

diff --git a/medcat/ner/transformers_ner.py b/medcat/ner/transformers_ner.py
@@ -184,7 +184,7 @@ def train(self,
             # NOTE: The following is for backwards compatibility
             #       in datasets==2.20.0 `trust_remote_code=True` must be explicitly
             #       specified, otherwise an error is raised.
-            #       On the other hand, the keyword argumnet was added in datasets==2.16.0
+            #       On the other hand, the keyword argument was added in datasets==2.16.0
             #       yet we support datasets>=2.2.0.
             #       So we need to use the kwarg if applicable and omit its use otherwise.
             if func_has_kwarg(datasets.load_dataset, 'trust_remote_code'):
@@ -196,7 +196,7 @@ def train(self,
                                       split='train',
                                       cache_dir='/tmp/')
             # We split before encoding so the split is document level, as encoding
-            #does the document spliting into max_seq_len
+            #does the document splitting into max_seq_len
             dataset = dataset.train_test_split(test_size=self.config.general['test_size']) # type: ignore
 
         # Update labelmap in case the current dataset has more labels than what we had before
@@ -330,7 +330,7 @@ def load(cls, save_dir_path: str, config_dict: Optional[Dict] = None) -> "Transf
         config = cast(ConfigTransformersNER, ConfigTransformersNER.load(os.path.join(save_dir_path, 'cat_config.json')))
         config.general['model_name'] = save_dir_path
 
-        # Overwrite loaded paramters with something new
+        # Overwrite loaded parameters with something new
         if config_dict is not None:
             config.merge_config(config_dict)