diff --git a/medcat/cat.py b/medcat/cat.py
index c6f884665..621a2e831 100644
--- a/medcat/cat.py
+++ b/medcat/cat.py
@@ -54,7 +54,7 @@ class CAT(object):
    """The main MedCAT class used to annotate documents, it is built on top of spaCy
-    and works as a spaCy pipline. Creates an instance of a spaCy pipline that can
+    and works as a spaCy pipeline. Creates an instance of a spaCy pipeline that can
    be used as a spacy nlp model.

    Args:
@@ -264,7 +264,7 @@ def create_model_pack(self, save_dir_path: str, model_pack_name: str = DEFAULT_M
        if cdb_format.lower() == 'json':
            json_path = save_dir_path  # in the same folder!
        else:
-            json_path = None  # use dill formating
+            json_path = None  # use dill formatting
        logger.info('Saving model pack with CDB in %s format', cdb_format)

        # expand user path to make this work with '~'
@@ -345,7 +345,7 @@ def attempt_unpack(cls, zip_path: str) -> str:
        model_pack_path = os.path.join(base_dir, foldername)

        if os.path.exists(model_pack_path):
-            logger.info("Found an existing unziped model pack at: {}, the provided zip will not be touched.".format(model_pack_path))
+            logger.info("Found an existing unzipped model pack at: {}, the provided zip will not be touched.".format(model_pack_path))
        else:
            logger.info("Unziping the model pack and loading models.")
            shutil.unpack_archive(zip_path, extract_dir=model_pack_path)
@@ -554,7 +554,7 @@ def _print_stats(self,
                Each project in MedCATtrainer can have filters, do we want to respect those filters
                when calculating metrics.
            use_overlaps (bool):
-                Allow overlapping entities, nearly always False as it is very difficult to annotate overlapping entites.
+                Allow overlapping entities, nearly always False as it is very difficult to annotate overlapping entities.
            use_cui_doc_limit (bool):
                If True the metrics for a CUI will be only calculated if that CUI appears in a document, in other words
                if the document was annotated for that CUI. Useful in very specific situations when during the annotation
@@ -670,7 +670,7 @@ def add_cui_to_group(self, cui: str, group_name: str) -> None:
            cui (str):
                The concept to be added.
            group_name (str):
-                The group to whcih the concept will be added.
+                The group to which the concept will be added.
        Examples:
@@ -1222,7 +1222,7 @@ def _run_nn_components(self, docs: Dict, nn_components: List, id2text: Dict) ->
        for name, component in nn_components:
            component.config.general['disable_component_lock'] = True

-        # For meta_cat compoments
+        # For meta_cat components
        for name, component in [c for c in nn_components if isinstance(c[1], MetaCAT)]:
            spacy_docs = component.pipe(spacy_docs)
            for spacy_doc in spacy_docs:
@@ -1370,7 +1370,7 @@ def multiprocessing_batch_char_size(self,
        docs = {}
        _start_time = time.time()
-        _batch_counter = 0  # Used for splitting the output, counts batches inbetween saves
+        _batch_counter = 0  # Used for splitting the output, counts batches between saves
        for batch in self._batch_generator(iterator, batch_size_chars, skip_ids=set(annotated_ids)):
            logger.info("Annotated until now: %s docs; Current BS: %s docs; Elapsed time: %.2f minutes",
                        len(annotated_ids),
diff --git a/medcat/cdb.py b/medcat/cdb.py
index 89fd3d8e6..3961fc921 100644
--- a/medcat/cdb.py
+++ b/medcat/cdb.py
@@ -13,7 +13,6 @@
 from medcat.utils.matutils import unitvec
 from medcat.utils.ml_utils import get_lr_linking
 from medcat.config import Config, workers
-from medcat.utils.decorators import deprecated
 from medcat.utils.saving.serializer import CDBSerializer
 from medcat.utils.config_utils import get_and_del_weighted_average_from_config
 from medcat.utils.config_utils import default_weighted_average
@@ -29,7 +28,7 @@ class CDB(object):
    Properties:
        name2cuis (Dict[str, List[str]]):
-            Map fro concept name to CUIs - one name can map to multiple CUIs.
+            Map from concept name to CUIs - one name can map to multiple CUIs.
        name2cuis2status (Dict[str, Dict[str, str]]):
            What is the status for a given name and cui pair - each name can be:
                P - Preferred, A - Automatic (e.g. let medcat decide), N - Not common.
@@ -58,7 +57,7 @@ class CDB(object):
            Any additional maps that are not part of the core CDB. These are usually not needed
            for the base NER+L use-case, but can be useufl for Debugging or some special stuff.
        vocab (Dict[str, int]):
-            Stores all the words tha appear in this CDB and the count for each one.
+            Stores all the words that appear in this CDB and the count for each one.
        is_dirty (bool):
            Whether or not the CDB has been changed since it was loaded or created
    """
@@ -129,7 +128,7 @@ def get_name(self, cui: str) -> str:
        Args:
            cui (str):
-                Concept ID or unique identifer in this database.
+                Concept ID or unique identifier in this database.

        Returns:
            str: The name of the concept.
@@ -148,11 +147,6 @@ def update_cui2average_confidence(self, cui: str, new_sim: float) -> None:
                                           (self.cui2count_train.get(cui, 0) + 1)
        self.is_dirty = True

-    @deprecated("Deprecated. For internal use only. Use CAT.unlink_concept_name instead",
-                depr_version=(1, 12, 0), removal_version=(1, 13, 0))
-    def remove_names(self, cui: str, names: Iterable[str]) -> None:
-        self._remove_names(cui, names)
-
    def _remove_names(self, cui: str, names: Iterable[str]) -> None:
        """Remove names from an existing concept - effect is this name will never again be used to link to this concept.
        This will only remove the name from the linker (namely name2cuis and name2cuis2status), the name will still be present everywhere else.
@@ -161,7 +155,7 @@ def _remove_names(self, cui: str, names: Iterable[str]) -> None:
        Args:
            cui (str):
-                Concept ID or unique identifer in this database.
+                Concept ID or unique identifier in this database.
            names (Iterable[str]):
                Names to be removed (e.g list, set, or even a dict (in which case keys will be used)).
""" @@ -194,7 +188,7 @@ def remove_cui(self, cui: str) -> None: Args: cui (str): - Concept ID or unique identifer in this database. + Concept ID or unique identifier in this database. """ if cui in self.cui2names: del self.cui2names[cui] @@ -233,7 +227,7 @@ def add_names(self, cui: str, names: Dict[str, Dict], name_status: str = 'A', fu Args: cui (str): - Concept ID or unique identifer in this database, all concepts that have + Concept ID or unique identifier in this database, all concepts that have the same CUI will be merged internally. names (Dict[str, Dict]): Names for this concept, or the value that if found in free text can be linked to this concept. @@ -318,7 +312,7 @@ def _add_concept(self, self.name_isupper[name] = names[name]['is_upper'] if name in self.name2cuis: - # Means we have alrady seen this name + # Means we have already seen this name if cui not in self.name2cuis[name]: # If CUI is not already linked do it self.name2cuis[name].append(cui) @@ -421,7 +415,7 @@ def update_context_vector(self, cui (str): The concept in question. vectors (Dict[str, np.ndarray]): - Vector represenation of the context, must have the format: {'context_type': np.array(), ...} + Vector representation of the context, must have the format: {'context_type': np.array(), ...} context_type - is usually one of: ['long', 'medium', 'short'] negative (bool): Is this negative context of positive (Default Value `False`). @@ -601,7 +595,7 @@ def import_training(self, cdb: "CDB", overwrite: bool = True) -> None: Examples: - >>> new_cdb.import_traininig(cdb=old_cdb, owerwrite=True) + >>> new_cdb.import_traininig(cdb=old_cdb, overwrite=True) """ # Import vectors and counts for cui in cdb.cui2context_vectors: diff --git a/medcat/cdb_maker.py b/medcat/cdb_maker.py index 67913f8be..fbe3b97a0 100644 --- a/medcat/cdb_maker.py +++ b/medcat/cdb_maker.py @@ -20,13 +20,13 @@ class CDBMaker(object): """Given a CSV as shown in https://github.com/CogStack/MedCAT/tree/master/examples/ it creates a CDB or - updates an exisitng one. + updates an existing one. Args: config (medcat.config.Config): Global config for MedCAT. cdb (medcat.cdb.CDB): - If set the `CDBMaker` will updat the existing `CDB` with + If set the `CDBMaker` will update the existing `CDB` with new concepts in the CSV (Default value `None`). """ diff --git a/medcat/config.py b/medcat/config.py index 858b5962f..a1dd15e78 100644 --- a/medcat/config.py +++ b/medcat/config.py @@ -132,7 +132,7 @@ def merge_config(self, config_dict: Dict) -> None: try: setattr(self, key, value) except AttributeError as err: - logger.warning('Issue with setting attribtue "%s":', key, exc_info=err) + logger.warning('Issue with setting attribute "%s":', key, exc_info=err) self.rebuild_re() def parse_config_file(self, path: str, extractor: ValueExtractor = _DEFAULT_EXTRACTOR) -> None: @@ -281,7 +281,7 @@ class CDBMaker(MixingConfig, BaseModel): name_versions: list = ['LOWER', 'CLEAN'] """Name versions to be generated.""" multi_separator: str = '|' - """If multiple names or type_ids for a concept present in one row of a CSV, they are separted + """If multiple names or type_ids for a concept present in one row of a CSV, they are separated by the character below.""" remove_parenthesis: int = 5 """Should preferred names with parenthesis be cleaned 0 means no, else it means if longer than or equal @@ -387,7 +387,7 @@ class General(MixingConfig, BaseModel): should not be used when annotating millions of documents. 
    If `None` it will be the string "concept", if `short` it will be CUI, if `long` it will be CUI | Name | Confidence"""
    map_cui_to_group: bool = False
-    """If the cdb.addl_info['cui2group'] is provided and this option enabled, each CUI will be maped to the group"""
+    """If the cdb.addl_info['cui2group'] is provided and this option enabled, each CUI will be mapped to the group"""
    simple_hash: bool = False
    """Whether to use a simple hash.
@@ -402,7 +402,7 @@ class Config:
class Preprocessing(MixingConfig, BaseModel):
    """The preprocessing part of the config"""
    words_to_skip: set = {'nos'}
-    """This words will be completly ignored from concepts and from the text (must be a Set)"""
+    """These words will be completely ignored from concepts and from the text (must be a Set)"""
    keep_punct: set = {'.', ':'}
    """All punct will be skipped by default, here you can set what will be kept"""
    do_not_normalize: set = {'VBD', 'VBG', 'VBN', 'VBP', 'JJS', 'JJR'}
@@ -411,7 +411,7 @@ class Preprocessing(MixingConfig, BaseModel):
    - https://spacy.io/usage/linguistic-features#pos-tagging
    - Label scheme section per model at https://spacy.io/models/en"""
    skip_stopwords: bool = False
-    """Should stopwords be skipped/ingored when processing input"""
+    """Should stopwords be skipped/ignored when processing input"""
    min_len_normalize: int = 5
    """Nothing below this length will ever be normalized (input tokens or concept names), normalized means lemmatized in this case"""
    stopwords: Optional[set] = None
@@ -433,7 +433,7 @@ class Ner(MixingConfig, BaseModel):
    min_name_len: int = 3
    """Do not detect names below this limit, skip them"""
    max_skip_tokens: int = 2
-    """When checkng tokens for concepts you can have skipped tokens inbetween
+    """When checking tokens for concepts you can have skipped tokens between
    used ones (usually spaces, new lines etc). This number tells you how many skipped can you have."""
    check_upper_case_names: bool = False
    """Check uppercase to distinguish uppercase and lowercase words that have a different meaning."""
@@ -467,13 +467,13 @@ def __eq__(self, other):
class LinkingFilters(MixingConfig, BaseModel):
    """These describe the linking filters used alongside the model.

-    When no CUIs nor exlcuded CUIs are specified (the sets are empty),
+    When no CUIs nor excluded CUIs are specified (the sets are empty),
    all CUIs are accepted.
    If there are CUIs specified then only those will be accepted.
    If there are excluded CUIs specified, they are excluded.

    In some cases, there are extra filters as well as MedCATtrainer (MCT) export filters.
-    These are expcted to follow the following:
+    These are expected to follow the following:
    extra_cui_filter ⊆ MCT filter ⊆ Model/config filter

    While any other CUIs can be included in the the extra CUI filter or the MCT filter,
@@ -555,10 +555,10 @@ class Linking(MixingConfig, BaseModel):
    """Concepts that have seen less training examples than this will not be used for
    similarity calculation and will have a similarity of -1."""
    always_calculate_similarity: bool = False
-    """Do we want to calculate context similarity even for concepts that are not ambigous."""
+    """Do we want to calculate context similarity even for concepts that are not ambiguous."""
    calculate_dynamic_threshold: bool = False
    """Concepts below this similarity will be ignored. Type can be static/dynamic - if dynamic each CUI has a different TH
-    and it is calcualted as the average confidence for that CUI * similarity_threshold. Take care that dynamic works only
+    and it is calculated as the average confidence for that CUI * similarity_threshold. Take care that dynamic works only
    if the cdb was trained with calculate_dynamic_threshold = True."""
    similarity_threshold_type: str = 'static'
    similarity_threshold: float = 0.25
@@ -569,14 +569,14 @@ class Linking(MixingConfig, BaseModel):
    prefer_primary_name: float = 0.35
    """If >0 concepts for which a detection is its primary name will be preferred by that amount (0 to 1)"""
    prefer_frequent_concepts: float = 0.35
-    """If >0 concepts that are more frequent will be prefered by a multiply of this amount"""
+    """If >0 concepts that are more frequent will be preferred by a multiple of this amount"""
    subsample_after: int = 30000
    """DISABLED in code permanetly: Subsample during unsupervised training if a concept has received more than"""
    devalue_linked_concepts: bool = False
    """When adding a positive example, should it also be treated as Negative for concepts
-    which link to the postive one via names (ambigous names)."""
+    which link to the positive one via names (ambiguous names)."""
    context_ignore_center_tokens: bool = False
-    """If true when the context of a concept is calculated (embedding) the words making that concept are not taken into accout"""
+    """If true when the context of a concept is calculated (embedding) the words making that concept are not taken into account"""

    class Config:
        extra = Extra.allow
@@ -612,7 +612,7 @@ def rebuild_re(self) -> None:
        # Some regex that we will need
        self.word_skipper = re.compile('^({})$'.format(
            '|'.join(self.preprocessing.words_to_skip)))
-        # Very agressive punct checker, input will be lowercased
+        # Very aggressive punct checker, input will be lowercased
        self.punct_checker = re.compile(r'[^a-z0-9]+')

    # Override
diff --git a/medcat/config_meta_cat.py b/medcat/config_meta_cat.py
index ab79259b5..ef8f908f2 100644
--- a/medcat/config_meta_cat.py
+++ b/medcat/config_meta_cat.py
@@ -42,7 +42,7 @@ class General(MixingConfig, BaseModel):
    batch_size_eval: int = 5000
    """Number of annotations to be meta-annotated at once in eval"""
    annotate_overlapping: bool = False
-    """If set meta_anns will be calcualted for doc._.ents, otherwise for doc.ents"""
+    """If set meta_anns will be calculated for doc._.ents, otherwise for doc.ents"""
    tokenizer_name: str = 'bbpe'
    """
    Tokenizer name used with MetaCAT.
diff --git a/medcat/config_rel_cat.py b/medcat/config_rel_cat.py
index d2426da93..dfa3b0099 100644
--- a/medcat/config_rel_cat.py
+++ b/medcat/config_rel_cat.py
@@ -86,7 +86,7 @@ class Model(MixingConfig, BaseModel):
    emb_grad: bool = True
    """If True the embeddings will also be trained"""
    ignore_cpos: bool = False
-    """If set to True center positions will be ignored when calculating represenation"""
+    """If set to True center positions will be ignored when calculating representation"""

    class Config:
        extra = Extra.allow
diff --git a/medcat/linking/context_based_linker.py b/medcat/linking/context_based_linker.py
index f2fde23f2..3740f380c 100644
--- a/medcat/linking/context_based_linker.py
+++ b/medcat/linking/context_based_linker.py
@@ -70,7 +70,7 @@ def __call__(self, doc: Doc) -> Doc:
            if len(name) >= cnf_l.disamb_length_limit:
                if len(cuis) == 1:
-                    # N - means name must be disambiguated, is not the prefered
+                    # N - means name must be disambiguated, is not the preferred
                    #name of the concept, links to other concepts also.
                    if self.cdb.name2cuis2status[name][cuis[0]] != 'N':
                        self._train(cui=cuis[0], entity=entity, doc=doc)
diff --git a/medcat/linking/vector_context_model.py b/medcat/linking/vector_context_model.py
index e4875c32f..eea391419 100644
--- a/medcat/linking/vector_context_model.py
+++ b/medcat/linking/vector_context_model.py
@@ -102,7 +102,7 @@ def similarity(self, cui: str, entity: Span, doc: Doc) -> float:
            doc (Doc):
                The document to look in.

        Returns:
-            float: The simularity.
+            float: The similarity.
        """
        vectors = self.get_context_vectors(entity, doc)
        sim = self._similarity(cui, vectors)
diff --git a/medcat/meta_cat.py b/medcat/meta_cat.py
index cff63ed9f..8c73e6178 100644
--- a/medcat/meta_cat.py
+++ b/medcat/meta_cat.py
@@ -456,7 +456,7 @@ def prepare_document(self, doc: Doc, input_ids: List, offset_mapping: List, lowe
            input_ids (List):
                Input ids
            offset_mapping (List):
-                Offset mapings
+                Offset mappings
            lowercase (bool):
                Whether to use lower case replace center
@@ -475,7 +475,7 @@ def prepare_document(self, doc: Doc, input_ids: List, offset_mapping: List, lowe
        samples = []
        last_ind = 0
-        ent_id2ind = {}  # Map form entitiy ID to where is it in the samples array
+        ent_id2ind = {}  # Map from entity ID to where it is in the samples array
        for ent in sorted(ents, key=lambda ent: ent.start_char):
            start = ent.start_char
            end = ent.end_char
diff --git a/medcat/ner/transformers_ner.py b/medcat/ner/transformers_ner.py
index 6b58cfec6..32eb23520 100644
--- a/medcat/ner/transformers_ner.py
+++ b/medcat/ner/transformers_ner.py
@@ -184,7 +184,7 @@ def train(self,
        # NOTE: The following is for backwards comppatibility
        # in datasets==2.20.0 `trust_remote_code=True` must be explicitly
        # specified, otherwise an error is raised.
-        # On the other hand, the keyword argumnet was added in datasets==2.16.0
+        # On the other hand, the keyword argument was added in datasets==2.16.0
        # yet we support datasets>=2.2.0.
        # So we need to use the kwarg if applicable and omit its use otherwise.
        if func_has_kwarg(datasets.load_dataset, 'trust_remote_code'):
@@ -196,7 +196,7 @@ def train(self,
                                           split='train',
                                           cache_dir='/tmp/')
        # We split before encoding so the split is document level, as encoding
-        #does the document spliting into max_seq_len
+        #does the document splitting into max_seq_len
        dataset = dataset.train_test_split(test_size=self.config.general['test_size'])  # type: ignore

        # Update labelmap in case the current dataset has more labels than what we had before
@@ -330,7 +330,7 @@ def load(cls, save_dir_path: str, config_dict: Optional[Dict] = None) -> "Transf
        config = cast(ConfigTransformersNER, ConfigTransformersNER.load(os.path.join(save_dir_path, 'cat_config.json')))
        config.general['model_name'] = save_dir_path

-        # Overwrite loaded paramters with something new
+        # Overwrite loaded parameters with something new
        if config_dict is not None:
            config.merge_config(config_dict)
diff --git a/medcat/ner/vocab_based_annotator.py b/medcat/ner/vocab_based_annotator.py
index 457387371..f195a8197 100644
--- a/medcat/ner/vocab_based_annotator.py
+++ b/medcat/ner/vocab_based_annotator.py
@@ -124,7 +124,7 @@ def CheckAnnotation(self, name, tkns, doc, to_disamb, doc_words):
            if self.cdb.name2cui2status[name][cui] == 'P':
                # Means this name should be used for training as it nearly always links to
                #the concept with this CUI
-                return cui  # Break the loop, one name marked with 'P' linkes to max 1 concept
+                return cui  # Break the loop, one name marked with 'P' links to max 1 concept

        return None
    else:
@@ -169,7 +169,7 @@ def CheckAnnotation(self, name, tkns, doc, to_disamb, doc_words):
            cui = list(self.cdb.name2cui[name])[0]
            self._cat._add_ann(cui, doc, tkns, acc=1, name=name)
        elif self._cat.train and name in self.pref_names and len(name) > 3:
-            # If training use prefered names as ground truth
+            # If training use preferred names as ground truth
            cuis = self.cdb.name2cui[name]
            for cui in cuis:
                if name == self.cdb.cui2pref_name.get(cui, 'nan-nan'):
diff --git a/medcat/ner/vocab_based_ner.py b/medcat/ner/vocab_based_ner.py
index 73737c4f0..97a24dca1 100644
--- a/medcat/ner/vocab_based_ner.py
+++ b/medcat/ner/vocab_based_ner.py
@@ -23,7 +23,7 @@ def __init__(self, cdb: CDB, config: Config) -> None:
    # Override
    def __call__(self, doc: Doc) -> Doc:
        """Detect candidates for concepts - linker will then be able to do the rest. It adds `entities` to the
-        doc._.ents and each entity can have the entitiy._.link_candidates - that the linker will resolve.
+        doc._.ents and each entity can have the entity._.link_candidates - that the linker will resolve.

        Args:
            doc (Doc):
diff --git a/medcat/pipe.py b/medcat/pipe.py
index 65f552c30..f038b0eb4 100644
--- a/medcat/pipe.py
+++ b/medcat/pipe.py
@@ -59,7 +59,7 @@ def __init__(self, tokenizer: Tokenizer, config: Config) -> None:
                           "packaging and using your model pack with "
                           "the spacy model it was designed for",
                           config.general.spacy_model, exc_info=e)
-            # we're changing the config value so that this propages
+            # we're changing the config value so that this propagates
            # to other places that try to load the model. E.g:
            # medcat.utils.normalizers.TokenNormalizer.__init__
            ensure_spacy_model(DEFAULT_SPACY_MODEL)
@@ -128,7 +128,7 @@ def add_ner(self, ner: NER, name: Optional[str] = None) -> None:
        Span.set_extension('confidence', default=-1, force=True)
        Span.set_extension('id', default=0, force=True)

-        # Do not set this property if a vocabulary apporach is not used, this name must
+        # Do not set this property if a vocabulary approach is not used, this name must
        #refer to a name2cuis in the cdb.
        Span.set_extension('detected_name', default=None, force=True)
        Span.set_extension('link_candidates', default=None, force=True)
diff --git a/medcat/preprocessing/cleaners.py b/medcat/preprocessing/cleaners.py
index be4b844ae..946f48c2c 100644
--- a/medcat/preprocessing/cleaners.py
+++ b/medcat/preprocessing/cleaners.py
@@ -13,7 +13,7 @@ def prepare_name(raw_name: str, nlp: Language, names: Dict, config: Config) -> D
    Args:
        raw_name (str):
-            Thre raw name to prepare.
+            The raw name to prepare.
        nlp (Language):
            Spacy nlp model.
        names (Dict):
diff --git a/medcat/stats/kfold.py b/medcat/stats/kfold.py
index e6ae2013d..c4162b9ed 100644
--- a/medcat/stats/kfold.py
+++ b/medcat/stats/kfold.py
@@ -137,7 +137,7 @@ def create_folds(self) -> List[MedCATTrainerExport]:
        """Create folds.

        Raises:
-            ValueError: If somethign went wrong.
+            ValueError: If something went wrong.

        Returns:
            List[MedCATTrainerExport]: The created folds.
diff --git a/medcat/stats/stats.py b/medcat/stats/stats.py
index e467e0519..5a596e938 100644
--- a/medcat/stats/stats.py
+++ b/medcat/stats/stats.py
@@ -292,7 +292,7 @@ def get_stats(cat,
            Each project in MedCATtrainer can have filters, do we want to respect those filters
            when calculating metrics.
        use_overlaps (bool):
-            Allow overlapping entities, nearly always False as it is very difficult to annotate overlapping entites.
+            Allow overlapping entities, nearly always False as it is very difficult to annotate overlapping entities.
        use_cui_doc_limit (bool):
            If True the metrics for a CUI will be only calculated if that CUI appears in a document, in other words
            if the document was annotated for that CUI. Useful in very specific situations when during the annotation
diff --git a/medcat/tokenizers/meta_cat_tokenizers.py b/medcat/tokenizers/meta_cat_tokenizers.py
index 93d8b51ed..4c4daf200 100644
--- a/medcat/tokenizers/meta_cat_tokenizers.py
+++ b/medcat/tokenizers/meta_cat_tokenizers.py
@@ -102,7 +102,7 @@ def __call__(self, text: Union[str, List[str]]) -> Union[Dict, List[Dict]]:
            return output
        else:
-            raise Exception("Unsuported input type, supported: text/list, but got: {}".format(type(text)))
+            raise Exception("Unsupported input type, supported: text/list, but got: {}".format(type(text)))

    def save(self, dir_path: str) -> None:
        self.hf_tokenizers = self.ensure_tokenizer()
@@ -179,7 +179,7 @@ def __call__(self, text: Union[str, List[str]]) -> Union[Dict, List[Dict]]:
                })
            return output
        else:
-            raise Exception("Unsuported input type, supported: text/list, but got: {}".format(type(text)))
+            raise Exception("Unsupported input type, supported: text/list, but got: {}".format(type(text)))

    def save(self, dir_path: str) -> None:
        self.hf_tokenizers = self.ensure_tokenizer()
diff --git a/medcat/utils/cdb_state.py b/medcat/utils/cdb_state.py
index 794a40109..0b43abc3b 100644
--- a/medcat/utils/cdb_state.py
+++ b/medcat/utils/cdb_state.py
@@ -95,7 +95,7 @@ def apply_cdb_state(cdb, state: CDBState) -> None:

def load_and_apply_cdb_state(cdb, file_path: str) -> None:
    """Delete current CDB state and apply CDB state from file.
-    This first delets the current state of the CDB.
+    This first deletes the current state of the CDB.
    This is to save memory. The idea is that saving the staet
    on disk will save on RAM usage. But it wouldn't really work
    too well if upon load, two instances were still in
diff --git a/medcat/utils/data_utils.py b/medcat/utils/data_utils.py
index 4cabd3a2a..90ef1d345 100644
--- a/medcat/utils/data_utils.py
+++ b/medcat/utils/data_utils.py
@@ -43,7 +43,7 @@ def load_data(data_path: str, require_annotations: bool = True, order_by_num_ann
        require_annotations (bool):
            This will require anns but on project level, any doc in a project needs anns.
        order_by_num_ann (bool):
-            Whether to order by number of annoations. Defaults to True.
+            Whether to order by number of annotations. Defaults to True.

    Returns:
        Dict: The loaded data.
@@ -415,7 +415,7 @@ def consolidate_double_annotations(data_path: str, out_path: str, require_double
    Args:
        data_path (str):
-            Output from MedCATtrainer - projects containig the same documents must have the same name.
+            Output from MedCATtrainer - projects containing the same documents must have the same name.
        out_path (str):
            The consolidated data will be saved here - usually only annotations where both annotators agree
        require_double (bool):
@@ -430,7 +430,7 @@ def consolidate_double_annotations(data_path: str, out_path: str, require_double
            level will be checked.

    Returns:
-        Dict: The consolidated annoation.
+        Dict: The consolidated annotation.
    """
    d_stats_proj: Dict = {}
    data: Dict = load_data(data_path, require_annotations=True)
diff --git a/medcat/utils/make_vocab.py b/medcat/utils/make_vocab.py
index 79bb86dd7..70e9fa111 100644
--- a/medcat/utils/make_vocab.py
+++ b/medcat/utils/make_vocab.py
@@ -23,7 +23,7 @@ class MakeVocab(object):
        vocab (medcat.vocab.Vocab, optional):
            Vocabulary to be extended, leave as None if you want to make a new Vocab. Default: None
        word_tokenizer ():
-            A custom tokenizer for word spliting - used if embeddings are BERT or similar.
+            A custom tokenizer for word splitting - used if embeddings are BERT or similar.
            Default: None
    Examples:
        To make a vocab and train word embeddings.
diff --git a/medcat/utils/memory_optimiser.py b/medcat/utils/memory_optimiser.py
index 2242984b2..578e378e6 100644
--- a/medcat/utils/memory_optimiser.py
+++ b/medcat/utils/memory_optimiser.py
@@ -269,7 +269,7 @@ def perform_optimisation(cdb: CDB, optimise_cuis: bool = True,
            Used for dynamic thresholding. Holds the average confidence for this CUI given the training examples.
        name2cuis (Dict[str, List[str]]):
-            Map fro concept name to CUIs - one name can map to multiple CUIs.
+            Map from concept name to CUIs - one name can map to multiple CUIs.
        name2cuis2status (Dict[str, Dict[str, str]]):
            What is the status for a given name and cui pair - each name can be:
                P - Preferred, A - Automatic (e.g. let medcat decide), N - Not common.
diff --git a/medcat/utils/meta_cat/data_utils.py b/medcat/utils/meta_cat/data_utils.py
index c4dc5f9c2..17059d7f4 100644
--- a/medcat/utils/meta_cat/data_utils.py
+++ b/medcat/utils/meta_cat/data_utils.py
@@ -29,7 +29,7 @@ def prepare_from_json(data: Dict,
        replace_center (Optional[str]):
            If not None the center word (concept) will be replaced with whatever this is.
        prerequisites (Dict):
-            A map of prerequisities, for example our data has two meta-annotations (experiencer, negation). Assume I want to create
+            A map of prerequisites, for example our data has two meta-annotations (experiencer, negation). Assume I want to create
            a dataset for `negation` but only in those cases where `experiencer=patient`, my prerequisites would be:
            {'Experiencer': 'Patient'} - Take care that the CASE has to match whatever is in the data. Defaults to `{}`.
        lowercase (bool):
@@ -53,7 +53,7 @@ def prepare_from_json(data: Dict,
            doc_text = tokenizer(text)
            for ann in document.get('annotations', document.get('entities',
-                                                                {}).values()):  # A hack to suport entities and annotations
+                                                                {}).values()):  # A hack to support entities and annotations
                cui = ann['cui']
                skip = False
                if 'meta_anns' in ann and prerequisites:
@@ -154,8 +154,8 @@ def prepare_for_oversampled_data(data: List,
def encode_category_values(data: Dict,
                           existing_category_value2id: Optional[Dict] = None,
                           category_undersample=None) -> Tuple:
-    """Converts the category values in the data outputed by `prepare_from_json`
-    into integere values.
+    """Converts the category values in the data outputted by `prepare_from_json`
+    into integer values.

    Args:
        data (Dict):
@@ -243,7 +243,7 @@ def json_to_fake_spacy(data: Dict, id2text: Dict) -> Iterable:
    Args:
        data(Dict):
-            Output from cat formated as: {<...>: <...>}
+            Output from cat formatted as: {<...>: <...>}
diff --git a/medcat/utils/meta_cat/models.py b/medcat/utils/meta_cat/models.py
@@ ... @@ def __init__(self, ...) -> None:
        # Disable training for the embeddings - IMPORTANT
        self.embeddings.weight.requires_grad = config.model['emb_grad']
-        # Create the RNN cell - devide
+        # Create the RNN cell - divide
        self.rnn = nn.LSTM(input_size=config.model['input_size'],
                           hidden_size=config.model['hidden_size'] // config.model['num_directions'],
                           num_layers=config.model['num_layers'],
diff --git a/medcat/utils/ml_utils.py b/medcat/utils/ml_utils.py
index 0a7521218..af3e0807c 100644
--- a/medcat/utils/ml_utils.py
+++ b/medcat/utils/ml_utils.py
@@ -3,7 +3,7 @@ def get_lr_linking(config, cui_count):
        return config.linking['optim']['lr']
    elif config.linking['optim']['type'] == 'linear':
        lr = config.linking['optim']['base_lr']
-        cui_count += 1  # Just in case incrase by 1
+        cui_count += 1  # Just in case increase by 1
        return max(lr / cui_count, config.linking['optim']['min_lr'])
    else:
        raise Exception("Optimizer not implemented")
@@ -63,7 +63,7 @@ def build_vocab_from_hf(model_name, hf_tokenizer, vocab):
                vec = embs[i]
                vocab.add_word(word=tkn, vec=vec, replace=True)

-        # Crate the new unigram table
+        # Create the new unigram table
        vocab.reset_counts()
        vocab.make_unigram_table()
    except Exception:
diff --git a/medcat/utils/ner/deid.py b/medcat/utils/ner/deid.py
index 688bb1ea6..8a4310a2f 100644
--- a/medcat/utils/ner/deid.py
+++ b/medcat/utils/ner/deid.py
@@ -52,7 +52,7 @@ class DeIdModel(NerModel):
    This wraps a CAT instance and simplifies its use as a
    de-identification model.

-    It provies methods for creating one from a TransformersNER
+    It provides methods for creating one from a TransformersNER
    as well as loading from a model pack (along with some validation).

    It also exposes some useful parts of the CAT it wraps such as
diff --git a/medcat/utils/ner/model.py b/medcat/utils/ner/model.py
index 0e17ca6db..23d32c658 100644
--- a/medcat/utils/ner/model.py
+++ b/medcat/utils/ner/model.py
@@ -15,7 +15,7 @@ class NerModel:
    This wraps a CAT instance and simplifies its use as a NER model.

-    It provies methods for creating one from a TransformersNER
+    It provides methods for creating one from a TransformersNER
    as well as loading from a model pack (along with some validation).
    It also exposes some useful parts of the CAT it wraps such as
diff --git a/medcat/utils/postprocessing.py b/medcat/utils/postprocessing.py
index c5df429f6..9fce764f6 100644
--- a/medcat/utils/postprocessing.py
+++ b/medcat/utils/postprocessing.py
@@ -40,7 +40,7 @@ def create_main_ann(cdb: CDB, doc: Doc, tuis: Optional[List] = None) -> None:
    from all the annotations for this document.

    Args:
-        cdb (CDB): The Context Databse.
+        cdb (CDB): The Context Database.
        doc (Doc): Spacy document.
        tuis (Optional[List], optional): The type IDs. Defaults to None.
    """
diff --git a/medcat/utils/preprocess_snomed.py b/medcat/utils/preprocess_snomed.py
index 20409a481..ac78548a7 100644
--- a/medcat/utils/preprocess_snomed.py
+++ b/medcat/utils/preprocess_snomed.py
@@ -72,7 +72,7 @@ class Snomed:
        release (str): Release of SNOMED CT folder.
        uk_ext (bool, optional): Specifies whether the version is a SNOMED UK extension released after 2021. Defaults to False.
        uk_drug_ext (bool, optional): Specifies whether the version is a SNOMED UK drug extension. Defaults to False.
-        au_ext (bool, optional): Specifies wether the version is a AU release. Defaults to False.
+        au_ext (bool, optional): Specifies whether the version is a AU release. Defaults to False.
    """

    def __init__(self, data_path, uk_ext=False, uk_drug_ext=False, au_ext: bool = False):
@@ -84,7 +84,7 @@ def __init__(self, data_path, uk_ext=False, uk_drug_ext=False, au_ext: bool = Fa
        if ((self.uk_ext or self.uk_drug_ext) and
                # using lexicographical comparison below
                # e.g "20240101" > "20231122" results in True
-                # yet "20231121" > "20231122" reults in False
+                # yet "20231121" > "20231122" results in False
                len(self.release) == len("20231122") and
                self.release >= "20231122"):
            # NOTE for UK extensions starting from 20231122 the
            # OPCS4 refset ID seems to be different
diff --git a/medcat/utils/preprocess_umls.py b/medcat/utils/preprocess_umls.py
index 7c47f451a..2cd684c4b 100644
--- a/medcat/utils/preprocess_umls.py
+++ b/medcat/utils/preprocess_umls.py
@@ -108,9 +108,9 @@ def to_concept_df(self) -> pd.DataFrame:
        df = df.rename(columns=medcat_csv_mapper)

-        # pop all unneccessary columns
-        # all initial collumns should have been renamed
+        # pop all unnecessary columns
+        # all initial columns should have been renamed
        for col_name in self.main_columns + self.sem_types_columns:
            if col_name in df.columns:
                df.pop(col_name)
diff --git a/medcat/utils/relation_extraction/rel_dataset.py b/medcat/utils/relation_extraction/rel_dataset.py
index 714cccd30..ce2f26f6b 100644
--- a/medcat/utils/relation_extraction/rel_dataset.py
+++ b/medcat/utils/relation_extraction/rel_dataset.py
@@ -21,7 +21,7 @@ class RelData(Dataset):
    def __init__(self, tokenizer: TokenizerWrapperBERT, config: ConfigRelCAT, cdb: CDB = CDB()):
        """ Use this class to create a dataset for relation annotations from CSV exports,
        MedCAT exports or Spacy Documents (assuming the documents got generated by MedCAT,
-        if they did not then please set the required paramenters manually to match MedCAT output,
+        if they did not then please set the required parameters manually to match MedCAT output,
        see /medcat/cat.py#_add_nested_ent)

        If you are using this to create relations from CSV it is assumed that your entities/concepts of
diff --git a/medcat/utils/relation_extraction/tokenizer.py b/medcat/utils/relation_extraction/tokenizer.py
index af3db5145..2256993e4 100644
--- a/medcat/utils/relation_extraction/tokenizer.py
+++ b/medcat/utils/relation_extraction/tokenizer.py
@@ -46,7 +46,7 @@ def __call__(self, text, truncation: Optional[bool] = True):
            return output
        else:
            raise Exception(
-                "Unsuported input type, supported: text/list, but got: {}".format(type(text)))
+                "Unsupported input type, supported: text/list, but got: {}".format(type(text)))

    def save(self, dir_path):
        path = os.path.join(dir_path, self.name)
diff --git a/medcat/utils/relation_extraction/utils.py b/medcat/utils/relation_extraction/utils.py
index b0a2b8094..544905ca4 100644
--- a/medcat/utils/relation_extraction/utils.py
+++ b/medcat/utils/relation_extraction/utils.py
@@ -208,7 +208,7 @@ def put_blanks(relation_data: List, blanking_threshold: float = 0.5) -> List:

def create_tokenizer_pretrain(tokenizer: TokenizerWrapperBERT, tokenizer_path: str):
    """
-    This method simply adds special tokens that we enouncter
+    This method simply adds special tokens that we encounter

    Args:
        tokenizer (TokenizerWrapperBERT): BERT tokenizer.
diff --git a/medcat/utils/saving/coding.py b/medcat/utils/saving/coding.py
index 89f9c0651..21b9f02d1 100644
--- a/medcat/utils/saving/coding.py
+++ b/medcat/utils/saving/coding.py
@@ -92,7 +92,7 @@ def try_encode(self, obj):

class PatternDecoder(PartDecoder):

    def try_decode(self, dct: dict) -> Union[dict, re.Pattern]:
-        """Decode re.Patttern from input dicts.
+        """Decode re.Pattern from input dicts.

        Args:
            dct (dict): The input dict
diff --git a/medcat/utils/saving/envsnapshot.py b/medcat/utils/saving/envsnapshot.py
index cf95d2ca1..299169757 100644
--- a/medcat/utils/saving/envsnapshot.py
+++ b/medcat/utils/saving/envsnapshot.py
@@ -25,18 +25,18 @@

def get_direct_dependencies() -> Set[str]:
-    """Get the set of direct dependeny names.
+    """Get the set of direct dependency names.

    The current implementation reads install_requires.txt for dependenceies,
    removes comments, whitespace, quotes; removes the versions and returns
    the names as a set.

    Returns:
-        Set[str]: The set of direct dependeny names.
+        Set[str]: The set of direct dependency names.
    """
    req_file = INSTALL_REQUIRES_FILE_PATH
    if not os.path.exists(req_file):
-        # When pip-installed. See note above near constant definiation
+        # When pip-installed. See note above near constant definition
        req_file = INSTALL_REQUIRES_FILE_PATH_PIP
    with open(req_file) as f:
        # read every line, strip quotes and comments
diff --git a/medcat/utils/saving/serializer.py b/medcat/utils/saving/serializer.py
index 22289b58b..a03af8097 100644
--- a/medcat/utils/saving/serializer.py
+++ b/medcat/utils/saving/serializer.py
@@ -1,4 +1,4 @@
-"""This modlue is responsible for the (new) methods of saving and loading parts of MedCAT.
+"""This module is responsible for the (new) methods of saving and loading parts of MedCAT.

The idea is to move away from saving medcat files using the dill/pickle.
And to save as well as load them in some other way.
@@ -36,7 +36,7 @@ def __init__(self, folder: str, name: str) -> None:
        if not os.path.exists(folder):
            os.makedirs(folder)
        elif not os.path.isdir(folder):
-            raise ValueError(f'Folder expected, got fille: {folder}')
+            raise ValueError(f'Folder expected, got file: {folder}')
        if os.path.isdir(self.file_name):
            raise ValueError(
                f'Expected file, found folder: {self.file_name}')
@@ -111,7 +111,7 @@ def serialize(self, cdb, overwrite: bool = False) -> None:
        If `json_path` was specified to the constructor, this will serialize
        some of the parts that take up more memory in JSON files in said directory.
        In that case, the rest of the info is saved into the `main_path` passed to
-        the consturctor
+        the constructor

        Otherwise, everything is saved to the `main_path` using `dill.dump`
        just like in previous cases.
diff --git a/medcat/utils/spacy_compatibility.py b/medcat/utils/spacy_compatibility.py
index a64737f21..687dcdec8 100644
--- a/medcat/utils/spacy_compatibility.py
+++ b/medcat/utils/spacy_compatibility.py
@@ -1,4 +1,4 @@
-"""This module attempts to read the spacy compatibilty of
+"""This module attempts to read the spacy compatibility of
a model pack and (if necessary) compare it to the
installed spacy version.
"""
@@ -65,7 +65,7 @@ def get_installed_spacy_version() -> str:
    """Get the spacy version installed currently.

    Returns:
-        str: The currently installed spacy verison.
+        str: The currently installed spacy version.
    """
    return spacy.__version__
diff --git a/medcat/utils/versioning.py b/medcat/utils/versioning.py
index 09c53d4fd..005213421 100644
--- a/medcat/utils/versioning.py
+++ b/medcat/utils/versioning.py
@@ -168,13 +168,13 @@ def _get_relevant_files(self, ignore_hidden: bool = True) -> List[str]:
        return [os.path.join(self.model_pack_path, fn)  # ignores hidden files
                for fn in os.listdir(self.model_pack_path)
                if (ignore_hidden and not fn.startswith("."))]

-    def _check_existance(self, files_to_copy: List[str], new_path: str, overwrite: bool):
+    def _check_existence(self, files_to_copy: List[str], new_path: str, overwrite: bool):
        if overwrite:
            return  # ignore all
        if not os.path.exists(new_path):
            os.makedirs(new_path)
            return  # all good, new folder
-        # check file existance in new (existing) path
+        # check file existence in new (existing) path
        for file_to_copy in files_to_copy:
            new_file_name = os.path.join(
                new_path, os.path.basename(file_to_copy))
@@ -216,13 +216,13 @@ def upgrade(self, new_path: str, overwrite: bool = False) -> None:
            IncorrectModel: If model pack does not need an upgrade
        """
        if not self.needs_upgrade():
-            raise IncorrectModel(f"Model pack does not need ugprade: {self.model_pack_path} "
+            raise IncorrectModel(f"Model pack does not need upgrade: {self.model_pack_path} "
                                 f"since it's at version: {self.current_version}")
        logger.info("Starting to upgrade %s at (version %s)",
                    self.model_pack_path, self.current_version)
        files_to_copy = self._get_relevant_files()
        try:
-            self._check_existance(files_to_copy, new_path, overwrite)
+            self._check_existence(files_to_copy, new_path, overwrite)
        except ValueError as e:
            raise e
        logger.debug("Copying files from %s", self.model_pack_path)
diff --git a/medcat/vocab.py b/medcat/vocab.py
index 53af0eb2e..56bd1e0d9 100644
--- a/medcat/vocab.py
+++ b/medcat/vocab.py
@@ -25,7 +25,7 @@ def __init__(self) -> None:
        self.unigram_table: np.ndarray = np.array([])

    def inc_or_add(self, word: str, cnt: int = 1, vec: Optional[np.ndarray] = None) -> None:
-        """Add a word or incrase its count.
+        """Add a word or increase its count.

        Args:
            word(str):
@@ -77,7 +77,7 @@ def inc_wc(self, word: str, cnt: int = 1) -> None:
            word(str):
                For which word to increase the count
            cnt(int):
-                By how muhc to incrase the count (Default value = 1)
+                By how much to increase the count (Default value = 1)
        """
        self.item(word)['cnt'] += cnt