From 394e17b5723c31d6e9157cf134fffdd247eba109 Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Tue, 17 Sep 2024 10:07:00 +0300 Subject: [PATCH] CU-8695m5q4x: Fix issues detecting 1-token concepts (#485) --- medcat/ner/vocab_based_ner.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/medcat/ner/vocab_based_ner.py b/medcat/ner/vocab_based_ner.py index 97a24dca..259699ff 100644 --- a/medcat/ner/vocab_based_ner.py +++ b/medcat/ner/vocab_based_ner.py @@ -42,13 +42,21 @@ def __call__(self, doc: Doc) -> Doc: name_versions = [tkn._.norm, tkn.lower_] name = "" + nv_in_snames = [] + nv_in_names = [] for name_version in name_versions: + # NOTE: if the entire token is an actual concept, we want to capture that + # previous implementation could fail in those cases if name_version in self.cdb.snames: - if name: - name = name + self.config.general.separator + name_version - else: - name = name_version - break + nv_in_snames.append(name_version) + if name_version in self.cdb.name2cuis: + nv_in_names.append(name_version) + if nv_in_names: + # TODO: should we prefer 0th (i.e the normalised version) or last (the lower case version) + name = nv_in_names[0] + elif nv_in_snames: + # TODO: should we prefer 0th (i.e the normalised version) or last (the lower case version) + name = nv_in_snames[0] if name in self.cdb.name2cuis and not tkn.is_stop: maybe_annotate_name(name, tkns, doc, self.cdb, self.config)