From b139738bfaa1a8986514f67bd9ad8968bf8c6ce7 Mon Sep 17 00:00:00 2001
From: Jan Trienes
Date: Mon, 14 Dec 2020 08:00:43 +0100
Subject: [PATCH 1/9] Update all experiment dependencies

---
 environment.yml | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/environment.yml b/environment.yml
index 5128a01..8136f4d 100644
--- a/environment.yml
+++ b/environment.yml
@@ -2,25 +2,25 @@ name: deidentify
 channels:
   - conda-forge
 dependencies:
-  - python=3.7.2
-  - pip=19.1
-  - tqdm=4.29.1
-  - pandas=0.23.4
-  - matplotlib=3.0.2
-  - seaborn=0.9.0
-  - scikit-learn=0.20.3
+  - python=3.7.9
+  - pip=20.3.1
+  - tqdm=4.54.1
+  - pandas=1.1.3
+  - matplotlib=3.3.2
+  - seaborn=0.11.0
+  - scikit-learn=0.23.2
   - unidecode=1.0.23
   - pyyaml=5.1
   - joblib=0.13.2
   - pip:
-    - spacy==2.2.1
-    - https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz#egg=en_core_web_sm==2.2.0
-    - https://github.com/explosion/spacy-models/releases/download/nl_core_news_sm-2.2.1/nl_core_news_sm-2.2.1.tar.gz#egg=nl_core_news_sm==2.2.1
+    - spacy==2.3.5
+    - https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz#egg=en_core_web_sm==2.3.1
+    - https://github.com/explosion/spacy-models/releases/download/nl_core_news_sm-2.3.0/nl_core_news_sm-2.3.0.tar.gz#egg=nl_core_news_sm==2.3.0
    - deduce==1.0.2
     - py-dateinfer==0.4.5
-    - loguru==0.4.0
-    - nameparser==1.0.2
+    - loguru==0.5.3
+    - nameparser==1.0.6
     - sklearn-crfsuite==0.3.6
-    - flair==0.6.0.post1
+    - flair==0.7
     - requests
-    - torch==1.6.0
+    - torch==1.7.1

From 5a5c1ac64e1fc03cb59b5132761860d342788017 Mon Sep 17 00:00:00 2001
From: Jan Trienes
Date: Mon, 14 Dec 2020 08:01:27 +0100
Subject: [PATCH 2/9] Fix tests to accommodate change in spaCy sentence tokenizer

---
 tests/methods/test_flair_utils.py     | 36 ++++++++++++++++++---------
 tests/tokenizer/test_tokenizer_ons.py | 17 +++++++------
 2 files changed, 34 insertions(+), 19 deletions(-)

diff --git a/tests/methods/test_flair_utils.py b/tests/methods/test_flair_utils.py
index 5ea23d5..0ffa861 100644
--- a/tests/methods/test_flair_utils.py
+++ b/tests/methods/test_flair_utils.py
@@ -12,32 +12,44 @@ def test_standoff_to_flair_sents():
     docs = corpus.train
     sents, parsed_docs = flair_utils.standoff_to_flair_sents(docs, tokenizer)
 
-    assert len(sents) == 10
-    assert len(parsed_docs) == 10
+    assert len(sents) == 14
+    assert len(parsed_docs) == 14
 
     bio_tags = [token.get_tag('ner').value for token in sents[0]]
     token_texts = [token.text for token in sents[0]]
-
     assert token_texts == [
         'Linders',
         ',',
         'Xandro',
-        '<',
-        't.njg.nmmeso@rcrmb.nl',
+        '<'
+    ]
+    assert bio_tags == [
+        'B-Name',
+        'I-Name',
+        'I-Name',
+        'O'
+    ]
+
+    bio_tags = [token.get_tag('ner').value for token in sents[1]]
+    token_texts = [token.text for token in sents[1]]
+    assert token_texts == [
+        't.njg.nmmeso@rcrmb.nl'
+    ]
+    assert bio_tags == [
+        'B-Email'
+    ]
+
+    bio_tags = [token.get_tag('ner').value for token in sents[2]]
+    token_texts = [token.text for token in sents[2]]
+    assert token_texts == [
         '>',
         '',
         '07',
         'apr',
         '.',
-        '',
+        ''
     ]
-
     assert bio_tags == [
-        'B-Name',
-        'I-Name',
-        'I-Name',
-        'O',
-        'B-Email',
         'O',
         'O',
         'B-Date',
diff --git a/tests/tokenizer/test_tokenizer_ons.py b/tests/tokenizer/test_tokenizer_ons.py
index a634131..78b0058 100644
--- a/tests/tokenizer/test_tokenizer_ons.py
+++ b/tests/tokenizer/test_tokenizer_ons.py
@@ -2,30 +2,33 @@
 
 tokenizer = TokenizerOns()
 
+
 def test_tokenizer():
     text = '=== Answer: 1234 ===\ntest a b c d.\n=== Report: 1234 ===\nMw. test test test'
     doc = tokenizer.parse_text(text)
     tokens = [t.text for t in doc]
-    assert tokens == ['=== Answer: 1234 ===\n', 'test', 'a', 'b', 'c',
-                      'd.', '\n', '=== Report: 1234 ===\n', 'Mw.', 'test', 'test', 'test']
+    assert tokens == [
+        '=== Answer: 1234 ===\n', 'test', 'a', 'b', 'c', 'd.', '\n', '=== Report: 1234 ===\n',
+        'Mw.', 'test', 'test', 'test'
+    ]
 
 
 def test_sentence_segmentation():
-    text = '=== Answer: 1234 ===\ntest a b c d.\n=== Report: 1234 ===\nMw. test test test'
+    text = '=== Answer: 1234 ===\nDit is een zin.\n=== Report: 1234 ===\nMw. heeft goed gegeten.'
     doc = tokenizer.parse_text(text)
     sents = [sent.text for sent in doc.sents]
 
     assert sents == [
         '=== Answer: 1234 ===\n',
-        'test a b c d.\n',
+        'Dit is een zin.\n',
         '=== Report: 1234 ===\n',
-        'Mw. test test test'
+        'Mw. heeft goed gegeten.'
     ]
 
     sents = list(doc.sents)
     assert [token.text for token in sents[0]] == ['=== Answer: 1234 ===\n']
-    assert [token.text for token in sents[1]] == ['test', 'a', 'b', 'c', 'd.', '\n']
+    assert [token.text for token in sents[1]] == ['Dit', 'is', 'een', 'zin', '.', '\n']
     assert [token.text for token in sents[2]] == ['=== Report: 1234 ===\n']
-    assert [token.text for token in sents[3]] == ['Mw.', 'test', 'test', 'test']
+    assert [token.text for token in sents[3]] == ['Mw.', 'heeft', 'goed', 'gegeten', '.']

From 01adbe6629f6bb80d12115772a4927af87ce80be Mon Sep 17 00:00:00 2001
From: Jan Trienes
Date: Tue, 15 Dec 2020 13:20:10 +0100
Subject: [PATCH 3/9] Use corpus-specific tokenizer in evaluation

The Ons tokenizer has a few rules that accommodate domain-specific
tokenization issues. We re-use this tokenizer here so that the token-based
evaluation is not negatively impacted.
---
 deidentify/evaluation/evaluator.py | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/deidentify/evaluation/evaluator.py b/deidentify/evaluation/evaluator.py
index 0871335..18fe8a9 100644
--- a/deidentify/evaluation/evaluator.py
+++ b/deidentify/evaluation/evaluator.py
@@ -2,7 +2,6 @@
 from typing import List
 
 import numpy as np
-import spacy
 from loguru import logger
 from sklearn.metrics import confusion_matrix
 from spacy.gold import biluo_tags_from_offsets
@@ -20,25 +19,23 @@ def flatten(lists):
 
 
 class Evaluator:
 
-    def __init__(self, gold: List[Document], predicted: List[Document], language='nl',
-                 tokenizer=None):
+    def __init__(self, gold: List[Document], predicted: List[Document], language='nl'):
         self.gold = gold
         self.predicted = predicted
         self.tags = sorted(list(set(ann.tag for doc in gold for ann in doc.annotations)))
 
-        if tokenizer:
-            self.tokenize = tokenizer
-        else:
-            if language not in self.supported_languages():
-                logger.warning(
-                    'Unknown language {} for evaluation. Fallback to "en"'.format(language))
-                language = 'en'
+        if language not in self.supported_languages():
+            logger.warning(
+                'Unknown language {} for evaluation. Fallback to "en"'.format(language))
+            language = 'en'
 
-            if language == 'nl':
-                self.tokenize = spacy.load('nl_core_news_sm')
-            else:
-                self.tokenize = spacy.load('en_core_web_sm')
+        if language == 'nl':
+            from deidentify.tokenizer.tokenizer_ons import TokenizerOns
+            self.tokenizer = TokenizerOns(disable=('tagger', 'parser', 'ner'))
+        else:
+            from deidentify.tokenizer.tokenizer_en import TokenizerEN
+            self.tokenizer = TokenizerEN(disable=('tagger', 'parser', 'ner'))
 
     @staticmethod
     def supported_languages():
@@ -108,7 +105,7 @@ def token_level_blind(self):
         return metric
 
     def token_annotations(self, doc, tag_blind=False, entity_tag=ENTITY_TAG):
-        parsed = self.tokenize(doc.text, disable=("tagger", "parser", "ner"))
+        parsed = self.tokenizer.parse_text(doc.text)
         entities = [(int(ann.start), int(ann.end), ann.tag) for ann in doc.annotations]
 
         biluo_tags = biluo_tags_from_offsets(parsed, entities)

From 9b80a513c4e559ce9e5749748e11e714c439f3c1 Mon Sep 17 00:00:00 2001
From: Jan Trienes
Date: Tue, 15 Dec 2020 13:29:31 +0100
Subject: [PATCH 4/9] Add additional infixes for the Ons tokenizer

We add `(`, `)` and `/` as infixes to accommodate some domain-specific ways
of writing.
---
 deidentify/tokenizer/tokenizer_ons.py |  5 +++++
 tests/tokenizer/test_tokenizer_ons.py | 14 ++++++++++++++
 2 files changed, 19 insertions(+)

diff --git a/deidentify/tokenizer/tokenizer_ons.py b/deidentify/tokenizer/tokenizer_ons.py
index ac4fe53..f2e04d9 100644
--- a/deidentify/tokenizer/tokenizer_ons.py
+++ b/deidentify/tokenizer/tokenizer_ons.py
@@ -75,6 +75,11 @@ def _metadata_sentence_segmentation(doc):
     NLP.tokenizer.add_special_case(case.lower(), [{ORTH: case.lower()}])
 
 
+infixes = NLP.Defaults.infixes + [r'\(', r'\)', r'/']
+infix_regex = spacy.util.compile_infix_regex(infixes)
+NLP.tokenizer.infix_finditer = infix_regex.finditer
+
+
 class TokenizerOns(Tokenizer):
 
     def parse_text(self, text: str) -> spacy.tokens.doc.Doc:
diff --git a/tests/tokenizer/test_tokenizer_ons.py b/tests/tokenizer/test_tokenizer_ons.py
index 78b0058..9b9cb77 100644
--- a/tests/tokenizer/test_tokenizer_ons.py
+++ b/tests/tokenizer/test_tokenizer_ons.py
@@ -32,3 +32,17 @@ def test_sentence_segmentation():
     assert [token.text for token in sents[1]] == ['Dit', 'is', 'een', 'zin', '.', '\n']
     assert [token.text for token in sents[2]] == ['=== Report: 1234 ===\n']
     assert [token.text for token in sents[3]] == ['Mw.', 'heeft', 'goed', 'gegeten', '.']
+
+
+def test_infix_split_on_parenthesis():
+    text = 'GRZ(12-12-2020).'
+    doc = tokenizer.parse_text(text)
+    tokens = [t.text for t in doc]
+    assert tokens == 'GRZ ( 12-12-2020 ) .'.split()
+
+
+def test_infix_split_on_forward_slash():
+    text = 'Groot/Kempers'
+    doc = tokenizer.parse_text(text)
+    tokens = [t.text for t in doc]
+    assert tokens == 'Groot / Kempers'.split()

From f2689e4d42173bf06fc3e2599d7956e1387bfe3e Mon Sep 17 00:00:00 2001
From: Jan Trienes
Date: Tue, 15 Dec 2020 13:44:07 +0100
Subject: [PATCH 5/9] Use stricter infix regex for forward slash (not if preceded by digit)

---
 deidentify/tokenizer/tokenizer_ons.py | 2 +-
 tests/tokenizer/test_tokenizer_ons.py | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/deidentify/tokenizer/tokenizer_ons.py b/deidentify/tokenizer/tokenizer_ons.py
index f2e04d9..c5a21c1 100644
--- a/deidentify/tokenizer/tokenizer_ons.py
+++ b/deidentify/tokenizer/tokenizer_ons.py
@@ -75,7 +75,7 @@ def _metadata_sentence_segmentation(doc):
     NLP.tokenizer.add_special_case(case.lower(), [{ORTH: case.lower()}])
 
 
-infixes = NLP.Defaults.infixes + [r'\(', r'\)', r'/']
+infixes = NLP.Defaults.infixes + [r'\(', r'\)', r'(?<=[\D])\/(?=[\D])']
 infix_regex = spacy.util.compile_infix_regex(infixes)
 NLP.tokenizer.infix_finditer = infix_regex.finditer
 
diff --git a/tests/tokenizer/test_tokenizer_ons.py b/tests/tokenizer/test_tokenizer_ons.py
index 9b9cb77..e2e1c51 100644
--- a/tests/tokenizer/test_tokenizer_ons.py
+++ b/tests/tokenizer/test_tokenizer_ons.py
@@ -46,3 +46,10 @@ def test_infix_split_on_forward_slash():
     doc = tokenizer.parse_text(text)
     tokens = [t.text for t in doc]
     assert tokens == 'Groot / Kempers'.split()
+
+
+def test_infix_split_on_forward_slash_exclude_dates():
+    text = '13/01/2020'
+    doc = tokenizer.parse_text(text)
+    tokens = [t.text for t in doc]
+    assert tokens == ['13/01/2020']

From c701fdace52c7d8e0933a8d6e5056278978560b9 Mon Sep 17 00:00:00 2001
From: Jan Trienes
Date: Tue, 15 Dec 2020 13:49:27 +0100
Subject: [PATCH 6/9] Remove obsolete tokenizer arg when initializing Evaluator in sig. test

---
 deidentify/evaluation/significance_testing.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/deidentify/evaluation/significance_testing.py b/deidentify/evaluation/significance_testing.py
index 9b68066..0132848 100644
--- a/deidentify/evaluation/significance_testing.py
+++ b/deidentify/evaluation/significance_testing.py
@@ -18,20 +18,16 @@ def _load_yaml(yaml_file):
     return config
 
 
-def noop():
-    return None
-
-
 def micro_f1(gold: List[Document], predicted: List[Document]):
-    return evaluator.Evaluator(gold, predicted, tokenizer=noop).entity_level().f_score()
+    return evaluator.Evaluator(gold, predicted).entity_level().f_score()
 
 
 def micro_precision(gold: List[Document], predicted: List[Document]):
-    return evaluator.Evaluator(gold, predicted, tokenizer=noop).entity_level().precision()
+    return evaluator.Evaluator(gold, predicted).entity_level().precision()
 
 
 def micro_recall(gold: List[Document], predicted: List[Document]):
-    return evaluator.Evaluator(gold, predicted, tokenizer=noop).entity_level().recall()
+    return evaluator.Evaluator(gold, predicted).entity_level().recall()
 
 
 class SignificanceReport:

From ce87f374e0c1b4889efa480d879f6f98363f8097 Mon Sep 17 00:00:00 2001
From: Jan Trienes
Date: Tue, 15 Dec 2020 15:10:13 +0100
Subject: [PATCH 7/9] Silence multiple occurrences of spaCy W030

---
 deidentify/evaluation/evaluator.py  |  9 +++++++++
 deidentify/methods/tagging_utils.py | 11 ++++++++++-
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/deidentify/evaluation/evaluator.py b/deidentify/evaluation/evaluator.py
index 18fe8a9..f89d2b2 100644
--- a/deidentify/evaluation/evaluator.py
+++ b/deidentify/evaluation/evaluator.py
@@ -1,3 +1,4 @@
+import warnings
 from collections import namedtuple
 from typing import List
 
@@ -12,6 +13,10 @@
 Entity = namedtuple('Entity', ['doc_name', 'start', 'end', 'tag'])
 ENTITY_TAG = 'ENT'
 
+# Silence spaCy warning regarding misaligned entity boundaries. It will show up multiple times
+# because the message changes with the input text.
+# More info on the warning: https://github.com/explosion/spaCy/issues/5727
+warnings.filterwarnings('ignore', message=r'.*W030.*')
 
 def flatten(lists):
     return [e for l in lists for e in l]
@@ -119,6 +124,10 @@ def token_annotations(self, doc, tag_blind=False, entity_tag=ENTITY_TAG):
                 #
                 # https://spacy.io/api/goldparse#biluo_tags_from_offsets
                 tags.append('O')
+                warnings.warn(
+                    'Some entities could not be aligned in the text. Use `spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)` to check the alignment.',
+                    UserWarning
+                )
             elif tag_blind:
                 tags.append(entity_tag)
             else:
diff --git a/deidentify/methods/tagging_utils.py b/deidentify/methods/tagging_utils.py
index 9d9745a..ecbb472 100644
--- a/deidentify/methods/tagging_utils.py
+++ b/deidentify/methods/tagging_utils.py
@@ -1,6 +1,6 @@
 """Utility methods to convert between standoff and BIO format.
 """
-
+import warnings
 from collections import defaultdict, namedtuple
 from typing import List, Tuple
 
@@ -15,6 +15,11 @@
 Token = namedtuple('Token', ['text', 'pos_tag', 'label', 'ner_tag'])
 ParsedDoc = namedtuple('ParsedDoc', ['spacy_doc', 'name', 'text'])
 
+# Silence spaCy warning regarding misaligned entity boundaries. It will show up multiple times
+# because the message changes with the input text.
+# More info on the warning: https://github.com/explosion/spaCy/issues/5727
+warnings.filterwarnings('ignore', message=r'.*W030.*')
+
 
 def standoff_to_sents(docs: List[Document],
                       tokenizer: Tokenizer,
@@ -220,6 +225,10 @@ def _doc_to_bio(parsed_doc: spacy.tokens.Doc, annotations: List[Annotation]):
             # Returned by spacy if token boundaries mismatch entity boundaries.
             # https://spacy.io/api/goldparse#biluo_tags_from_offsets
             tags.append('O')
+            warnings.warn(
+                'Some entities could not be aligned in the text. Use `spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)` to check the alignment.',
+                UserWarning
+            )
         else:
             tags.append(biluo_to_bio[tag[0:2]] + tag[2:])

From 2949f5aacae912bfdb5953243db57c6bdb9459da Mon Sep 17 00:00:00 2001
From: Jan Trienes
Date: Wed, 16 Dec 2020 14:40:00 +0100
Subject: [PATCH 8/9] Switch to entity-level metrics in model summary

---
 README.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 530b154..b0eb0e2 100644
--- a/README.md
+++ b/README.md
@@ -155,12 +155,12 @@ We provide a number of pre-trained models for the Dutch language. The models wer
 
 | Name | Tagger | Language | Dataset | F1* | Precision* | Recall* | Tags |
 |------|--------|----------|---------|----|-----------|--------|--------|
-| [DEDUCE (Menger et al., 2018)](https://www.sciencedirect.com/science/article/abs/pii/S0736585316307365)** | `DeduceTagger` | Dutch | NUT | 0.7564 | 0.9092 | 0.6476 | [8 PHI Tags](https://github.com/nedap/deidentify/blob/168ad67aec586263250900faaf5a756d3b8dd6fa/deidentify/methods/deduce/run_deduce.py#L17) |
-| [model_crf_ons_tuned-v0.1.0](https://github.com/nedap/deidentify/releases/tag/model_crf_ons_tuned-v0.1.0) | `CRFTagger` | Dutch | NUT | 0.9048 | 0.9632 | 0.8530 | [15 PHI Tags](https://github.com/nedap/deidentify/releases/tag/model_crf_ons_tuned-v0.1.0) |
-| [model_bilstmcrf_ons_fast-v0.1.0](https://github.com/nedap/deidentify/releases/tag/model_bilstmcrf_ons_fast-v0.1.0) | `FlairTagger` | Dutch | NUT | 0.9461 | 0.9591 | 0.9335 | [15 PHI Tags](https://github.com/nedap/deidentify/releases/tag/model_bilstmcrf_ons_fast-v0.1.0) |
-| [model_bilstmcrf_ons_large-v0.1.0](https://github.com/nedap/deidentify/releases/tag/model_bilstmcrf_ons_large-v0.1.0) | `FlairTagger` | Dutch | NUT | 0.9505 | 0.9683 | 0.9333 | [15 PHI Tags](https://github.com/nedap/deidentify/releases/tag/model_bilstmcrf_ons_large-v0.1.0) |
+| [DEDUCE (Menger et al., 2018)](https://www.sciencedirect.com/science/article/abs/pii/S0736585316307365)** | `DeduceTagger` | Dutch | NUT | 0.6649 | 0.8192 | 0.5595 | [8 PHI Tags](https://github.com/nedap/deidentify/blob/168ad67aec586263250900faaf5a756d3b8dd6fa/deidentify/methods/deduce/run_deduce.py#L17) |
+| [model_crf_ons_tuned-v0.2.0](https://github.com/nedap/deidentify/releases/tag/model_crf_ons_tuned-v0.2.0) | `CRFTagger` | Dutch | NUT | 0.8511 | 0.9337 | 0.7820 | [15 PHI Tags](https://github.com/nedap/deidentify/releases/tag/model_crf_ons_tuned-v0.2.0) |
+| [model_bilstmcrf_ons_fast-v0.2.0](https://github.com/nedap/deidentify/releases/tag/model_bilstmcrf_ons_fast-v0.2.0) | `FlairTagger` | Dutch | NUT | 0.8914 | 0.9101 | 0.8735 | [15 PHI Tags](https://github.com/nedap/deidentify/releases/tag/model_bilstmcrf_ons_fast-v0.2.0) |
+| [model_bilstmcrf_ons_large-v0.2.0](https://github.com/nedap/deidentify/releases/tag/model_bilstmcrf_ons_large-v0.2.0) | `FlairTagger` | Dutch | NUT | 0.8990 | 0.9240 | 0.8754 | [15 PHI Tags](https://github.com/nedap/deidentify/releases/tag/model_bilstmcrf_ons_large-v0.2.0) |
 
-*\*All scores are token-level (tag-blind) precision/recall/F1 obtained on the test portion of each dataset. For additional metrics, see the corresponding model release.*
+*\*All scores are micro-averaged entity-level precision/recall/F1 obtained on the test portion of each dataset. For additional metrics, see the corresponding model release.*
 
 *\*\*DEDUCE was developed on a dataset of psychiatric nursing notes and treatment plans. The numbers reported here were obtained by applying DEDUCE to our NUT dataset. For more information on the development of DEDUCE, see the paper by [Menger et al. (2018)](https://www.sciencedirect.com/science/article/abs/pii/S0736585316307365).*

From e12e42693e8c07f4a451c25a2b1d0c3e0641c343 Mon Sep 17 00:00:00 2001
From: Jan Trienes
Date: Wed, 16 Dec 2020 14:42:47 +0100
Subject: [PATCH 9/9] Update model identifiers in several places

---
 README.md                          | 6 +++---
 demo.py                            | 2 +-
 tests/taggers/test_crf_tagger.py   | 2 +-
 tests/taggers/test_flair_tagger.py | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 15e1293..2ae94a3 100644
--- a/README.md
+++ b/README.md
@@ -20,10 +20,10 @@ Create a new virtual environment with an environment manager of your choice. The
 pip install deidentify
 ```
 
-We use the spaCy tokenizer. For good compatibility with the pre-trained models, we recommend using the same spaCy tokenization models that were used at de-identification model training time:
+We use the spaCy tokenizer. For good compatibility with the pre-trained models, we recommend using the same version that we used to train the de-identification models.
 
 ```sh
-pip install https://github.com/explosion/spacy-models/releases/download/nl_core_news_sm-2.2.1/nl_core_news_sm-2.2.1.tar.gz#egg=nl_core_news_sm==2.2.1
+pip install https://github.com/explosion/spacy-models/releases/download/nl_core_news_sm-2.3.0/nl_core_news_sm-2.3.0.tar.gz#egg=nl_core_news_sm==2.3.0
 ```
 
 ### Example Usage
@@ -48,7 +48,7 @@ documents = [
 ]
 
 # Select downloaded model
-model = 'model_bilstmcrf_ons_fast-v0.1.0'
+model = 'model_bilstmcrf_ons_fast-v0.2.0'
 
 # Instantiate tokenizer
 tokenizer = TokenizerFactory().tokenizer(corpus='ons', disable=("tagger", "ner"))
diff --git a/demo.py b/demo.py
index 0355d2a..b045e9d 100644
--- a/demo.py
+++ b/demo.py
@@ -15,7 +15,7 @@
 ]
 
 # Select downloaded model
-model = 'model_bilstmcrf_ons_fast-v0.1.0'
+model = 'model_bilstmcrf_ons_fast-v0.2.0'
 
 # Instantiate tokenizer
 tokenizer = TokenizerFactory().tokenizer(corpus='ons', disable=("tagger", "ner"))
diff --git a/tests/taggers/test_crf_tagger.py b/tests/taggers/test_crf_tagger.py
index 5802405..b42f0fe 100644
--- a/tests/taggers/test_crf_tagger.py
+++ b/tests/taggers/test_crf_tagger.py
@@ -3,7 +3,7 @@
 from deidentify.tokenizer import TokenizerFactory
 
 tokenizer = TokenizerFactory().tokenizer(corpus='ons')
-tagger = CRFTagger(model='model_crf_ons_tuned-v0.1.0', tokenizer=tokenizer)
+tagger = CRFTagger(model='model_crf_ons_tuned-v0.2.0', tokenizer=tokenizer)
 
 
 def test_annotate():
diff --git a/tests/taggers/test_flair_tagger.py b/tests/taggers/test_flair_tagger.py
index e2f4faa..975555d 100644
--- a/tests/taggers/test_flair_tagger.py
+++ b/tests/taggers/test_flair_tagger.py
@@ -3,7 +3,7 @@
 from deidentify.tokenizer import TokenizerFactory
 
 tokenizer = TokenizerFactory().tokenizer(corpus='ons')
-tagger = FlairTagger(model='model_bilstmcrf_ons_fast-v0.1.0', tokenizer=tokenizer)
+tagger = FlairTagger(model='model_bilstmcrf_ons_fast-v0.2.0', tokenizer=tokenizer)
 
 
 def test_annotate():