diff --git a/src/transformers/models/markuplm/tokenization_markuplm.py b/src/transformers/models/markuplm/tokenization_markuplm.py
index 0fa1262917d1..8cb5f6687276 100644
--- a/src/transformers/models/markuplm/tokenization_markuplm.py
+++ b/src/transformers/models/markuplm/tokenization_markuplm.py
@@ -48,7 +48,6 @@
 VOCAB_FILES_NAMES = {
     "vocab_file": "vocab.json",
     "merges_file": "merges.txt",
-    "tags_dict": "tags_dict.json",
 }
 
 PRETRAINED_VOCAB_FILES_MAP = {
@@ -60,10 +59,6 @@
         "microsoft/markuplm-base": "https://huggingface.co/microsoft/markuplm-base/resolve/main/merges.txt",
         "microsoft/markuplm-large": "https://huggingface.co/microsoft/markuplm-large/resolve/main/merges.txt",
     },
-    "tags_dict": {
-        "microsoft/markuplm-base": "https://huggingface.co/microsoft/markuplm-base/resolve/main/tags_dict.json",
-        "microsoft/markuplm-large": "https://huggingface.co/microsoft/markuplm-large/resolve/main/tags_dict.json",
-    },
 }
 
 
@@ -261,8 +256,8 @@ def __init__(
         with open(vocab_file, encoding="utf-8") as vocab_handle:
             self.encoder = json.load(vocab_handle)
 
-        with open(tags_dict, encoding="utf-8") as tags_dict_handle:
-            self.tags_dict = json.load(tags_dict_handle)
+
+        self.tags_dict = tags_dict
         self.decoder = {v: k for k, v in self.encoder.items()}
         self.errors = errors  # how to handle errors in decoding
         self.byte_encoder = bytes_to_unicode()
@@ -425,9 +420,6 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] =
         merge_file = os.path.join(
             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
         )
-        tags_dict_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["tags_dict"]
-        )
 
         # save vocab_file
         with open(vocab_file, "w", encoding="utf-8") as f:
@@ -447,11 +439,7 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] =
             writer.write(" ".join(bpe_tokens) + "\n")
             index += 1
 
-        # save tags_dict_file
-        with open(tags_dict_file, "w", encoding="utf-8") as f:
-            f.write(json.dumps(self.tags_dict, ensure_ascii=False))
-
-        return vocab_file, merge_file, tags_dict_file
+        return vocab_file, merge_file
 
     def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
         add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
diff --git a/src/transformers/models/markuplm/tokenization_markuplm_fast.py b/src/transformers/models/markuplm/tokenization_markuplm_fast.py
index d8a26d9e0502..f5583d54c734 100644
--- a/src/transformers/models/markuplm/tokenization_markuplm_fast.py
+++ b/src/transformers/models/markuplm/tokenization_markuplm_fast.py
@@ -53,7 +53,6 @@
     "vocab_file": "vocab.json",
     "merges_file": "merges.txt",
     "tokenizer_file": "tokenizer.json",
-    "tags_dict": "tags_dict.json",
 }
 
 PRETRAINED_VOCAB_FILES_MAP = {
@@ -65,10 +64,6 @@
         "microsoft/markuplm-base": "https://huggingface.co/microsoft/markuplm-base/resolve/main/merges.txt",
         "microsoft/markuplm-large": "https://huggingface.co/microsoft/markuplm-large/resolve/main/merges.txt",
     },
-    "tags_dict": {
-        "microsoft/markuplm-base": "https://huggingface.co/microsoft/markuplm-base/resolve/main/tags_dict.json",
-        "microsoft/markuplm-large": "https://huggingface.co/microsoft/markuplm-large/resolve/main/tags_dict.json",
-    },
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
@@ -179,8 +174,8 @@ def __init__(
             self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
             self.add_prefix_space = add_prefix_space
 
-        with open(tags_dict, encoding="utf-8") as tags_dict_handle:
-            self.tags_dict = json.load(tags_dict_handle)
+
+        self.tags_dict = tags_dict
 
         tokenizer_component = "post_processor"
         tokenizer_component_instance = getattr(self.backend_tokenizer, tokenizer_component, None)
@@ -727,8 +722,4 @@ def create_token_type_ids_from_sequences(
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         files = self._tokenizer.model.save(save_directory, name=filename_prefix)
 
-        tags_dict_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["tags_dict"]
-        )
-
-        return tuple(files) + (tags_dict_file,)
+        return tuple(files)
diff --git a/tests/test_tokenization_markuplm.py b/tests/test_tokenization_markuplm.py
index febb973247d5..4e009ca0c8e6 100644
--- a/tests/test_tokenization_markuplm.py
+++ b/tests/test_tokenization_markuplm.py
@@ -71,18 +71,19 @@ def setUp(self):
         ]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
-        tags_dict = {"a": 0, "abbr": 1, "acronym": 2, "address": 3}
+        self.tags_dict = {"a": 0, "abbr": 1, "acronym": 2, "address": 3}
         self.special_tokens_map = {"unk_token": "<unk>"}
 
         self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
         self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        self.tags_dict = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["tags_dict"])
+        self.tokenizer_config_file = os.path.join(self.tmpdirname, "tokenizer_config.json")
+
         with open(self.vocab_file, "w", encoding="utf-8") as fp:
             fp.write(json.dumps(vocab_tokens) + "\n")
         with open(self.merges_file, "w", encoding="utf-8") as fp:
             fp.write("\n".join(merges))
-        with open(self.tags_dict, "w", encoding="utf-8") as fp:
-            fp.write(json.dumps(tags_dict) + "\n")
+        with open(self.tokenizer_config_file, "w", encoding="utf-8") as fp:
+            fp.write(json.dumps({"tags_dict": self.tags_dict}))
 
     # def get_clean_sequence(self, tokenizer):
     #     html_string = " hello world "