diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py
index 7e988e7fdd73..967255574965 100644
--- a/src/transformers/convert_slow_tokenizer.py
+++ b/src/transformers/convert_slow_tokenizer.py
@@ -531,10 +531,12 @@ class BertGenerationConverter(SpmConverter):
 class PegasusConverter(SpmConverter):
     def vocab(self, proto):
         vocab = [
-            (self.original_tokenizer.pad_token, 0),
-            (self.original_tokenizer.eos_token, 0),
+            (self.original_tokenizer.pad_token, 0.0),
+            (self.original_tokenizer.eos_token, 0.0),
+            (self.original_tokenizer.mask_token_sent, 0.0),
+            (self.original_tokenizer.mask_token, 0.0),
         ]
-        vocab += [(f"unk_{i}", -100) for i in range(2, 2 + self.original_tokenizer.offset)]
+        vocab += [(f"<unk_{i}>", -100.0) for i in range(2, self.original_tokenizer.offset)]
         vocab += [(piece.piece, piece.score) for piece in proto.pieces[2:]]
         return vocab
 
@@ -543,13 +545,10 @@ def unk_id(self, proto):
 
     def post_processor(self):
         eos = self.original_tokenizer.eos_token
-        return processors.TemplateProcessing(
-            single=["$A", eos],
-            pair=["$A", "$B", eos],
-            special_tokens=[
-                (eos, self.original_tokenizer.eos_token_id),
-            ],
-        )
+        special_tokens = [
+            (eos, self.original_tokenizer.eos_token_id),
+        ]
+        return processors.TemplateProcessing(single=["$A", eos], pair=["$A", "$B", eos], special_tokens=special_tokens)
 
 
 class T5Converter(SpmConverter):
diff --git a/src/transformers/models/albert/tokenization_albert_fast.py b/src/transformers/models/albert/tokenization_albert_fast.py
index f538cc970188..abc305e4f142 100644
--- a/src/transformers/models/albert/tokenization_albert_fast.py
+++ b/src/transformers/models/albert/tokenization_albert_fast.py
@@ -71,10 +71,10 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
     """
-    Construct a "fast" ALBERT tokenizer (backed by HuggingFace's `tokenizers` library). Based on `SentencePiece
-    <https://github.com/google/sentencepiece>`__. This tokenizer inherits from
-    :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main methods. Users should refer to this
-    superclass for more information regarding those methods
+    Construct a "fast" ALBERT tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram
+    <https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models>`__. This tokenizer
+    inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods
 
     Args:
         vocab_file (:obj:`str`):
diff --git a/src/transformers/models/camembert/tokenization_camembert_fast.py b/src/transformers/models/camembert/tokenization_camembert_fast.py
index 55a609b3c23d..01735bdc4053 100644
--- a/src/transformers/models/camembert/tokenization_camembert_fast.py
+++ b/src/transformers/models/camembert/tokenization_camembert_fast.py
@@ -60,8 +60,8 @@ class CamembertTokenizerFast(PreTrainedTokenizerFast):
     """
     Construct a "fast" CamemBERT tokenizer (backed by HuggingFace's `tokenizers` library). Adapted from
-    :class:`~transformers.RobertaTokenizer` and :class:`~transformers.XLNetTokenizer`. Based on `SentencePiece
-    <https://github.com/google/sentencepiece>`__.
+    :class:`~transformers.RobertaTokenizer` and :class:`~transformers.XLNetTokenizer`. Based on `BPE
+    <https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models>`__.
 
     This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
     methods. Users should refer to this superclass for more information regarding those methods.
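
Reviewer note: with the PegasusConverter change above, the fast tokenizer's vocab should now start with <pad>, </s>, <mask_1>, <mask_2> at ids 0-3, followed by the <unk_2>..<unk_102> pretraining placeholders. A minimal sketch (not part of the patch) of how this layout could be sanity-checked, assuming the google/pegasus-large checkpoint is reachable:

    from transformers import PegasusTokenizer, PegasusTokenizerFast

    slow = PegasusTokenizer.from_pretrained("google/pegasus-large")
    fast = PegasusTokenizerFast.from_pretrained("google/pegasus-large")

    # ids 0-3 are the special tokens, 4-104 the <unk_i> fillers, 105+ the sentencepiece pieces
    print(fast.convert_ids_to_tokens([0, 1, 2, 3, 4, 104, 105]))

    # slow and fast tokenizers should agree on plain text
    text = "To ensure a smooth flow of bank resolutions."
    print(slow(text).input_ids)
    print(fast(text).input_ids)
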
diff --git a/src/transformers/models/mbart/tokenization_mbart_fast.py b/src/transformers/models/mbart/tokenization_mbart_fast.py
index 879c876afc41..c25b291c2a05 100644
--- a/src/transformers/models/mbart/tokenization_mbart_fast.py
+++ b/src/transformers/models/mbart/tokenization_mbart_fast.py
@@ -67,7 +67,8 @@ class MBartTokenizerFast(XLMRobertaTokenizerFast):
     """
-    Construct a "fast" MBART tokenizer (backed by HuggingFace's `tokenizers` library).
+    Construct a "fast" MBART tokenizer (backed by HuggingFace's `tokenizers` library). Based on `BPE
+    <https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models>`__.
 
     :class:`~transformers.MBartTokenizerFast` is a subclass of :class:`~transformers.XLMRobertaTokenizerFast` and adds
     a new :meth:`~transformers.MBartTokenizerFast.prepare_seq2seq_batch`.
diff --git a/src/transformers/models/pegasus/tokenization_pegasus.py b/src/transformers/models/pegasus/tokenization_pegasus.py
index 5728338276d2..099bdf3e7b31 100644
--- a/src/transformers/models/pegasus/tokenization_pegasus.py
+++ b/src/transformers/models/pegasus/tokenization_pegasus.py
@@ -12,11 +12,16 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Dict, List, Optional
+import os
+from shutil import copyfile
+from typing import Dict, List, Optional, Tuple
+
+import sentencepiece as spm
 
 from ...file_utils import add_start_docstrings
+from ...tokenization_utils import PreTrainedTokenizer
 from ...tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING, BatchEncoding
-from ..reformer.tokenization_reformer import ReformerTokenizer
+from ...utils import logging
 
 
 SPIECE_UNDERLINE = "▁"
@@ -32,31 +37,145 @@
 }
 
 
-class PegasusTokenizer(ReformerTokenizer):
+logger = logging.get_logger(__name__)
+
+
+class PegasusTokenizer(PreTrainedTokenizer):
     r"""
-    Construct a Pegasus tokenizer.
+    Construct a PEGASUS tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.
+
+    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    Users should refer to this superclass for more information regarding those methods.
 
-    :class:`~transformers.PegasusTokenizer` is identical to :class:`~transformers.ReformerTokenizer` and adds a new
-    :meth:`~transformers.PegasusTokenizer.prepare_seq2seq_batch`
+    Args:
+        vocab_file (:obj:`str`):
+            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+            contains the vocabulary necessary to instantiate a tokenizer.
+        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            The end of sequence token.
 
-    Refer to superclass :class:`~transformers.ReformerTokenizer` for usage examples and documentation concerning the
-    initialization parameters and other methods.
+            .. note::
+
+                When building a sequence using special tokens, this is not the token that is used for the end of
+                sequence. The token used is the :obj:`sep_token`.
+        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask_2>"`):
+            The token used for masking single token values. This is the token used when training this model with masked
+            language modeling (MLM). This is the token that the PEGASUS encoder will try to predict during pretraining.
+            It corresponds to `[MASK2]` in `PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive
+            Summarization <https://arxiv.org/pdf/1912.08777.pdf>`__.
+        mask_token_sent (:obj:`str`, `optional`, defaults to :obj:`"<mask_1>"`):
+            The token used for masking whole target sentences. This is the token used when training this model with gap
+            sentences generation (GSG). This is the sentence that the PEGASUS decoder will try to predict during
+            pretraining. It corresponds to `[MASK1]` in `PEGASUS: Pre-training with Extracted Gap-sentences for
+            Abstractive Summarization <https://arxiv.org/pdf/1912.08777.pdf>`__.
+        additional_special_tokens (:obj:`List[str]`, `optional`):
+            Additional special tokens used by the tokenizer. If no additional_special_tokens are provided <mask_2> and
+            <unk_2, 3> are used as additional special tokens corresponding to the `original PEGASUS
+            tokenizer
+            `__
+            that uses the tokens 2 - 104 only for pretraining
     """
-    offset = 103  # entries 2-104 are only used for pretraining
+    vocab_files_names = VOCAB_FILES_NAMES
+
+    offset = 103  # entries 2 - 104 are only used for pretraining
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file,
+        pad_token="<pad>",
+        eos_token="</s>",
+        unk_token="<unk>",
+        mask_token="<mask_2>",
+        mask_token_sent="<mask_1>",
+        additional_special_tokens=None,
+        **kwargs
+    ):
+        if additional_special_tokens is not None:
+            assert isinstance(
+                additional_special_tokens, list
+            ), f"additional_special_tokens should be of type {type(list)}, but is {type(additional_special_tokens)}"
+
+            additional_special_tokens_extended = (
+                ([mask_token_sent] + additional_special_tokens)
+                if mask_token_sent not in additional_special_tokens
+                else additional_special_tokens
+            )
+            # fill additional tokens with ..., in case not all additional tokens are already taken
+            additional_special_tokens_extended += [
+                f"<unk_{i}>" for i in range(len(additional_special_tokens_extended), self.offset - 1)
+            ]
+
+            if len(set(additional_special_tokens_extended)) != len(additional_special_tokens_extended):
+                raise ValueError(
+                    f"Please make sure that the provided additional_special_tokens do not contain an incorrectly shifted list of tokens. Found {additional_special_tokens_extended}."
+                )
+            additional_special_tokens = additional_special_tokens_extended
+        else:
+            additional_special_tokens = [mask_token_sent]
+            additional_special_tokens += [f"<unk_{i}>" for i in range(2, self.offset)]
 
-    def __init__(self, *args, pad_token="<pad>", **kwargs):
-        super().__init__(*args, **kwargs, pad_token="<pad>")
-        # Don't use reserved words added_token_encoder, added_tokens_decoder because of
-        # AssertionError: Non-consecutive added token '1' found. in from_pretrained
-        assert len(self.added_tokens_decoder) == 0
-        self.encoder: Dict[int, str] = {0: self.pad_token, 1: self.eos_token}
-        # entries 2-104 are only used for pretraining and called unk_2, ...unk_104
-        self.encoder.update({i: f"unk_{i}" for i in range(2, self.offset + 2)})
+        super().__init__(
+            eos_token=eos_token,
+            unk_token=unk_token,
+            mask_token=mask_token,
+            pad_token=pad_token,
+            mask_token_sent=mask_token_sent,
+            additional_special_tokens=additional_special_tokens,
+            **kwargs,
+        )
+        self.vocab_file = vocab_file
+        self.sp_model = spm.SentencePieceProcessor()
+        self.sp_model.Load(vocab_file)
+        self.mask_token_sent = mask_token_sent
+
+        # add special tokens to encoder dict
+        self.encoder: Dict[int, str] = {
+            0: self.pad_token,
+            1: self.eos_token,
+            2: self.mask_token_sent,
+            3: self.mask_token,
+        }
+        # entries 2-104 are only used for pretraining and called <mask_1>, <mask_2>, unk_2, ...unk_102
+        # mask_token_sent is already added to list -> so start at 1
+        self.encoder.update({i + 3: additional_special_tokens[i] for i in range(1, self.offset - 1)})
         self.decoder: Dict[str, int] = {v: k for k, v in self.encoder.items()}
 
+    @property
+    def vocab_size(self) -> int:
+        return len(self.sp_model) + self.offset
+
+    def get_vocab(self) -> Dict[str, int]:
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state["sp_model"] = None
+        return state
+
+    def __setstate__(self, d):
+        self.__dict__ = d
+        self.sp_model = spm.SentencePieceProcessor()
+        self.sp_model.Load(self.vocab_file)
+
+    def _tokenize(self, text, sample=False):
+        """Take as input a string and return a list of strings (tokens) for words/sub-words"""
+        if not sample:
+            pieces = self.sp_model.EncodeAsPieces(text)
+        else:
+            pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
+        return pieces
+
     def _convert_token_to_id(self, token: str) -> int:
         """ Converts a token (str) to an id using the vocab. """
         if token in self.decoder:
@@ -73,13 +192,13 @@ def _convert_id_to_token(self, index: int) -> str:
         elif index in self.added_tokens_encoder:
             return self.added_tokens_encoder[index]
         else:
-            # assert index > self.offset, f"cannot decode ids between 2 and {self.offset}. Got {index}"
             token = self.sp_model.IdToPiece(index - self.offset)
             return token
 
-    @property
-    def vocab_size(self) -> int:
-        return len(self.sp_model) + self.offset
+    def convert_tokens_to_string(self, tokens):
+        """ Converts a sequence of tokens (string) in a single string. """
""" + out_string = self.sp_model.decode_pieces(tokens) + return out_string def num_special_tokens_to_add(self, pair=False): """Just EOS""" @@ -88,7 +207,11 @@ def num_special_tokens_to_add(self, pair=False): def _special_token_mask(self, seq): all_special_ids = set(self.all_special_ids) # call it once instead of inside list comp all_special_ids.remove(self.unk_token_id) # is only sometimes special - assert all_special_ids == set([0, 1]) + + assert all_special_ids == set( + range(len(self.additional_special_tokens) + 3) + ), f"There should be 3 special tokens: mask_token, pad_token, and eos_token + {len(self.additional_special_tokens)} additional_special_tokens, but got {all_special_ids}" + return [1 if x in all_special_ids else 0 for x in seq] def get_special_tokens_mask( @@ -105,7 +228,7 @@ def get_special_tokens_mask( def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]: """ Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating - and adding special tokens. A Pegasus sequence has the following format, where ``X`` represents the sequence: + and adding special tokens. A PEGASUS sequence has the following format, where ``X`` represents the sequence: - single sequence: ``X `` - pair of sequences: ``A B `` (not intended use) @@ -156,3 +279,16 @@ def prepare_seq2seq_batch( labels: BatchEncoding = self(tgt_texts, **tokenizer_kwargs)["input_ids"] model_inputs["labels"] = labels return model_inputs + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file,) diff --git a/src/transformers/models/pegasus/tokenization_pegasus_fast.py b/src/transformers/models/pegasus/tokenization_pegasus_fast.py index e221eb4b54b0..c9b0d0763140 100644 --- a/src/transformers/models/pegasus/tokenization_pegasus_fast.py +++ b/src/transformers/models/pegasus/tokenization_pegasus_fast.py @@ -12,11 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Optional +""" Tokenization class for model PEGASUS.""" + + +import os +from shutil import copyfile +from typing import List, Optional, Tuple from ...file_utils import add_start_docstrings, is_sentencepiece_available from ...tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING, BatchEncoding -from ..reformer.tokenization_reformer_fast import ReformerTokenizerFast +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from ...utils import logging if is_sentencepiece_available(): @@ -25,6 +31,9 @@ PegasusTokenizer = None +logger = logging.get_logger(__name__) + + SPIECE_UNDERLINE = "▁" VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"} @@ -39,21 +48,112 @@ } -class PegasusTokenizerFast(ReformerTokenizerFast): +class PegasusTokenizerFast(PreTrainedTokenizerFast): + r""" + Construct a "fast" PEGASUS tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram + `__. 
+
+    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    Users should refer to this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (:obj:`str`):
+            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+            contains the vocabulary necessary to instantiate a tokenizer.
+        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            The end of sequence token.
+
+            .. note::
+
+                When building a sequence using special tokens, this is not the token that is used for the end of
+                sequence. The token used is the :obj:`sep_token`.
+        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask_2>"`):
+            The token used for masking single token values. This is the token used when training this model with masked
+            language modeling (MLM). This is the token that the PEGASUS encoder will try to predict during pretraining.
+            It corresponds to `[MASK2]` in `PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive
+            Summarization <https://arxiv.org/pdf/1912.08777.pdf>`__.
+        mask_token_sent (:obj:`str`, `optional`, defaults to :obj:`"<mask_1>"`):
+            The token used for masking whole target sentences. This is the token used when training this model with gap
+            sentences generation (GSG). This is the sentence that the PEGASUS decoder will try to predict during
+            pretraining. It corresponds to `[MASK1]` in `PEGASUS: Pre-training with Extracted Gap-sentences for
+            Abstractive Summarization <https://arxiv.org/pdf/1912.08777.pdf>`__.
+        additional_special_tokens (:obj:`List[str]`, `optional`):
+            Additional special tokens used by the tokenizer. If no additional_special_tokens are provided <mask_2> and
+            <unk_2, 3> are used as additional special tokens corresponding to the `original PEGASUS
+            tokenizer
+            `__
+            that uses the tokens 2 - 104 only for pretraining
+    """
 
     offset = 103  # entries 2-104 are only used for pretraining
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
     slow_tokenizer_class = PegasusTokenizer
+    model_input_names = ["attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file,
+        tokenizer_file=None,
+        pad_token="<pad>",
+        eos_token="</s>",
+        unk_token="<unk>",
+        mask_token="<mask_2>",
+        mask_token_sent="<mask_1>",
+        additional_special_tokens=None,
+        **kwargs
+    ):
+        if additional_special_tokens is not None:
+            assert isinstance(
+                additional_special_tokens, list
+            ), f"additional_special_tokens should be of type {type(list)}, but is {type(additional_special_tokens)}"
+
+            additional_special_tokens_extended = (
+                ([mask_token_sent] + additional_special_tokens)
+                if mask_token_sent not in additional_special_tokens
+                else additional_special_tokens
+            )
+            # fill additional tokens with ..., in case not all additional tokens are already taken
+            additional_special_tokens_extended += [
+                f"<unk_{i}>" for i in range(len(additional_special_tokens_extended), self.offset - 1)
+            ]
+
+            if len(set(additional_special_tokens_extended)) != len(additional_special_tokens_extended):
+                raise ValueError(
+                    f"Please make sure that the provided additional_special_tokens do not contain an incorrectly shifted list of tokens. Found {additional_special_tokens_extended}."
+                )
+            additional_special_tokens = additional_special_tokens_extended
+        else:
+            additional_special_tokens = [mask_token_sent]
+            additional_special_tokens += [f"<unk_{i}>" for i in range(2, self.offset)]
+
+        super().__init__(
+            vocab_file,
+            tokenizer_file=tokenizer_file,
+            pad_token=pad_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            mask_token=mask_token,
+            mask_token_sent=mask_token_sent,
+            additional_special_tokens=additional_special_tokens,
+            **kwargs,
+        )
 
-    # def num_special_tokens_to_add(self, pair=False):
-    #     """Just EOS"""
-    #     return 1
+        self.vocab_file = vocab_file
 
     def _special_token_mask(self, seq):
         all_special_ids = set(self.all_special_ids)  # call it once instead of inside list comp
         all_special_ids.remove(self.unk_token_id)  # <unk> is only sometimes special
-        assert all_special_ids == set([0, 1])
+
+        assert all_special_ids == set(
+            range(len(self.additional_special_tokens) + 3)
+        ), f"There should be 3 special tokens: mask_token, pad_token, and eos_token + {len(self.additional_special_tokens)} additional_special_tokens, but got {all_special_ids}"
+
         return [1 if x in all_special_ids else 0 for x in seq]
 
     def get_special_tokens_mask(
@@ -117,3 +217,16 @@ def prepare_seq2seq_batch(
         labels: BatchEncoding = self(tgt_texts, **tokenizer_kwargs)["input_ids"]
         model_inputs["labels"] = labels
         return model_inputs
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        if not os.path.isdir(save_directory):
+            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+
+        return (out_vocab_file,)
diff --git a/src/transformers/models/reformer/tokenization_reformer_fast.py b/src/transformers/models/reformer/tokenization_reformer_fast.py
index 1a3d58f84d40..21deced05d31 100644
--- a/src/transformers/models/reformer/tokenization_reformer_fast.py
+++ b/src/transformers/models/reformer/tokenization_reformer_fast.py
@@ -64,8 +64,8 @@ class ReformerTokenizerFast(PreTrainedTokenizerFast):
     """
-    Construct a "fast" Reformer tokenizer (backed by HuggingFace's `tokenizers` library). Based on `SentencePiece
-    <https://github.com/google/sentencepiece>`__ .
+    Construct a "fast" Reformer tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram
+    <https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models>`__.
 
     This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
     methods. Users should refer to this superclass for more information regarding those methods.
diff --git a/src/transformers/models/t5/tokenization_t5_fast.py b/src/transformers/models/t5/tokenization_t5_fast.py
index 5b259ea087d0..e4ad4a306643 100644
--- a/src/transformers/models/t5/tokenization_t5_fast.py
+++ b/src/transformers/models/t5/tokenization_t5_fast.py
@@ -75,8 +75,8 @@ class T5TokenizerFast(PreTrainedTokenizerFast):
     """
-    Construct a "fast" T5 tokenizer (backed by HuggingFace's `tokenizers` library). Based on `SentencePiece
-    <https://github.com/google/sentencepiece>`__ .
+    Construct a "fast" T5 tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram
+    <https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models>`__.
 
     This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
     methods. Users should refer to this superclass for more information regarding those methods.
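
Reviewer note: a small usage sketch (not part of the patch) of the behaviour the new slow and fast PEGASUS tokenizers are expected to share, assuming the google/pegasus-large checkpoint; the expected ids below come from the updated tests further down:

    from transformers import PegasusTokenizer

    tok = PegasusTokenizer.from_pretrained("google/pegasus-large")
    print(tok.mask_token, tok.mask_token_id)     # "<mask_2>", 3
    print(tok.additional_special_tokens[:3])     # ["<mask_1>", "<unk_2>", "<unk_3>"]

    ids = tok("<mask_1> To ensure a <mask_2> flow of bank resolutions.").input_ids
    # expected: [2, 413, 615, 114, 3, 1971, 113, 1679, 10710, 107, 1]
    # i.e. <mask_1> -> 2, <mask_2> -> 3, and the trailing </s> -> 1
    print(ids)
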
diff --git a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py
index 8a7b15807749..b8c9f0918e46 100644
--- a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py
+++ b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py
@@ -66,8 +66,8 @@ class XLMRobertaTokenizerFast(PreTrainedTokenizerFast):
     """
     Construct a "fast" XLM-RoBERTa tokenizer (backed by HuggingFace's `tokenizers` library). Adapted from
-    :class:`~transfomers.RobertaTokenizer` and class:`~transfomers.XLNetTokenizer`. Based on `SentencePiece
-    <https://github.com/google/sentencepiece>`__.
+    :class:`~transfomers.RobertaTokenizer` and class:`~transfomers.XLNetTokenizer`. Based on `BPE
+    <https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models>`__.
 
     This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
     methods. Users should refer to this superclass for more information regarding those methods.
diff --git a/src/transformers/models/xlnet/tokenization_xlnet_fast.py b/src/transformers/models/xlnet/tokenization_xlnet_fast.py
index 60e1010dae2c..84af74070d4c 100644
--- a/src/transformers/models/xlnet/tokenization_xlnet_fast.py
+++ b/src/transformers/models/xlnet/tokenization_xlnet_fast.py
@@ -62,8 +62,8 @@ class XLNetTokenizerFast(PreTrainedTokenizerFast):
     """
-    Construct a "fast" XLNet tokenizer (backed by HuggingFace's `tokenizers` library). Based on `SentencePiece
-    <https://github.com/google/sentencepiece>`__.
+    Construct a "fast" XLNet tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram
+    <https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models>`__.
 
     This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
     methods. Users should refer to this superclass for more information regarding those methods.
diff --git a/tests/test_tokenization_pegasus.py b/tests/test_tokenization_pegasus.py
index ad26075da69f..7542f590c71e 100644
--- a/tests/test_tokenization_pegasus.py
+++ b/tests/test_tokenization_pegasus.py
@@ -26,21 +26,34 @@ def setUp(self):
         tokenizer.save_pretrained(self.tmpdirname)
 
     @cached_property
-    def pegasus_large_tokenizer(self):
+    def _large_tokenizer(self):
         return PegasusTokenizer.from_pretrained("google/pegasus-large")
 
-    @unittest.skip("add_tokens does not work yet")
-    def test_swap_special_token(self):
-        pass
-
     def get_tokenizer(self, **kwargs) -> PegasusTokenizer:
         return PegasusTokenizer.from_pretrained(self.tmpdirname, **kwargs)
 
     def get_input_output_texts(self, tokenizer):
         return ("This is a test", "This is a test")
 
-    def test_pegasus_large_tokenizer_settings(self):
-        tokenizer = self.pegasus_large_tokenizer
+    def test_mask_tokens_rust_pegasus(self):
+        rust_tokenizer = self.rust_tokenizer_class.from_pretrained(self.tmpdirname)
+        py_tokenizer = self.tokenizer_class.from_pretrained(self.tmpdirname)
+        raw_input_str = "Let's see which <unk> is the better <unk_3> one <mask_1> It seems like this <mask_2> was important </s> <pad>"
+        rust_ids = rust_tokenizer([raw_input_str], return_tensors=None, add_special_tokens=False).input_ids[0]
+        py_ids = py_tokenizer([raw_input_str], return_tensors=None, add_special_tokens=False).input_ids[0]
+        # TODO: (Thom, Patrick) - this fails because the rust tokenizer does not know about the <mask_1>, <mask_2>, and those <unk_2, 3> yet
+        self.assertListEqual(py_ids, rust_ids)
+
+    def test_large_mask_tokens(self):
+        tokenizer = self._large_tokenizer
+        # <mask_1> masks whole sentence while <mask_2> masks single word
+        raw_input_str = "<mask_1> To ensure a <mask_2> flow of bank resolutions."
+        desired_result = [2, 413, 615, 114, 3, 1971, 113, 1679, 10710, 107, 1]
+        ids = tokenizer([raw_input_str], return_tensors=None).input_ids[0]
+        self.assertListEqual(desired_result, ids)
+
+    def test_large_tokenizer_settings(self):
+        tokenizer = self._large_tokenizer
         # The tracebacks for the following asserts are **better** without messages or self.assertEqual
         assert tokenizer.vocab_size == 96103
         assert tokenizer.pad_token_id == 0
@@ -48,20 +61,18 @@ def test_pegasus_large_tokenizer_settings(self):
         assert tokenizer.offset == 103
         assert tokenizer.unk_token_id == tokenizer.offset + 2 == 105
         assert tokenizer.unk_token == "<unk>"
-        assert tokenizer.mask_token is None
-        assert tokenizer.mask_token_id is None
         assert tokenizer.model_max_length == 1024
         raw_input_str = "To ensure a smooth flow of bank resolutions."
         desired_result = [413, 615, 114, 2291, 1971, 113, 1679, 10710, 107, 1]
         ids = tokenizer([raw_input_str], return_tensors=None).input_ids[0]
         self.assertListEqual(desired_result, ids)
-        assert tokenizer.convert_ids_to_tokens([0, 1, 2]) == ["<pad>", "</s>", "unk_2"]
+        assert tokenizer.convert_ids_to_tokens([0, 1, 2, 3]) == ["<pad>", "</s>", "<mask_1>", "<mask_2>"]
 
     @require_torch
-    def test_pegasus_large_seq2seq_truncation(self):
+    def test_large_seq2seq_truncation(self):
         src_texts = ["This is going to be way too long." * 150, "short example"]
         tgt_texts = ["not super long but more than 5 tokens", "tiny"]
-        batch = self.pegasus_large_tokenizer.prepare_seq2seq_batch(
+        batch = self._large_tokenizer.prepare_seq2seq_batch(
             src_texts, tgt_texts=tgt_texts, max_target_length=5, return_tensors="pt"
         )
         assert batch.input_ids.shape == (2, 1024)
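
Reviewer note: a short sketch (not part of the patch) of the seq2seq batching behaviour that test_large_seq2seq_truncation exercises, again assuming google/pegasus-large and a PyTorch install:

    from transformers import PegasusTokenizer

    tok = PegasusTokenizer.from_pretrained("google/pegasus-large")
    batch = tok.prepare_seq2seq_batch(
        src_texts=["This is going to be way too long." * 150, "short example"],
        tgt_texts=["not super long but more than 5 tokens", "tiny"],
        max_target_length=5,
        return_tensors="pt",
    )
    print(batch.input_ids.shape)  # torch.Size([2, 1024]) - sources truncated/padded to model_max_length
    print(batch.labels.shape)     # expected torch.Size([2, 5]) - targets capped at max_target_length
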