diff --git a/.circleci/config.yml b/.circleci/config.yml index aedbe4f55398..f3b13d26eb52 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -3,6 +3,22 @@ orbs: gcp-gke: circleci/gcp-gke@1.0.4 go: circleci/go@1.3.0 +commands: + skip-job-on-doc-only-changes: + description: "Do not continue this job and exit with success for PRs with only doc changes" + steps: + + - run: + name: docs-only changes skip check + command: | + if git diff --name-only << pipeline.git.base_revision >>...<< pipeline.git.revision >> | egrep -qv '\.(md|rst)$' + then + echo "Non-docs were modified in this PR, proceeding normally" + else + echo "Only docs were modified in this PR, quitting this job" + circleci step halt + fi + # TPU REFERENCES references: checkout_ml_testing: &checkout_ml_testing @@ -72,6 +88,7 @@ jobs: parallelism: 1 steps: - checkout + - skip-job-on-doc-only-changes - restore_cache: keys: - v0.4-torch_and_tf-{{ checksum "setup.py" }} @@ -98,6 +115,7 @@ jobs: parallelism: 1 steps: - checkout + - skip-job-on-doc-only-changes - restore_cache: keys: - v0.4-torch-{{ checksum "setup.py" }} @@ -124,6 +142,7 @@ jobs: parallelism: 1 steps: - checkout + - skip-job-on-doc-only-changes - restore_cache: keys: - v0.4-tf-{{ checksum "setup.py" }} @@ -150,6 +169,7 @@ jobs: parallelism: 1 steps: - checkout + - skip-job-on-doc-only-changes - restore_cache: keys: - v0.4-flax-{{ checksum "setup.py" }} @@ -176,6 +196,7 @@ jobs: parallelism: 1 steps: - checkout + - skip-job-on-doc-only-changes - restore_cache: keys: - v0.4-torch-{{ checksum "setup.py" }} @@ -202,6 +223,7 @@ jobs: parallelism: 1 steps: - checkout + - skip-job-on-doc-only-changes - restore_cache: keys: - v0.4-tf-{{ checksum "setup.py" }} @@ -226,6 +248,7 @@ jobs: RUN_CUSTOM_TOKENIZERS: yes steps: - checkout + - skip-job-on-doc-only-changes - restore_cache: keys: - v0.4-custom_tokenizers-{{ checksum "setup.py" }} @@ -253,6 +276,7 @@ jobs: parallelism: 1 steps: - checkout + - skip-job-on-doc-only-changes - restore_cache: keys: - v0.4-torch_examples-{{ checksum "setup.py" }} diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 098f71f44020..64293ef7a203 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -125,7 +125,7 @@ Follow these steps to start contributing: $ git checkout -b a-descriptive-name-for-my-changes ``` - **do not** work on the `master` branch. + **Do not** work on the `master` branch. 4. Set up a development environment by running the following command in a virtual environment: diff --git a/docs/source/preprocessing.rst b/docs/source/preprocessing.rst index 10e27814c052..a684f8aaeb2c 100644 --- a/docs/source/preprocessing.rst +++ b/docs/source/preprocessing.rst @@ -2,7 +2,6 @@ Preprocessing data ======================================================================================================================= In this tutorial, we'll explore how to preprocess your data using 🤗 Transformers. The main tool for this is what we - call a :doc:`tokenizer `. You can build one using the tokenizer class associated to the model you would like to use, or directly with the :class:`~transformers.AutoTokenizer` class. @@ -52,7 +51,7 @@ The tokenizer can decode a list of token ids in a proper sentence: "[CLS] Hello, I'm a single sentence! [SEP]" As you can see, the tokenizer automatically added some special tokens that the model expects. 
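(Aside, not part of the patch: a minimal sketch of the tokenizer round-trip the preprocessing docs above describe, assuming a working `transformers` install and access to the `bert-base-cased` checkpoint.)

```python
from transformers import AutoTokenizer

# Same tokenizer as in the docs above.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

encoded = tokenizer("Hello, I'm a single sentence!")
print(encoded["input_ids"])                    # ids include the [CLS]/[SEP] special tokens
print(tokenizer.decode(encoded["input_ids"]))  # "[CLS] Hello, I'm a single sentence! [SEP]"

# Opt out of the automatic special tokens (only advised if you add them yourself).
plain = tokenizer("Hello, I'm a single sentence!", add_special_tokens=False)
print(tokenizer.decode(plain["input_ids"]))    # "Hello, I'm a single sentence!"
```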
Not all models need -special tokens; for instance, if we had used` gtp2-medium` instead of `bert-base-cased` to create our tokenizer, we +special tokens; for instance, if we had used `gpt2-medium` instead of `bert-base-cased` to create our tokenizer, we would have seen the same sentence as the original one here. You can disable this behavior (which is only advised if you have added those special tokens yourself) by passing ``add_special_tokens=False``. diff --git a/docs/source/quicktour.rst b/docs/source/quicktour.rst index 5b0ca708177f..9d1444e2d6c0 100644 --- a/docs/source/quicktour.rst +++ b/docs/source/quicktour.rst @@ -240,7 +240,9 @@ activations of the model. [ 0.08181786, -0.04179301]], dtype=float32)>,) The model can return more than just the final activations, which is why the output is a tuple. Here we only asked for -the final activations, so we get a tuple with one element. .. note:: +the final activations, so we get a tuple with one element. + +.. note:: All 🤗 Transformers models (PyTorch or TensorFlow) return the activations of the model *before* the final activation function (like SoftMax) since this final activation function is often fused with the loss. diff --git a/docs/source/serialization.rst b/docs/source/serialization.rst index 670a6a3a9db8..e8a646006a08 100644 --- a/docs/source/serialization.rst +++ b/docs/source/serialization.rst @@ -70,8 +70,8 @@ inference. optimizations afterwards. .. note:: - For more information about the optimizations enabled by ONNXRuntime, please have a look at the (`ONNXRuntime Github - `_) + For more information about the optimizations enabled by ONNXRuntime, please have a look at the `ONNXRuntime Github + `_. Quantization ----------------------------------------------------------------------------------------------------------------------- diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py index 0b1f14c74700..b1b3408acb6a 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ b/src/transformers/convert_slow_tokenizer.py @@ -547,10 +547,12 @@ class BertGenerationConverter(SpmConverter): class PegasusConverter(SpmConverter): def vocab(self, proto): vocab = [ - (self.original_tokenizer.pad_token, 0), - (self.original_tokenizer.eos_token, 0), + (self.original_tokenizer.pad_token, 0.0), + (self.original_tokenizer.eos_token, 0.0), + (self.original_tokenizer.mask_token_sent, 0.0), + (self.original_tokenizer.mask_token, 0.0), ] - vocab += [(f"unk_{i}", -100) for i in range(2, 2 + self.original_tokenizer.offset)] + vocab += [(f"", -100.0) for i in range(2, self.original_tokenizer.offset)] vocab += [(piece.piece, piece.score) for piece in proto.pieces[2:]] return vocab @@ -559,13 +561,10 @@ def unk_id(self, proto): def post_processor(self): eos = self.original_tokenizer.eos_token - return processors.TemplateProcessing( - single=["$A", eos], - pair=["$A", "$B", eos], - special_tokens=[ - (eos, self.original_tokenizer.eos_token_id), - ], - ) + special_tokens = [ + (eos, self.original_tokenizer.eos_token_id), + ] + return processors.TemplateProcessing(single=["$A", eos], pair=["$A", "$B", eos], special_tokens=special_tokens) class T5Converter(SpmConverter): diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py index 6ad0a6ccd210..d49c661513de 100644 --- a/src/transformers/data/data_collator.py +++ b/src/transformers/data/data_collator.py @@ -20,14 +20,14 @@ def default_data_collator(features: List[InputDataClass]) -> Dict[str, torch.Tensor]: """ - Very simple 
data collator that simply collates batches of dict-like objects and erforms special handling for + Very simple data collator that simply collates batches of dict-like objects and performs special handling for potential keys named: - ``label``: handles a single value (int or float) per object - ``label_ids``: handles a list of values per object - Des not do any additional preprocessing: property names of the input object will be used as corresponding inputs to - the model. See glue and ner for example of how it's useful. + Does not do any additional preprocessing: property names of the input object will be used as corresponding inputs + to the model. See glue and ner for example of how it's useful. """ # In this function we'll make the assumption that all `features` in the batch diff --git a/src/transformers/generation_utils.py b/src/transformers/generation_utils.py index eb6999e868a6..254c06792f38 100644 --- a/src/transformers/generation_utils.py +++ b/src/transformers/generation_utils.py @@ -315,6 +315,7 @@ def generate( decoder_start_token_id: Optional[int] = None, use_cache: Optional[bool] = None, prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, + diverse_sequences: Optional[bool] = False, **model_kwargs ) -> torch.LongTensor: r""" @@ -388,6 +389,9 @@ def generate( conditioned on the previously generated tokens :obj:`inputs_ids` and the batch ID :obj:`batch_id`. This argument is useful for constrained generation conditioned on the prefix, as described in `Autoregressive Entity Retrieval `__. + diverse_sequences (:obj:`bool`, `optional`, defaults to False): + if :obj:`True`, greedy search or sampling method can generate sequences with each sequence started with + one of the :obj:`num_return_sequences` different tokens predicted. model_kwargs: Additional model specific kwargs will be forwarded to the :obj:`forward` function of the model. If the model is an Encoder-Decoder model, encoder specific kwargs should not be prefixed and decoder specific @@ -499,6 +503,9 @@ def generate( # set model_kwargs model_kwargs["use_cache"] = use_cache + # set num_return_sequences + model_kwargs["num_return_sequences"] = num_return_sequences + # get distribution pre_processing samplers logits_processor = self._get_logits_processor( repetition_penalty=repetition_penalty, @@ -511,7 +518,7 @@ def generate( ) if is_greedy_gen_mode: - if num_return_sequences > 1: + if (num_return_sequences > 1) and not diverse_sequences: raise ValueError( f"num_return_sequences has to be 1, but is {num_return_sequences} when doing greedy search." 
) @@ -523,22 +530,26 @@ def generate( max_length=max_length, pad_token_id=pad_token_id, eos_token_id=eos_token_id, + diverse_sequences=diverse_sequences, **model_kwargs, ) elif is_sample_gen_mode: # get probability distribution warper logits_warper = self._get_logits_warper( - top_k=top_k, top_p=top_p, temperature=temperature, num_beams=num_beams - ) - - # expand input_ids with `num_return_sequences` additional sequences per batch - input_ids, model_kwargs = self._expand_inputs_for_generation( - input_ids, - expand_size=num_return_sequences, - is_encoder_decoder=self.config.is_encoder_decoder, - **model_kwargs, + top_k=top_k, + top_p=top_p, + temperature=temperature, + num_beams=num_beams, ) + if not diverse_sequences: + # expand input_ids with `num_return_sequences` additional sequences per batch + input_ids, model_kwargs = self._expand_inputs_for_generation( + input_ids, + expand_size=num_return_sequences, + is_encoder_decoder=self.config.is_encoder_decoder, + **model_kwargs, + ) # sample return self.sample( @@ -548,6 +559,7 @@ def generate( max_length=max_length, pad_token_id=pad_token_id, eos_token_id=eos_token_id, + diverse_sequences=diverse_sequences, **model_kwargs, ) @@ -626,6 +638,7 @@ def greedy_search( max_length: Optional[int] = None, pad_token_id: Optional[int] = None, eos_token_id: Optional[int] = None, + diverse_sequences: Optional[bool] = False, **model_kwargs ): r""" @@ -646,6 +659,9 @@ def greedy_search( The id of the `padding` token. eos_token_id (:obj:`int`, `optional`): The id of the `end-of-sequence` token. + diverse_sequences (:obj:`bool`, `optional`, defaults to False): + if :obj:`True` the method can generate sequences with each sequence started with one of the top + :obj:`num_return_sequences` tokens predicted. model_kwargs: Additional model specific keyword arguments will be forwarded to the :obj:`forward` function of the model. If model is an encoder-decoder model the kwargs should include :obj:`encoder_outputs`. @@ -693,6 +709,12 @@ def greedy_search( sequence_lengths, unfinished_sequences, cur_len = self._init_sequence_length_for_generation( input_ids, max_length ) + if ("num_return_sequences" in model_kwargs) and diverse_sequences: + num_return_sequences = model_kwargs["num_return_sequences"] + starting_length = cur_len + else: + num_return_sequences = 1 + starting_length = None while cur_len < max_length: # prepare model inputs @@ -705,14 +727,32 @@ def greedy_search( # pre-process distribution scores = logits_processor(input_ids, next_token_logits) - # argmax - next_tokens = torch.argmax(scores, dim=-1) - - # add code that transfomers next_tokens to tokens_to_add + # argmax or top-num_return_sequences tokens + if (cur_len == starting_length) and (num_return_sequences > 1) and diverse_sequences: + top_num = int(min(num_return_sequences, scores.size(-1))) # Safety check + next_tokens = torch.topk(scores, top_num).indices.reshape(-1) + # Once we got next_tokens, we have expand metadata + unfinished_sequences = unfinished_sequences.repeat_interleave(num_return_sequences, dim=0) + sequence_lengths = sequence_lengths.repeat_interleave(num_return_sequences, dim=0) + else: + next_tokens = torch.argmax(scores, dim=-1) + + # add code that transforms next_tokens (to tokens_to_add) if eos_token_id is not None: assert pad_token_id is not None, "If eos_token_id is defined, make sure that pad_token_id is defined." 
next_tokens = next_tokens * unfinished_sequences + (pad_token_id) * (1 - unfinished_sequences) + if (cur_len == starting_length) and (num_return_sequences > 1) and diverse_sequences: + # expand input_ids with `num_return_sequences` additional sequences per batch + input_ids, model_kwargs = self._expand_inputs_for_generation( + input_ids, + expand_size=num_return_sequences, + is_encoder_decoder=self.config.is_encoder_decoder, + **model_kwargs, + ) + # have to drop past_key_values because input_ids resizing + outputs["past_key_values"] = None + # add token and increase length by one input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) @@ -727,7 +767,7 @@ def greedy_search( outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder ) - # stop when there is a in each sentence, or if we exceed the maximul length + # stop when there is a in each sentence, or if we exceed the maximum length if unfinished_sequences.max() == 0: break @@ -744,6 +784,7 @@ def sample( max_length: Optional[int] = None, pad_token_id: Optional[int] = None, eos_token_id: Optional[int] = None, + diverse_sequences: Optional[bool] = False, **model_kwargs ): r""" @@ -768,6 +809,9 @@ def sample( The id of the `padding` token. eos_token_id (:obj:`int`, `optional`): The id of the `end-of-sequence` token. + diverse_sequences (:obj:`bool`, `optional`, defaults to False): + if :obj:`True` the method can generate sequences with each sequence started with one of + :obj:`num_return_sequences` different first tokens predicted. model_kwargs: Additional model specific kwargs will be forwarded to the :obj:`forward` function of the model. If model is an encoder-decoder model the kwargs should include :obj:`encoder_outputs`. @@ -823,6 +867,12 @@ def sample( sequence_lengths, unfinished_sequences, cur_len = self._init_sequence_length_for_generation( input_ids, max_length ) + if ("num_return_sequences" in model_kwargs) and diverse_sequences: + num_return_sequences = model_kwargs["num_return_sequences"] + starting_length = cur_len + else: + num_return_sequences = 1 + starting_length = None # auto-regressive generation while cur_len < max_length: @@ -837,15 +887,34 @@ def sample( scores = logits_processor(input_ids, next_token_logits) scores = logits_warper(input_ids, scores) - # sample - probs = F.softmax(scores, dim=-1) - next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) - - # add code that transfomers next_tokens to tokens_to_add + if (cur_len == starting_length) and (num_return_sequences > 1) and diverse_sequences: + # sample num_return_sequences tokens + probs = F.softmax(scores, dim=-1) + next_tokens = torch.multinomial(probs, num_samples=num_return_sequences).reshape(-1) + # Once we got next_tokens, we have expand metadata + unfinished_sequences = unfinished_sequences.repeat_interleave(num_return_sequences, dim=0) + sequence_lengths = sequence_lengths.repeat_interleave(num_return_sequences, dim=0) + else: + # sample + probs = F.softmax(scores, dim=-1) + next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) + + # add code that transforms next_tokens (to tokens_to_add) if eos_token_id is not None: assert pad_token_id is not None, "If eos_token_id is defined, make sure that pad_token_id is defined." 
next_tokens = next_tokens * unfinished_sequences + (pad_token_id) * (1 - unfinished_sequences) + if (cur_len == starting_length) and (num_return_sequences > 1) and diverse_sequences: + # expand input_ids with `num_return_sequences` additional sequences per batch + input_ids, model_kwargs = self._expand_inputs_for_generation( + input_ids, + expand_size=num_return_sequences, + is_encoder_decoder=self.config.is_encoder_decoder, + **model_kwargs, + ) + # have to drop past_key_values because input_ids resizing + outputs["past_key_values"] = None + # add token and increase length by one input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) cur_len = cur_len + 1 diff --git a/src/transformers/models/albert/tokenization_albert_fast.py b/src/transformers/models/albert/tokenization_albert_fast.py index f538cc970188..abc305e4f142 100644 --- a/src/transformers/models/albert/tokenization_albert_fast.py +++ b/src/transformers/models/albert/tokenization_albert_fast.py @@ -71,10 +71,10 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast): """ - Construct a "fast" ALBERT tokenizer (backed by HuggingFace's `tokenizers` library). Based on `SentencePiece - `__. This tokenizer inherits from - :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main methods. Users should refer to this - superclass for more information regarding those methods + Construct a "fast" ALBERT tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram + `__. This tokenizer + inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods Args: vocab_file (:obj:`str`): diff --git a/src/transformers/models/camembert/tokenization_camembert_fast.py b/src/transformers/models/camembert/tokenization_camembert_fast.py index 55a609b3c23d..01735bdc4053 100644 --- a/src/transformers/models/camembert/tokenization_camembert_fast.py +++ b/src/transformers/models/camembert/tokenization_camembert_fast.py @@ -60,8 +60,8 @@ class CamembertTokenizerFast(PreTrainedTokenizerFast): """ Construct a "fast" CamemBERT tokenizer (backed by HuggingFace's `tokenizers` library). Adapted from - :class:`~transformers.RobertaTokenizer` and :class:`~transformers.XLNetTokenizer`. Based on `SentencePiece - `__. + :class:`~transformers.RobertaTokenizer` and :class:`~transformers.XLNetTokenizer`. Based on `BPE + `__. This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main methods. Users should refer to this superclass for more information regarding those methods. diff --git a/src/transformers/models/mbart/tokenization_mbart_fast.py b/src/transformers/models/mbart/tokenization_mbart_fast.py index 879c876afc41..c25b291c2a05 100644 --- a/src/transformers/models/mbart/tokenization_mbart_fast.py +++ b/src/transformers/models/mbart/tokenization_mbart_fast.py @@ -67,7 +67,8 @@ class MBartTokenizerFast(XLMRobertaTokenizerFast): """ - Construct a "fast" MBART tokenizer (backed by HuggingFace's `tokenizers` library). + Construct a "fast" MBART tokenizer (backed by HuggingFace's `tokenizers` library). Based on `BPE + `__. :class:`~transformers.MBartTokenizerFast` is a subclass of :class:`~transformers.XLMRobertaTokenizerFast` and adds a new :meth:`~transformers.MBartTokenizerFast.prepare_seq2seq_batch`. 
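(Usage note, not part of the patch itself: a rough sketch of how the `diverse_sequences` flag added to `generate()`, `greedy_search()` and `sample()` in the `generation_utils.py` hunks above is meant to be called. The checkpoint name here is a placeholder; with the flag set, greedy search returns `num_return_sequences` outputs, each continuing greedily from one of the top-ranked first tokens instead of raising the usual "num_return_sequences has to be 1" error.)

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder checkpoint; any causal LM exercises the same code path.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

input_ids = tokenizer("The meaning of life is", return_tensors="pt").input_ids

outputs = model.generate(
    input_ids,
    do_sample=False,            # greedy search
    num_beams=1,
    max_length=20,
    num_return_sequences=4,     # normally rejected for greedy search
    diverse_sequences=True,     # new flag introduced by this PR
)
for seq in outputs:
    print(tokenizer.decode(seq, skip_special_tokens=True))
```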
diff --git a/src/transformers/models/mt5/configuration_mt5.py b/src/transformers/models/mt5/configuration_mt5.py index 09e9ac2262c9..79a20e3264ec 100644 --- a/src/transformers/models/mt5/configuration_mt5.py +++ b/src/transformers/models/mt5/configuration_mt5.py @@ -60,6 +60,8 @@ class MT5Config(PretrainedConfig): testing). feed_forward_proj (:obj:`string`, `optional`, defaults to :obj:`"gated-gelu"`): Type of feed forward layer to be used. Should be one of :obj:`"relu"` or :obj:`"gated-gelu"`. + use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not the model should return the last key/values attentions (not used by all models). """ model_type = "mt5" keys_to_ignore_at_inference = ["past_key_values"] @@ -79,6 +81,7 @@ def __init__( initializer_factor=1.0, feed_forward_proj="gated-gelu", is_encoder_decoder=True, + use_cache=True, tokenizer_class="T5Tokenizer", tie_word_embeddings=False, pad_token_id=0, @@ -109,6 +112,7 @@ def __init__( self.layer_norm_epsilon = layer_norm_epsilon self.initializer_factor = initializer_factor self.feed_forward_proj = feed_forward_proj + self.use_cache = use_cache @property def hidden_size(self): diff --git a/src/transformers/models/pegasus/tokenization_pegasus.py b/src/transformers/models/pegasus/tokenization_pegasus.py index 5728338276d2..099bdf3e7b31 100644 --- a/src/transformers/models/pegasus/tokenization_pegasus.py +++ b/src/transformers/models/pegasus/tokenization_pegasus.py @@ -12,11 +12,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, List, Optional +import os +from shutil import copyfile +from typing import Dict, List, Optional, Tuple + +import sentencepiece as spm from ...file_utils import add_start_docstrings +from ...tokenization_utils import PreTrainedTokenizer from ...tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING, BatchEncoding -from ..reformer.tokenization_reformer import ReformerTokenizer +from ...utils import logging SPIECE_UNDERLINE = "▁" @@ -32,31 +37,145 @@ } -class PegasusTokenizer(ReformerTokenizer): +logger = logging.get_logger(__name__) + + +class PegasusTokenizer(PreTrainedTokenizer): r""" - Construct a Pegasus tokenizer. + Construct a PEGASUS tokenizer. Based on `SentencePiece `__. + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods. + Users should refer to this superclass for more information regarding those methods. - :class:`~transformers.PegasusTokenizer` is identical to :class:`~transformers.ReformerTokenizer` and adds a new - :meth:`~transformers.PegasusTokenizer.prepare_seq2seq_batch` + Args: + vocab_file (:obj:`str`): + `SentencePiece `__ file (generally has a `.spm` extension) that + contains the vocabulary necessary to instantiate a tokenizer. + pad_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The token used for padding, for example when batching sequences of different lengths. + eos_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The end of sequence token. - Refer to superclass :class:`~transformers.ReformerTokenizer` for usage examples and documentation concerning the - initialization parameters and other methods. + .. note:: + + When building a sequence using special tokens, this is not the token that is used for the end of + sequence. The token used is the :obj:`sep_token`. 
+ unk_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + mask_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The token used for masking single token values. This is the token used when training this model with masked + language modeling (MLM). This is the token that the PEGASUS encoder will try to predict during pretraining. + It corresponds to `[MASK2]` in `PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive + Summarization `__. + mask_token_sent (:obj:`str`, `optional`, defaults to :obj:`""`): + The token used for masking whole target sentences. This is the token used when training this model with gap + sentences generation (GSG). This is the sentence that the PEGASUS decoder will try to predict during + pretraining. It corresponds to `[MASK1]` in `PEGASUS: Pre-training with Extracted Gap-sentences for + Abstractive Summarization `__. + additional_special_tokens (:obj:`List[str]`, `optional`): + Additional special tokens used by the tokenizer. If no additional_special_tokens are provided and + are used as additional special tokens corresponding to the `original PEGASUS + tokenizer + `__ + that uses the tokens 2 - 104 only for pretraining """ - offset = 103 # entries 2-104 are only used for pretraining + vocab_files_names = VOCAB_FILES_NAMES + + offset = 103 # entries 2 - 104 are only used for pretraining vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ["attention_mask"] + + def __init__( + self, + vocab_file, + pad_token="", + eos_token="", + unk_token="", + mask_token="", + mask_token_sent="", + additional_special_tokens=None, + **kwargs + ): + if additional_special_tokens is not None: + assert isinstance( + additional_special_tokens, list + ), f"additional_special_tokens should be of type {type(list)}, but is {type(additional_special_tokens)}" + + additional_special_tokens_extended = ( + ([mask_token_sent] + additional_special_tokens) + if mask_token_sent not in additional_special_tokens + else additional_special_tokens + ) + # fill additional tokens with ..., in case not all additional tokens are already taken + additional_special_tokens_extended += [ + f"" for i in range(len(additional_special_tokens_extended), self.offset - 1) + ] + + if len(set(additional_special_tokens_extended)) != len(additional_special_tokens_extended): + raise ValueError( + f"Please make sure that the provided additional_special_tokens do not contain an incorrectly shifted list of tokens. Found {additional_special_tokens_extended}." + ) + additional_special_tokens = additional_special_tokens_extended + else: + additional_special_tokens = [mask_token_sent] + additional_special_tokens += [f"" for i in range(2, self.offset)] - def __init__(self, *args, pad_token="", **kwargs): - super().__init__(*args, **kwargs, pad_token="") - # Don't use reserved words added_token_encoder, added_tokens_decoder because of - # AssertionError: Non-consecutive added token '1' found. 
in from_pretrained - assert len(self.added_tokens_decoder) == 0 - self.encoder: Dict[int, str] = {0: self.pad_token, 1: self.eos_token} - # entries 2-104 are only used for pretraining and called unk_2, ...unk_104 - self.encoder.update({i: f"unk_{i}" for i in range(2, self.offset + 2)}) + super().__init__( + eos_token=eos_token, + unk_token=unk_token, + mask_token=mask_token, + pad_token=pad_token, + mask_token_sent=mask_token_sent, + additional_special_tokens=additional_special_tokens, + **kwargs, + ) + self.vocab_file = vocab_file + self.sp_model = spm.SentencePieceProcessor() + self.sp_model.Load(vocab_file) + self.mask_token_sent = mask_token_sent + + # add special tokens to encoder dict + self.encoder: Dict[int, str] = { + 0: self.pad_token, + 1: self.eos_token, + 2: self.mask_token_sent, + 3: self.mask_token, + } + # entries 2-104 are only used for pretraining and called , , unk_2, ...unk_102 + # mask_token_sent is already added to list -> so start at 1 + self.encoder.update({i + 3: additional_special_tokens[i] for i in range(1, self.offset - 1)}) self.decoder: Dict[str, int] = {v: k for k, v in self.encoder.items()} + @property + def vocab_size(self) -> int: + return len(self.sp_model) + self.offset + + def get_vocab(self) -> Dict[str, int]: + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + return state + + def __setstate__(self, d): + self.__dict__ = d + self.sp_model = spm.SentencePieceProcessor() + self.sp_model.Load(self.vocab_file) + + def _tokenize(self, text, sample=False): + """Take as input a string and return a list of strings (tokens) for words/sub-words""" + if not sample: + pieces = self.sp_model.EncodeAsPieces(text) + else: + pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1) + return pieces + def _convert_token_to_id(self, token: str) -> int: """ Converts a token (str) to an id using the vocab. """ if token in self.decoder: @@ -73,13 +192,13 @@ def _convert_id_to_token(self, index: int) -> str: elif index in self.added_tokens_encoder: return self.added_tokens_encoder[index] else: - # assert index > self.offset, f"cannot decode ids between 2 and {self.offset}. Got {index}" token = self.sp_model.IdToPiece(index - self.offset) return token - @property - def vocab_size(self) -> int: - return len(self.sp_model) + self.offset + def convert_tokens_to_string(self, tokens): + """ Converts a sequence of tokens (string) in a single string. 
""" + out_string = self.sp_model.decode_pieces(tokens) + return out_string def num_special_tokens_to_add(self, pair=False): """Just EOS""" @@ -88,7 +207,11 @@ def num_special_tokens_to_add(self, pair=False): def _special_token_mask(self, seq): all_special_ids = set(self.all_special_ids) # call it once instead of inside list comp all_special_ids.remove(self.unk_token_id) # is only sometimes special - assert all_special_ids == set([0, 1]) + + assert all_special_ids == set( + range(len(self.additional_special_tokens) + 3) + ), f"There should be 3 special tokens: mask_token, pad_token, and eos_token + {len(self.additional_special_tokens)} additional_special_tokens, but got {all_special_ids}" + return [1 if x in all_special_ids else 0 for x in seq] def get_special_tokens_mask( @@ -105,7 +228,7 @@ def get_special_tokens_mask( def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]: """ Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating - and adding special tokens. A Pegasus sequence has the following format, where ``X`` represents the sequence: + and adding special tokens. A PEGASUS sequence has the following format, where ``X`` represents the sequence: - single sequence: ``X `` - pair of sequences: ``A B `` (not intended use) @@ -156,3 +279,16 @@ def prepare_seq2seq_batch( labels: BatchEncoding = self(tgt_texts, **tokenizer_kwargs)["input_ids"] model_inputs["labels"] = labels return model_inputs + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file,) diff --git a/src/transformers/models/pegasus/tokenization_pegasus_fast.py b/src/transformers/models/pegasus/tokenization_pegasus_fast.py index e221eb4b54b0..c9b0d0763140 100644 --- a/src/transformers/models/pegasus/tokenization_pegasus_fast.py +++ b/src/transformers/models/pegasus/tokenization_pegasus_fast.py @@ -12,11 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Optional +""" Tokenization class for model PEGASUS.""" + + +import os +from shutil import copyfile +from typing import List, Optional, Tuple from ...file_utils import add_start_docstrings, is_sentencepiece_available from ...tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING, BatchEncoding -from ..reformer.tokenization_reformer_fast import ReformerTokenizerFast +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from ...utils import logging if is_sentencepiece_available(): @@ -25,6 +31,9 @@ PegasusTokenizer = None +logger = logging.get_logger(__name__) + + SPIECE_UNDERLINE = "▁" VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"} @@ -39,21 +48,112 @@ } -class PegasusTokenizerFast(ReformerTokenizerFast): +class PegasusTokenizerFast(PreTrainedTokenizerFast): + r""" + Construct a "fast" PEGASUS tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram + `__. 
+ + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods. + Users should refer to this superclass for more information regarding those methods. + + Args: + vocab_file (:obj:`str`): + `SentencePiece `__ file (generally has a `.spm` extension) that + contains the vocabulary necessary to instantiate a tokenizer. + pad_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The token used for padding, for example when batching sequences of different lengths. + eos_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The end of sequence token. + + .. note:: + + When building a sequence using special tokens, this is not the token that is used for the end of + sequence. The token used is the :obj:`sep_token`. + unk_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + mask_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The token used for masking single token values. This is the token used when training this model with masked + language modeling (MLM). This is the token that the PEGASUS encoder will try to predict during pretraining. + It corresponds to `[MASK2]` in `PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive + Summarization `__. + mask_token_sent (:obj:`str`, `optional`, defaults to :obj:`""`): + The token used for masking whole target sentences. This is the token used when training this model with gap + sentences generation (GSG). This is the sentence that the PEGASUS decoder will try to predict during + pretraining. It corresponds to `[MASK1]` in `PEGASUS: Pre-training with Extracted Gap-sentences for + Abstractive Summarization `__. + additional_special_tokens (:obj:`List[str]`, `optional`): + Additional special tokens used by the tokenizer. If no additional_special_tokens are provided and + are used as additional special tokens corresponding to the `original PEGASUS + tokenizer + `__ + that uses the tokens 2 - 104 only for pretraining + """ offset = 103 # entries 2-104 are only used for pretraining vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES slow_tokenizer_class = PegasusTokenizer + model_input_names = ["attention_mask"] + + def __init__( + self, + vocab_file, + tokenizer_file=None, + pad_token="", + eos_token="", + unk_token="", + mask_token="", + mask_token_sent="", + additional_special_tokens=None, + **kwargs + ): + if additional_special_tokens is not None: + assert isinstance( + additional_special_tokens, list + ), f"additional_special_tokens should be of type {type(list)}, but is {type(additional_special_tokens)}" + + additional_special_tokens_extended = ( + ([mask_token_sent] + additional_special_tokens) + if mask_token_sent not in additional_special_tokens + else additional_special_tokens + ) + # fill additional tokens with ..., in case not all additional tokens are already taken + additional_special_tokens_extended += [ + f"" for i in range(len(additional_special_tokens_extended), self.offset - 1) + ] + + if len(set(additional_special_tokens_extended)) != len(additional_special_tokens_extended): + raise ValueError( + f"Please make sure that the provided additional_special_tokens do not contain an incorrectly shifted list of tokens. Found {additional_special_tokens_extended}." 
+ ) + additional_special_tokens = additional_special_tokens_extended + else: + additional_special_tokens = [mask_token_sent] + additional_special_tokens += [f"" for i in range(2, self.offset)] + + super().__init__( + vocab_file, + tokenizer_file=tokenizer_file, + pad_token=pad_token, + eos_token=eos_token, + unk_token=unk_token, + mask_token=mask_token, + mask_token_sent=mask_token_sent, + additional_special_tokens=additional_special_tokens, + **kwargs, + ) - # def num_special_tokens_to_add(self, pair=False): - # """Just EOS""" - # return 1 + self.vocab_file = vocab_file def _special_token_mask(self, seq): all_special_ids = set(self.all_special_ids) # call it once instead of inside list comp all_special_ids.remove(self.unk_token_id) # is only sometimes special - assert all_special_ids == set([0, 1]) + + assert all_special_ids == set( + range(len(self.additional_special_tokens) + 3) + ), f"There should be 3 special tokens: mask_token, pad_token, and eos_token + {len(self.additional_special_tokens)} additional_special_tokens, but got {all_special_ids}" + return [1 if x in all_special_ids else 0 for x in seq] def get_special_tokens_mask( @@ -117,3 +217,16 @@ def prepare_seq2seq_batch( labels: BatchEncoding = self(tgt_texts, **tokenizer_kwargs)["input_ids"] model_inputs["labels"] = labels return model_inputs + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file,) diff --git a/src/transformers/models/reformer/tokenization_reformer_fast.py b/src/transformers/models/reformer/tokenization_reformer_fast.py index 1a3d58f84d40..21deced05d31 100644 --- a/src/transformers/models/reformer/tokenization_reformer_fast.py +++ b/src/transformers/models/reformer/tokenization_reformer_fast.py @@ -64,8 +64,8 @@ class ReformerTokenizerFast(PreTrainedTokenizerFast): """ - Construct a "fast" Reformer tokenizer (backed by HuggingFace's `tokenizers` library). Based on `SentencePiece - `__ . + Construct a "fast" Reformer tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram + `__. This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main methods. Users should refer to this superclass for more information regarding those methods. diff --git a/src/transformers/models/t5/tokenization_t5_fast.py b/src/transformers/models/t5/tokenization_t5_fast.py index 5b259ea087d0..e4ad4a306643 100644 --- a/src/transformers/models/t5/tokenization_t5_fast.py +++ b/src/transformers/models/t5/tokenization_t5_fast.py @@ -75,8 +75,8 @@ class T5TokenizerFast(PreTrainedTokenizerFast): """ - Construct a "fast" T5 tokenizer (backed by HuggingFace's `tokenizers` library). Based on `SentencePiece - `__ . + Construct a "fast" T5 tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram + `__. This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main methods. Users should refer to this superclass for more information regarding those methods. 
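(Aside, not part of the patch: the docstring corrections in this diff replace the generic "SentencePiece" reference with the actual subword algorithm, Unigram for ALBERT/Pegasus/Reformer/T5/XLNet and BPE for CamemBERT/MBart/XLM-RoBERTa. A rough way to sanity-check this, assuming the checkpoints can be downloaded, is to inspect each fast tokenizer's backend model:)

```python
from transformers import AutoTokenizer

# Each fast tokenizer wraps a `tokenizers.Tokenizer`; the class name of its
# `.model` attribute reveals the subword algorithm that actually backs it.
for name in ["albert-base-v2", "camembert-base", "t5-small", "xlm-roberta-base"]:
    tok = AutoTokenizer.from_pretrained(name, use_fast=True)
    print(name, type(tok.backend_tokenizer.model).__name__)  # e.g. "Unigram" or "BPE"
```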
diff --git a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py index 8a7b15807749..b8c9f0918e46 100644 --- a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py +++ b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py @@ -66,8 +66,8 @@ class XLMRobertaTokenizerFast(PreTrainedTokenizerFast): """ Construct a "fast" XLM-RoBERTa tokenizer (backed by HuggingFace's `tokenizers` library). Adapted from - :class:`~transfomers.RobertaTokenizer` and class:`~transfomers.XLNetTokenizer`. Based on `SentencePiece - `__. + :class:`~transfomers.RobertaTokenizer` and class:`~transfomers.XLNetTokenizer`. Based on `BPE + `__. This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main methods. Users should refer to this superclass for more information regarding those methods. diff --git a/src/transformers/models/xlnet/tokenization_xlnet_fast.py b/src/transformers/models/xlnet/tokenization_xlnet_fast.py index 60e1010dae2c..84af74070d4c 100644 --- a/src/transformers/models/xlnet/tokenization_xlnet_fast.py +++ b/src/transformers/models/xlnet/tokenization_xlnet_fast.py @@ -62,8 +62,8 @@ class XLNetTokenizerFast(PreTrainedTokenizerFast): """ - Construct a "fast" XLNet tokenizer (backed by HuggingFace's `tokenizers` library). Based on `SentencePiece - `__. + Construct a "fast" XLNet tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram + `__. This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main methods. Users should refer to this superclass for more information regarding those methods. diff --git a/tests/test_generation_utils.py b/tests/test_generation_utils.py index dee7873fb9ac..a5c879bba343 100644 --- a/tests/test_generation_utils.py +++ b/tests/test_generation_utils.py @@ -151,6 +151,17 @@ def test_greedy_generate(self): **logits_process_kwargs, ) + output_ids_generate_diverse_sequences = model.generate( + input_ids, + attention_mask=attention_mask, + do_sample=False, + num_beams=1, + max_length=max_length, + num_return_sequences=4, + diverse_sequences=True, + **logits_process_kwargs, + ) + if model.config.is_encoder_decoder: encoder_outputs, input_ids, attention_mask = self._get_encoder_outputs( model, input_ids, attention_mask @@ -165,7 +176,19 @@ def test_greedy_generate(self): logits_processor=logits_processor, **kwargs, ) + output_ids_greedy_diverse_sequences = model.greedy_search( + input_ids, + max_length=max_length, + attention_mask=attention_mask, + logits_processor=logits_processor, + num_return_sequences=4, + diverse_sequences=True, + **kwargs, + ) self.assertListEqual(output_ids_generate.tolist(), output_ids_greedy.tolist()) + self.assertListEqual( + output_ids_generate_diverse_sequences.tolist(), output_ids_greedy_diverse_sequences.tolist() + ) def test_sample_generate(self): for model_class in self.all_generative_model_classes: @@ -192,6 +215,17 @@ def test_sample_generate(self): **logits_warper_kwargs, **process_kwargs, ) + torch.manual_seed(0) + output_ids_generate_diverse = model.generate( + input_ids, + do_sample=True, + num_beams=1, + max_length=max_length, + attention_mask=attention_mask, + diverse_sequences=True, + **logits_warper_kwargs, + **process_kwargs, + ) torch.manual_seed(0) kwargs = {} @@ -213,7 +247,20 @@ def test_sample_generate(self): logits_warper=logits_warper, **kwargs, ) + torch.manual_seed(0) + output_ids_sample_diverse = 
model.sample( + input_ids_clone, + attention_mask=attention_mask_clone, + max_length=max_length, + logits_processor=logits_processor, + logits_warper=logits_warper, + diverse_sequences=True, + **kwargs, + ) self.assertListEqual(output_ids_generate.tolist(), output_ids_sample.tolist()) + self.assertListEqual(output_ids_generate_diverse.tolist(), output_ids_sample_diverse.tolist()) + self.assertListEqual(output_ids_generate.tolist(), output_ids_generate_diverse.tolist()) + self.assertListEqual(output_ids_sample.tolist(), output_ids_sample_diverse.tolist()) # check `generate()` and `sample()` yield equal results for `num_return_sequences` num_return_sequences = 3 @@ -231,18 +278,39 @@ def test_sample_generate(self): **logits_warper_kwargs, **process_kwargs, ) + torch.manual_seed(0) + output_ids_generate_diverse = model.generate( + input_ids, + do_sample=True, + num_beams=1, + max_length=max_length, + num_return_sequences=num_return_sequences, + attention_mask=attention_mask, + diverse_sequences=True, + **logits_warper_kwargs, + **process_kwargs, + ) torch.manual_seed(0) kwargs = {} + kwargs_diverse = {} if model.config.is_encoder_decoder: encoder_outputs, input_ids_clone, attention_mask_clone = self._get_encoder_outputs( model, input_ids, attention_mask, num_interleave=num_return_sequences ) + ( + encoder_outputs_diverse, + input_ids_clone_diverse, + attention_mask_clone_diverse, + ) = self._get_encoder_outputs(model, input_ids, attention_mask, num_interleave=1) kwargs["encoder_outputs"] = encoder_outputs + kwargs_diverse["encoder_outputs"] = encoder_outputs_diverse input_ids_clone = input_ids_clone.repeat_interleave(num_return_sequences, dim=0) else: attention_mask_clone = attention_mask.repeat_interleave(num_return_sequences, dim=0) input_ids_clone = input_ids.repeat_interleave(num_return_sequences, dim=0) + attention_mask_clone_diverse = attention_mask.clone() + input_ids_clone_diverse = input_ids.clone() with torch.no_grad(): output_ids_sample = model.sample( @@ -253,7 +321,19 @@ def test_sample_generate(self): logits_warper=logits_warper, **kwargs, ) + torch.manual_seed(0) + output_ids_sample_diverse = model.sample( + input_ids_clone_diverse, + attention_mask=attention_mask_clone_diverse, + max_length=max_length, + logits_processor=logits_processor, + logits_warper=logits_warper, + num_return_sequences=num_return_sequences, + diverse_sequences=True, + **kwargs_diverse, + ) self.assertListEqual(output_ids_generate.tolist(), output_ids_sample.tolist()) + self.assertListEqual(output_ids_generate_diverse.tolist(), output_ids_sample_diverse.tolist()) def test_beam_search_generate(self): for model_class in self.all_generative_model_classes: diff --git a/tests/test_tokenization_pegasus.py b/tests/test_tokenization_pegasus.py index ad26075da69f..7542f590c71e 100644 --- a/tests/test_tokenization_pegasus.py +++ b/tests/test_tokenization_pegasus.py @@ -26,21 +26,34 @@ def setUp(self): tokenizer.save_pretrained(self.tmpdirname) @cached_property - def pegasus_large_tokenizer(self): + def _large_tokenizer(self): return PegasusTokenizer.from_pretrained("google/pegasus-large") - @unittest.skip("add_tokens does not work yet") - def test_swap_special_token(self): - pass - def get_tokenizer(self, **kwargs) -> PegasusTokenizer: return PegasusTokenizer.from_pretrained(self.tmpdirname, **kwargs) def get_input_output_texts(self, tokenizer): return ("This is a test", "This is a test") - def test_pegasus_large_tokenizer_settings(self): - tokenizer = self.pegasus_large_tokenizer + def 
test_mask_tokens_rust_pegasus(self): + rust_tokenizer = self.rust_tokenizer_class.from_pretrained(self.tmpdirname) + py_tokenizer = self.tokenizer_class.from_pretrained(self.tmpdirname) + raw_input_str = "Let's see which is the better one It seems like this was important " + rust_ids = rust_tokenizer([raw_input_str], return_tensors=None, add_special_tokens=False).input_ids[0] + py_ids = py_tokenizer([raw_input_str], return_tensors=None, add_special_tokens=False).input_ids[0] + # TODO: (Thom, Patrick) - this fails because the rust tokenizer does not know about the , , and those yet + self.assertListEqual(py_ids, rust_ids) + + def test_large_mask_tokens(self): + tokenizer = self._large_tokenizer + # masks whole sentence while masks single word + raw_input_str = " To ensure a flow of bank resolutions." + desired_result = [2, 413, 615, 114, 3, 1971, 113, 1679, 10710, 107, 1] + ids = tokenizer([raw_input_str], return_tensors=None).input_ids[0] + self.assertListEqual(desired_result, ids) + + def test_large_tokenizer_settings(self): + tokenizer = self._large_tokenizer # The tracebacks for the following asserts are **better** without messages or self.assertEqual assert tokenizer.vocab_size == 96103 assert tokenizer.pad_token_id == 0 @@ -48,20 +61,18 @@ def test_pegasus_large_tokenizer_settings(self): assert tokenizer.offset == 103 assert tokenizer.unk_token_id == tokenizer.offset + 2 == 105 assert tokenizer.unk_token == "" - assert tokenizer.mask_token is None - assert tokenizer.mask_token_id is None assert tokenizer.model_max_length == 1024 raw_input_str = "To ensure a smooth flow of bank resolutions." desired_result = [413, 615, 114, 2291, 1971, 113, 1679, 10710, 107, 1] ids = tokenizer([raw_input_str], return_tensors=None).input_ids[0] self.assertListEqual(desired_result, ids) - assert tokenizer.convert_ids_to_tokens([0, 1, 2]) == ["", "", "unk_2"] + assert tokenizer.convert_ids_to_tokens([0, 1, 2, 3]) == ["", "", "", ""] @require_torch - def test_pegasus_large_seq2seq_truncation(self): + def test_large_seq2seq_truncation(self): src_texts = ["This is going to be way too long." * 150, "short example"] tgt_texts = ["not super long but more than 5 tokens", "tiny"] - batch = self.pegasus_large_tokenizer.prepare_seq2seq_batch( + batch = self._large_tokenizer.prepare_seq2seq_batch( src_texts, tgt_texts=tgt_texts, max_target_length=5, return_tensors="pt" ) assert batch.input_ids.shape == (2, 1024)
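(Closing aside, not part of the patch: a small sketch of the id layout the reworked `PegasusTokenizer` establishes, mirroring the assertions in `test_large_mask_tokens` and `test_large_tokenizer_settings` above: ids 0-3 hold the pad, eos, sentence-mask and word-mask tokens, ids up to `offset` (103) are reserved for the pretraining-only tokens, and every SentencePiece piece is shifted up by that offset. Requires downloading `google/pegasus-large`.)

```python
from transformers import PegasusTokenizer

tok = PegasusTokenizer.from_pretrained("google/pegasus-large")

# Reserved low ids set up in __init__: pad, eos, sentence mask (GSG), word mask (MLM).
print(tok.convert_ids_to_tokens([0, 1, 2, 3]))
print(tok.pad_token_id, tok.eos_token_id, tok.mask_token_id)  # 0 1 3

# Everything past the reserved range is a SentencePiece id shifted by `offset`.
print(tok.offset)         # 103
print(tok.unk_token_id)   # offset + 2 == 105
print(tok.vocab_size)     # len(sp_model) + offset == 96103

ids = tok(["To ensure a smooth flow of bank resolutions."]).input_ids[0]
print(ids)                # ends with 1 (eos), matching the expected ids in the tests above
```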