From b59603ae4fec40e66312f692a262b468639c115d Mon Sep 17 00:00:00 2001 From: vumichien Date: Fri, 1 Apr 2022 00:17:10 +0900 Subject: [PATCH 01/13] Add doctest BERT --- src/transformers/models/bert/modeling_bert.py | 23 +++++++++---------- .../models/bert/modeling_tf_bert.py | 22 +++++++++--------- utils/documentation_tests.txt | 2 ++ 3 files changed, 24 insertions(+), 23 deletions(-) diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index 6e1a41a6b3bd..27bf474dad1d 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -1087,9 +1087,6 @@ def forward( >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs) - - >>> prediction_logits = outputs.prediction_logits - >>> seq_relationship_logits = outputs.seq_relationship_logits ``` """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1214,9 +1211,6 @@ def forward( >>> model = BertLMHeadModel.from_pretrained("bert-base-cased", config=config) >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") - >>> outputs = model(**inputs) - - >>> prediction_logits = outputs.logits ``` """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1315,6 +1309,8 @@ def set_output_embeddings(self, new_embeddings): checkpoint=_CHECKPOINT_FOR_DOC, output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC, + expected_output="'paris'", + expected_loss=0.88, ) def forward( self, @@ -1443,9 +1439,6 @@ def forward( >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light." >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt") - >>> outputs = model(**encoding, labels=torch.LongTensor([1])) - >>> logits = outputs.logits - >>> assert logits[0, 0] < logits[0, 1] # next sentence was random ``` """ @@ -1517,9 +1510,11 @@ def __init__(self, config): @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, + checkpoint="textattack/bert-base-uncased-yelp-polarity", output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, + expected_output="'LABEL_1'", + expected_loss=0.01, ) def forward( self, @@ -1716,9 +1711,11 @@ def __init__(self, config): @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, + checkpoint="dbmdz/bert-large-cased-finetuned-conll03-english", output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC, + expected_output="['O', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'I-LOC', 'I-LOC']", + expected_loss=0.01, ) def forward( self, @@ -1797,9 +1794,11 @@ def __init__(self, config): @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, + checkpoint="deepset/bert-base-cased-squad2", output_type=QuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC, + expected_output="'a nice puppet'", + expected_loss=7.41, ) def forward( self, diff --git a/src/transformers/models/bert/modeling_tf_bert.py b/src/transformers/models/bert/modeling_tf_bert.py index 6dfae3d5fb60..618cc518056d 100644 --- 
a/src/transformers/models/bert/modeling_tf_bert.py +++ b/src/transformers/models/bert/modeling_tf_bert.py @@ -1201,10 +1201,6 @@ def call( >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") >>> model = TFBertForPreTraining.from_pretrained("bert-base-uncased") >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[ - ... None, : - >>> ] # Batch size 1 - >>> outputs = model(input_ids) - >>> prediction_scores, seq_relationship_scores = outputs[:2] ```""" outputs = self.bert( input_ids=input_ids, @@ -1288,6 +1284,8 @@ def get_prefix_bias_name(self) -> str: checkpoint=_CHECKPOINT_FOR_DOC, output_type=TFMaskedLMOutput, config_class=_CONFIG_FOR_DOC, + expected_output="'P a r i s'", + expected_loss=0.81, ) def call( self, @@ -1536,10 +1534,6 @@ def call( >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light." - >>> encoding = tokenizer(prompt, next_sentence, return_tensors="tf") - - >>> logits = model(encoding["input_ids"], token_type_ids=encoding["token_type_ids"])[0] - >>> assert logits[0][0] < logits[0][1] # the next sentence was random ```""" outputs = self.bert( input_ids=input_ids, @@ -1611,9 +1605,11 @@ def __init__(self, config: BertConfig, *inputs, **kwargs): @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, + checkpoint="nlptown/bert-base-multilingual-uncased-sentiment", output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, + expected_output="'5 stars'", + expected_loss=3.81, ) def call( self, @@ -1840,9 +1836,11 @@ def __init__(self, config: BertConfig, *inputs, **kwargs): @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, + checkpoint="dbmdz/bert-large-cased-finetuned-conll03-english", output_type=TFTokenClassifierOutput, config_class=_CONFIG_FOR_DOC, + expected_output="['O', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'I-LOC', 'I-LOC']", + expected_loss=0.01, ) def call( self, @@ -1931,9 +1929,11 @@ def __init__(self, config: BertConfig, *inputs, **kwargs): @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, + checkpoint="huggingface-course/bert-finetuned-squad", output_type=TFQuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC, + expected_output="'nice puppet'", + expected_loss=8.03, ) def call( self, diff --git a/utils/documentation_tests.txt b/utils/documentation_tests.txt index 372e63ad232b..9e6e9bca585f 100644 --- a/utils/documentation_tests.txt +++ b/utils/documentation_tests.txt @@ -7,6 +7,8 @@ src/transformers/generation_utils.py src/transformers/models/bart/modeling_bart.py src/transformers/models/bart/modeling_bart.py src/transformers/models/beit/modeling_beit.py +src/transformers/models/bert/modeling_bert.py +src/transformers/models/bert/modeling_tf_bert.py src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py src/transformers/models/blenderbot/modeling_blenderbot.py From e60a4bf5e0abadef0d4b1fb6b43fe2da8074763d Mon Sep 17 
00:00:00 2001 From: vumichien Date: Fri, 1 Apr 2022 00:41:00 +0900 Subject: [PATCH 02/13] make fixup --- src/transformers/models/bert/modeling_bert.py | 1 - src/transformers/models/bert/modeling_tf_bert.py | 5 ++++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index 27bf474dad1d..1a78338d3069 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -1438,7 +1438,6 @@ def forward( >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light." >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt") - ``` """ diff --git a/src/transformers/models/bert/modeling_tf_bert.py b/src/transformers/models/bert/modeling_tf_bert.py index 618cc518056d..844fed262cd4 100644 --- a/src/transformers/models/bert/modeling_tf_bert.py +++ b/src/transformers/models/bert/modeling_tf_bert.py @@ -1200,7 +1200,10 @@ def call( >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") >>> model = TFBertForPreTraining.from_pretrained("bert-base-uncased") - >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[ + >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[:None, :] + >>> # Batch size 1 + >>> outputs = model(input_ids) + >>> prediction_scores, seq_relationship_score = outputs[:2] ```""" outputs = self.bert( input_ids=input_ids, From 2ea5ebe866182998c5d34705e8cfd5397d2711a2 Mon Sep 17 00:00:00 2001 From: vumichien Date: Fri, 1 Apr 2022 00:52:06 +0900 Subject: [PATCH 03/13] fix typo --- src/transformers/models/bert/modeling_bert.py | 10 ++++++++++ src/transformers/models/bert/modeling_tf_bert.py | 6 +++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index 1a78338d3069..3950b65f4715 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -1087,6 +1087,9 @@ def forward( >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.prediction_logits + >>> seq_relationship_logits = outputs.seq_relationship_logits ``` """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1211,6 +1214,9 @@ def forward( >>> model = BertLMHeadModel.from_pretrained("bert-base-cased", config=config) >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.logits ``` """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1438,6 +1444,10 @@ def forward( >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light." 
>>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt") + + >>> outputs = model(**encoding, labels=torch.LongTensor([1])) + >>> logits = outputs.logits + >>> assert logits[0, 0] < logits[0, 1] # next sentence was random ``` """ diff --git a/src/transformers/models/bert/modeling_tf_bert.py b/src/transformers/models/bert/modeling_tf_bert.py index 844fed262cd4..6e754e4c8ddd 100644 --- a/src/transformers/models/bert/modeling_tf_bert.py +++ b/src/transformers/models/bert/modeling_tf_bert.py @@ -1203,7 +1203,7 @@ def call( >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[:None, :] >>> # Batch size 1 >>> outputs = model(input_ids) - >>> prediction_scores, seq_relationship_score = outputs[:2] + >>> prediction_scores, seq_relationship_scores = outputs[:2] ```""" outputs = self.bert( input_ids=input_ids, @@ -1537,6 +1537,10 @@ def call( >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light." + >>> encoding = tokenizer(prompt, next_sentence, return_tensors="tf") + + >>> logits = model(encoding["input_ids"], token_type_ids=encoding["token_type_ids"])[0] + >>> assert logits[0][0] < logits[0][1] # the next sentence was random ```""" outputs = self.bert( input_ids=input_ids, From 1193bfff3427d8577dab47c13c9544a175a933fb Mon Sep 17 00:00:00 2001 From: vumichien Date: Sat, 2 Apr 2022 13:56:37 +0900 Subject: [PATCH 04/13] change checkpoints --- src/transformers/models/bert/modeling_bert.py | 28 ++++++------------- .../models/bert/modeling_tf_bert.py | 18 ++++++++---- 2 files changed, 20 insertions(+), 26 deletions(-) diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index 3950b65f4715..8c6c45cb4df7 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -1090,6 +1090,7 @@ def forward( >>> prediction_logits = outputs.prediction_logits >>> seq_relationship_logits = outputs.seq_relationship_logits + ``` """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1156,7 +1157,12 @@ def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) - @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=CausalLMOutputWithCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) def forward( self, input_ids: Optional[torch.Tensor] = None, @@ -1199,25 +1205,6 @@ def forward( use_cache (`bool`, *optional*): If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see `past_key_values`). 
- - Returns: - - Example: - - ```python - >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig - >>> import torch - - >>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased") - >>> config = BertConfig.from_pretrained("bert-base-cased") - >>> config.is_decoder = True - >>> model = BertLMHeadModel.from_pretrained("bert-base-cased", config=config) - - >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") - >>> outputs = model(**inputs) - - >>> prediction_logits = outputs.logits - ``` """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict if labels is not None: @@ -1448,6 +1435,7 @@ def forward( >>> outputs = model(**encoding, labels=torch.LongTensor([1])) >>> logits = outputs.logits >>> assert logits[0, 0] < logits[0, 1] # next sentence was random + ``` """ diff --git a/src/transformers/models/bert/modeling_tf_bert.py b/src/transformers/models/bert/modeling_tf_bert.py index 6e754e4c8ddd..7261db312e96 100644 --- a/src/transformers/models/bert/modeling_tf_bert.py +++ b/src/transformers/models/bert/modeling_tf_bert.py @@ -1200,10 +1200,12 @@ def call( >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") >>> model = TFBertForPreTraining.from_pretrained("bert-base-uncased") - >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[:None, :] - >>> # Batch size 1 + >>> input_ids = tokenizer("Hello, my dog is cute", add_special_tokens=True, return_tensors="tf") # Batch size 1 + >>> outputs = model(input_ids) - >>> prediction_scores, seq_relationship_scores = outputs[:2] + >>> prediction_logits, seq_relationship_logits = outputs[:2] + + ```""" outputs = self.bert( input_ids=input_ids, @@ -1541,6 +1543,10 @@ def call( >>> logits = model(encoding["input_ids"], token_type_ids=encoding["token_type_ids"])[0] >>> assert logits[0][0] < logits[0][1] # the next sentence was random + + + + ```""" outputs = self.bert( input_ids=input_ids, @@ -1612,11 +1618,11 @@ def __init__(self, config: BertConfig, *inputs, **kwargs): @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint="nlptown/bert-base-multilingual-uncased-sentiment", + checkpoint="ydshieh/bert-base-uncased-yelp-polarity", output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, - expected_output="'5 stars'", - expected_loss=3.81, + expected_output="'LABEL_1'", + expected_loss=0.01, ) def call( self, From e4a426f6897fd9f59e825729df7c518d52057065 Mon Sep 17 00:00:00 2001 From: vumichien Date: Sat, 2 Apr 2022 14:06:44 +0900 Subject: [PATCH 05/13] make fixup --- src/transformers/models/bert/modeling_bert.py | 48 +++++++++---------- .../models/bert/modeling_tf_bert.py | 10 ++-- 2 files changed, 25 insertions(+), 33 deletions(-) diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index 8c6c45cb4df7..5acabea17515 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -1090,7 +1090,6 @@ def forward( >>> prediction_logits = outputs.prediction_logits >>> seq_relationship_logits = outputs.seq_relationship_logits - ``` """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1181,30 +1180,28 @@ def forward( return_dict: Optional[bool] = None, ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: r""" - encoder_hidden_states 
(`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention - if the model is configured as a decoder. - encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used - in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be - in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` - are ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., - config.vocab_size]` - past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): - Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up - decoding. - - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those - that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of - all `decoder_input_ids` of shape `(batch_size, sequence_length)`. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]` + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. 
+ use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict if labels is not None: @@ -1435,7 +1432,6 @@ def forward( >>> outputs = model(**encoding, labels=torch.LongTensor([1])) >>> logits = outputs.logits >>> assert logits[0, 0] < logits[0, 1] # next sentence was random - ``` """ diff --git a/src/transformers/models/bert/modeling_tf_bert.py b/src/transformers/models/bert/modeling_tf_bert.py index 7261db312e96..56bdb11f58d9 100644 --- a/src/transformers/models/bert/modeling_tf_bert.py +++ b/src/transformers/models/bert/modeling_tf_bert.py @@ -1200,12 +1200,12 @@ def call( >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") >>> model = TFBertForPreTraining.from_pretrained("bert-base-uncased") - >>> input_ids = tokenizer("Hello, my dog is cute", add_special_tokens=True, return_tensors="tf") # Batch size 1 + >>> input_ids = tokenizer( + ... "Hello, my dog is cute", add_special_tokens=True, return_tensors="tf" + >>> ) # Batch size 1 >>> outputs = model(input_ids) >>> prediction_logits, seq_relationship_logits = outputs[:2] - - ```""" outputs = self.bert( input_ids=input_ids, @@ -1543,10 +1543,6 @@ def call( >>> logits = model(encoding["input_ids"], token_type_ids=encoding["token_type_ids"])[0] >>> assert logits[0][0] < logits[0][1] # the next sentence was random - - - - ```""" outputs = self.bert( input_ids=input_ids, From 390a13f5a20538bfff4ed2f64c1371642220c48c Mon Sep 17 00:00:00 2001 From: vumichien Date: Tue, 5 Apr 2022 12:38:36 +0900 Subject: [PATCH 06/13] define doctest output value, update doctest for mobilebert --- src/transformers/models/bert/modeling_bert.py | 27 ++++++++++++---- .../models/bert/modeling_tf_bert.py | 20 ++++++++++-- .../models/mobilebert/modeling_mobilebert.py | 31 +++++++++++++++---- .../mobilebert/modeling_tf_mobilebert.py | 26 ++++++++++++++-- utils/documentation_tests.txt | 2 ++ 5 files changed, 88 insertions(+), 18 deletions(-) diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index 5acabea17515..22cc50d5255b 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -63,6 +63,21 @@ _CONFIG_FOR_DOC = "BertConfig" _TOKENIZER_FOR_DOC = "BertTokenizer" +# TokenClassification docstring +_TOKEN_CLASS_EXPECTED_OUTPUT = ( + "['O', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'I-LOC', " "'I-LOC'] " +) +_TOKEN_CLASS_EXPECTED_LOSS = 0.01 + +# QuestionAnswering docstring +_QA_EXPECTED_OUTPUT = "'a nice puppet'" +_QA_EXPECTED_LOSS = 7.41 + +# SequenceClassification docstring +_SEQ_CLASS_EXPECTED_OUTPUT = "'LABEL_1'" +_SEQ_CLASS_EXPECTED_LOSS = 0.01 + + BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "bert-base-uncased", "bert-large-uncased", @@ -1506,8 +1521,8 @@ def __init__(self, config): checkpoint="textattack/bert-base-uncased-yelp-polarity", output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, - expected_output="'LABEL_1'", - expected_loss=0.01, + expected_output=_SEQ_CLASS_EXPECTED_OUTPUT, + expected_loss=_SEQ_CLASS_EXPECTED_LOSS, ) def forward( self, @@ -1707,8 +1722,8 @@ def __init__(self, config): checkpoint="dbmdz/bert-large-cased-finetuned-conll03-english", output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC, - expected_output="['O', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 
'I-LOC', 'I-LOC']", - expected_loss=0.01, + expected_output=_TOKEN_CLASS_EXPECTED_OUTPUT, + expected_loss=_TOKEN_CLASS_EXPECTED_LOSS, ) def forward( self, @@ -1790,8 +1805,8 @@ def __init__(self, config): checkpoint="deepset/bert-base-cased-squad2", output_type=QuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC, - expected_output="'a nice puppet'", - expected_loss=7.41, + expected_output=_QA_EXPECTED_OUTPUT, + expected_loss=_QA_EXPECTED_LOSS, ) def forward( self, diff --git a/src/transformers/models/bert/modeling_tf_bert.py b/src/transformers/models/bert/modeling_tf_bert.py index 56bdb11f58d9..356810e68544 100644 --- a/src/transformers/models/bert/modeling_tf_bert.py +++ b/src/transformers/models/bert/modeling_tf_bert.py @@ -69,6 +69,20 @@ _CONFIG_FOR_DOC = "BertConfig" _TOKENIZER_FOR_DOC = "BertTokenizer" +# TokenClassification docstring +_TOKEN_CLASS_EXPECTED_OUTPUT = ( + "['O', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'I-LOC', " "'I-LOC'] " +) +_TOKEN_CLASS_EXPECTED_LOSS = 0.01 + +# QuestionAnswering docstring +_QA_EXPECTED_OUTPUT = "'a nice puppet'" +_QA_EXPECTED_LOSS = 7.41 + +# SequenceClassification docstring +_SEQ_CLASS_EXPECTED_OUTPUT = "'LABEL_1'" +_SEQ_CLASS_EXPECTED_LOSS = 0.01 + TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "bert-base-uncased", "bert-large-uncased", @@ -1938,11 +1952,11 @@ def __init__(self, config: BertConfig, *inputs, **kwargs): @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint="huggingface-course/bert-finetuned-squad", + checkpoint="ydshieh/bert-base-cased-squad2", output_type=TFQuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC, - expected_output="'nice puppet'", - expected_loss=8.03, + expected_output="'a nice puppet'", + expected_loss=7.41, ) def call( self, diff --git a/src/transformers/models/mobilebert/modeling_mobilebert.py b/src/transformers/models/mobilebert/modeling_mobilebert.py index d57b83570006..227a1c815a8d 100644 --- a/src/transformers/models/mobilebert/modeling_mobilebert.py +++ b/src/transformers/models/mobilebert/modeling_mobilebert.py @@ -59,6 +59,18 @@ _CONFIG_FOR_DOC = "MobileBertConfig" _TOKENIZER_FOR_DOC = "MobileBertTokenizer" +# TokenClassification docstring +_TOKEN_CLASS_EXPECTED_OUTPUT = "['I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'I-LOC', 'I-LOC']" +_TOKEN_CLASS_EXPECTED_LOSS = 0.03 + +# QuestionAnswering docstring +_QA_EXPECTED_OUTPUT = "'a nice puppet'" +_QA_EXPECTED_LOSS = 0 + +# SequenceClassification docstring +_SEQ_CLASS_EXPECTED_OUTPUT = "'others'" +_SEQ_CLASS_EXPECTED_LOSS = "4.72" + MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST = ["google/mobilebert-uncased"] @@ -962,9 +974,8 @@ def forward( >>> tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased") >>> model = MobileBertForPreTraining.from_pretrained("google/mobilebert-uncased") - >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze( - ... 
0 - >>> ) # Batch size 1 + >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) + >>> # Batch size 1 >>> outputs = model(input_ids) >>> prediction_logits = outputs.prediction_logits @@ -1039,6 +1050,8 @@ def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> nn.Em checkpoint=_CHECKPOINT_FOR_DOC, output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC, + expected_output="'paris'", + expected_loss=0.57, ) def forward( self, @@ -1229,9 +1242,11 @@ def __init__(self, config): @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, + checkpoint="lordtt13/emo-mobilebert", output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, + expected_output=_SEQ_CLASS_EXPECTED_OUTPUT, + expected_loss=_SEQ_CLASS_EXPECTED_LOSS, ) def forward( self, @@ -1330,9 +1345,11 @@ def __init__(self, config): @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, + checkpoint="csarron/mobilebert-uncased-squad-v2", output_type=QuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC, + expected_output=_QA_EXPECTED_OUTPUT, + expected_loss=_QA_EXPECTED_LOSS, ) def forward( self, @@ -1536,9 +1553,11 @@ def __init__(self, config): @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, + checkpoint="mrm8488/mobilebert-finetuned-ner", output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC, + expected_output=_TOKEN_CLASS_EXPECTED_OUTPUT, + expected_loss=_TOKEN_CLASS_EXPECTED_LOSS, ) def forward( self, diff --git a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py index 007be43f5f06..9739b2f45f7a 100644 --- a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py +++ b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py @@ -63,6 +63,18 @@ _CONFIG_FOR_DOC = "MobileBertConfig" _TOKENIZER_FOR_DOC = "MobileBertTokenizer" +# TokenClassification docstring +_TOKEN_CLASS_EXPECTED_OUTPUT = "['I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'I-LOC', 'I-LOC']" +_TOKEN_CLASS_EXPECTED_LOSS = 0.03 + +# QuestionAnswering docstring +_QA_EXPECTED_OUTPUT = "'a nice puppet'" +_QA_EXPECTED_LOSS = 0 + +# SequenceClassification docstring +_SEQ_CLASS_EXPECTED_OUTPUT = "'others'" +_SEQ_CLASS_EXPECTED_LOSS = "4.72" + TF_MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "google/mobilebert-uncased", # See all MobileBERT models at https://huggingface.co/models?filter=mobilebert @@ -1078,6 +1090,8 @@ def get_prefix_bias_name(self): checkpoint=_CHECKPOINT_FOR_DOC, output_type=TFMaskedLMOutput, config_class=_CONFIG_FOR_DOC, + expected_output="'p a r i s'", + expected_loss=0.57, ) def call( self, @@ -1270,9 +1284,11 @@ def __init__(self, config, *inputs, **kwargs): @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, + checkpoint="vumichien/emo-mobilebert", output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, + expected_output=_SEQ_CLASS_EXPECTED_OUTPUT, + 
expected_loss=_SEQ_CLASS_EXPECTED_LOSS, ) def call( self, @@ -1363,9 +1379,11 @@ def __init__(self, config, *inputs, **kwargs): @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, + checkpoint="vumichien/mobilebert-uncased-squad-v2", output_type=TFQuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC, + expected_output=_QA_EXPECTED_OUTPUT, + expected_loss=_QA_EXPECTED_LOSS, ) def call( self, @@ -1609,9 +1627,11 @@ def __init__(self, config, *inputs, **kwargs): @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, + checkpoint="vumichien/mobilebert-finetuned-ner", output_type=TFTokenClassifierOutput, config_class=_CONFIG_FOR_DOC, + expected_output=_TOKEN_CLASS_EXPECTED_OUTPUT, + expected_loss=_TOKEN_CLASS_EXPECTED_LOSS, ) def call( self, diff --git a/utils/documentation_tests.txt b/utils/documentation_tests.txt index 9e6e9bca585f..2360a2148866 100644 --- a/utils/documentation_tests.txt +++ b/utils/documentation_tests.txt @@ -25,6 +25,8 @@ src/transformers/models/marian/modeling_marian.py src/transformers/models/marian/modeling_marian.py src/transformers/models/mbart/modeling_mbart.py src/transformers/models/mbart/modeling_mbart.py +src/transformers/models/mobilebert/modeling_mobilebert.py +src/transformers/models/mobilebert/modeling_tf_mobilebert.py src/transformers/models/pegasus/modeling_pegasus.py src/transformers/models/pegasus/modeling_pegasus.py src/transformers/models/plbart/modeling_plbart.py From 1b5d204f63c95bbcf51b23169cc1a9fe23c22f26 Mon Sep 17 00:00:00 2001 From: vumichien Date: Wed, 6 Apr 2022 00:10:18 +0900 Subject: [PATCH 07/13] solve fix-copies --- src/transformers/models/bert/modeling_bert.py | 9 ++++++--- src/transformers/models/bert/modeling_tf_bert.py | 9 ++++++--- .../models/mobilebert/modeling_mobilebert.py | 9 ++++++--- .../models/mobilebert/modeling_tf_mobilebert.py | 9 ++++++--- 4 files changed, 24 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index 22cc50d5255b..8db38fdbee10 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -64,16 +64,19 @@ _TOKENIZER_FOR_DOC = "BertTokenizer" # TokenClassification docstring +_CHECKPOINT_FOR_TOKEN_CLASS = "dbmdz/bert-large-cased-finetuned-conll03-english" _TOKEN_CLASS_EXPECTED_OUTPUT = ( "['O', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'I-LOC', " "'I-LOC'] " ) _TOKEN_CLASS_EXPECTED_LOSS = 0.01 # QuestionAnswering docstring +_CHECKPOINT_FOR_QA = "deepset/bert-base-cased-squad2" _QA_EXPECTED_OUTPUT = "'a nice puppet'" _QA_EXPECTED_LOSS = 7.41 # SequenceClassification docstring +_CHECKPOINT_FOR_SEQ_CLASS = "textattack/bert-base-uncased-yelp-polarity" _SEQ_CLASS_EXPECTED_OUTPUT = "'LABEL_1'" _SEQ_CLASS_EXPECTED_LOSS = 0.01 @@ -1518,7 +1521,7 @@ def __init__(self, config): @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint="textattack/bert-base-uncased-yelp-polarity", + checkpoint=_CHECKPOINT_FOR_SEQ_CLASS, output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, expected_output=_SEQ_CLASS_EXPECTED_OUTPUT, @@ -1719,7 
+1722,7 @@ def __init__(self, config): @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint="dbmdz/bert-large-cased-finetuned-conll03-english", + checkpoint=_CHECKPOINT_FOR_TOKEN_CLASS, output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC, expected_output=_TOKEN_CLASS_EXPECTED_OUTPUT, @@ -1802,7 +1805,7 @@ def __init__(self, config): @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint="deepset/bert-base-cased-squad2", + checkpoint=_CHECKPOINT_FOR_QA, output_type=QuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC, expected_output=_QA_EXPECTED_OUTPUT, diff --git a/src/transformers/models/bert/modeling_tf_bert.py b/src/transformers/models/bert/modeling_tf_bert.py index 356810e68544..df8e4f310968 100644 --- a/src/transformers/models/bert/modeling_tf_bert.py +++ b/src/transformers/models/bert/modeling_tf_bert.py @@ -70,16 +70,19 @@ _TOKENIZER_FOR_DOC = "BertTokenizer" # TokenClassification docstring +_CHECKPOINT_FOR_TOKEN_CLASS = "dbmdz/bert-large-cased-finetuned-conll03-english" _TOKEN_CLASS_EXPECTED_OUTPUT = ( "['O', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'I-LOC', " "'I-LOC'] " ) _TOKEN_CLASS_EXPECTED_LOSS = 0.01 # QuestionAnswering docstring +_CHECKPOINT_FOR_QA = "ydshieh/bert-base-cased-squad2" _QA_EXPECTED_OUTPUT = "'a nice puppet'" _QA_EXPECTED_LOSS = 7.41 # SequenceClassification docstring +_CHECKPOINT_FOR_SEQ_CLASS = "ydshieh/bert-base-uncased-yelp-polarity" _SEQ_CLASS_EXPECTED_OUTPUT = "'LABEL_1'" _SEQ_CLASS_EXPECTED_LOSS = 0.01 @@ -1628,7 +1631,7 @@ def __init__(self, config: BertConfig, *inputs, **kwargs): @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint="ydshieh/bert-base-uncased-yelp-polarity", + checkpoint=_CHECKPOINT_FOR_SEQ_CLASS, output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, expected_output="'LABEL_1'", @@ -1859,7 +1862,7 @@ def __init__(self, config: BertConfig, *inputs, **kwargs): @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint="dbmdz/bert-large-cased-finetuned-conll03-english", + checkpoint=_CHECKPOINT_FOR_TOKEN_CLASS, output_type=TFTokenClassifierOutput, config_class=_CONFIG_FOR_DOC, expected_output="['O', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'I-LOC', 'I-LOC']", @@ -1952,7 +1955,7 @@ def __init__(self, config: BertConfig, *inputs, **kwargs): @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint="ydshieh/bert-base-cased-squad2", + checkpoint=_CHECKPOINT_FOR_QA, output_type=TFQuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC, expected_output="'a nice puppet'", diff --git a/src/transformers/models/mobilebert/modeling_mobilebert.py b/src/transformers/models/mobilebert/modeling_mobilebert.py index 227a1c815a8d..91c0582aed37 100644 --- a/src/transformers/models/mobilebert/modeling_mobilebert.py +++ b/src/transformers/models/mobilebert/modeling_mobilebert.py @@ -60,14 +60,17 @@ _TOKENIZER_FOR_DOC = "MobileBertTokenizer" # TokenClassification docstring 
+_CHECKPOINT_FOR_TOKEN_CLASS = "mrm8488/mobilebert-finetuned-ner" _TOKEN_CLASS_EXPECTED_OUTPUT = "['I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'I-LOC', 'I-LOC']" _TOKEN_CLASS_EXPECTED_LOSS = 0.03 # QuestionAnswering docstring +_CHECKPOINT_FOR_QA = "csarron/mobilebert-uncased-squad-v2" _QA_EXPECTED_OUTPUT = "'a nice puppet'" _QA_EXPECTED_LOSS = 0 # SequenceClassification docstring +_CHECKPOINT_FOR_SEQ_CLASS = "lordtt13/emo-mobilebert" _SEQ_CLASS_EXPECTED_OUTPUT = "'others'" _SEQ_CLASS_EXPECTED_LOSS = "4.72" @@ -1242,7 +1245,7 @@ def __init__(self, config): @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint="lordtt13/emo-mobilebert", + checkpoint=_CHECKPOINT_FOR_SEQ_CLASS, output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, expected_output=_SEQ_CLASS_EXPECTED_OUTPUT, @@ -1345,7 +1348,7 @@ def __init__(self, config): @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint="csarron/mobilebert-uncased-squad-v2", + checkpoint=_CHECKPOINT_FOR_QA, output_type=QuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC, expected_output=_QA_EXPECTED_OUTPUT, @@ -1553,7 +1556,7 @@ def __init__(self, config): @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint="mrm8488/mobilebert-finetuned-ner", + checkpoint=_CHECKPOINT_FOR_TOKEN_CLASS, output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC, expected_output=_TOKEN_CLASS_EXPECTED_OUTPUT, diff --git a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py index 9739b2f45f7a..c1abad43df74 100644 --- a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py +++ b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py @@ -64,14 +64,17 @@ _TOKENIZER_FOR_DOC = "MobileBertTokenizer" # TokenClassification docstring +_CHECKPOINT_FOR_TOKEN_CLASS = "vumichien/mobilebert-finetuned-ner" _TOKEN_CLASS_EXPECTED_OUTPUT = "['I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'I-LOC', 'I-LOC']" _TOKEN_CLASS_EXPECTED_LOSS = 0.03 # QuestionAnswering docstring +_CHECKPOINT_FOR_QA = "vumichien/mobilebert-uncased-squad-v2" _QA_EXPECTED_OUTPUT = "'a nice puppet'" _QA_EXPECTED_LOSS = 0 # SequenceClassification docstring +_CHECKPOINT_FOR_SEQ_CLASS = "vumichien/emo-mobilebert" _SEQ_CLASS_EXPECTED_OUTPUT = "'others'" _SEQ_CLASS_EXPECTED_LOSS = "4.72" @@ -1284,7 +1287,7 @@ def __init__(self, config, *inputs, **kwargs): @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint="vumichien/emo-mobilebert", + checkpoint=_CHECKPOINT_FOR_SEQ_CLASS, output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, expected_output=_SEQ_CLASS_EXPECTED_OUTPUT, @@ -1379,7 +1382,7 @@ def __init__(self, config, *inputs, **kwargs): @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint="vumichien/mobilebert-uncased-squad-v2", + checkpoint=_CHECKPOINT_FOR_QA, output_type=TFQuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC, 
expected_output=_QA_EXPECTED_OUTPUT, @@ -1627,7 +1630,7 @@ def __init__(self, config, *inputs, **kwargs): @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint="vumichien/mobilebert-finetuned-ner", + checkpoint=_CHECKPOINT_FOR_TOKEN_CLASS, output_type=TFTokenClassifierOutput, config_class=_CONFIG_FOR_DOC, expected_output=_TOKEN_CLASS_EXPECTED_OUTPUT, From 67a7e92c4ff45361d4d62ad8effa1dfe2294da9d Mon Sep 17 00:00:00 2001 From: vumichien Date: Fri, 8 Apr 2022 23:48:15 +0900 Subject: [PATCH 08/13] update QA target start index and end index --- src/transformers/models/bert/modeling_bert.py | 4 ++++ src/transformers/models/bert/modeling_tf_bert.py | 7 ++++--- src/transformers/models/mobilebert/modeling_mobilebert.py | 6 +++++- .../models/mobilebert/modeling_tf_mobilebert.py | 6 +++++- 4 files changed, 18 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index 8db38fdbee10..749f56e023e7 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -74,6 +74,8 @@ _CHECKPOINT_FOR_QA = "deepset/bert-base-cased-squad2" _QA_EXPECTED_OUTPUT = "'a nice puppet'" _QA_EXPECTED_LOSS = 7.41 +_QA_TARGET_START_INDEX = 14 +_QA_TARGET_END_INDEX = 15 # SequenceClassification docstring _CHECKPOINT_FOR_SEQ_CLASS = "textattack/bert-base-uncased-yelp-polarity" @@ -1808,6 +1810,8 @@ def __init__(self, config): checkpoint=_CHECKPOINT_FOR_QA, output_type=QuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC, + qa_target_start_index=_QA_TARGET_START_INDEX, + qa_target_end_index=_QA_TARGET_END_INDEX, expected_output=_QA_EXPECTED_OUTPUT, expected_loss=_QA_EXPECTED_LOSS, ) diff --git a/src/transformers/models/bert/modeling_tf_bert.py b/src/transformers/models/bert/modeling_tf_bert.py index d8511fe606bf..b996fdd896f4 100644 --- a/src/transformers/models/bert/modeling_tf_bert.py +++ b/src/transformers/models/bert/modeling_tf_bert.py @@ -80,6 +80,8 @@ _CHECKPOINT_FOR_QA = "ydshieh/bert-base-cased-squad2" _QA_EXPECTED_OUTPUT = "'a nice puppet'" _QA_EXPECTED_LOSS = 7.41 +_QA_TARGET_START_INDEX = 14 +_QA_TARGET_END_INDEX = 15 # SequenceClassification docstring _CHECKPOINT_FOR_SEQ_CLASS = "ydshieh/bert-base-uncased-yelp-polarity" @@ -1214,9 +1216,8 @@ def call( >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") >>> model = TFBertForPreTraining.from_pretrained("bert-base-uncased") - >>> input_ids = tokenizer( - ... 
"Hello, my dog is cute", add_special_tokens=True, return_tensors="tf" - >>> ) # Batch size 1 + >>> input_ids = tokenizer("Hello, my dog is cute", add_special_tokens=True, return_tensors="tf") + >>> # Batch size 1 >>> outputs = model(input_ids) >>> prediction_logits, seq_relationship_logits = outputs[:2] diff --git a/src/transformers/models/mobilebert/modeling_mobilebert.py b/src/transformers/models/mobilebert/modeling_mobilebert.py index 91c0582aed37..3f2587a78e4c 100644 --- a/src/transformers/models/mobilebert/modeling_mobilebert.py +++ b/src/transformers/models/mobilebert/modeling_mobilebert.py @@ -67,7 +67,9 @@ # QuestionAnswering docstring _CHECKPOINT_FOR_QA = "csarron/mobilebert-uncased-squad-v2" _QA_EXPECTED_OUTPUT = "'a nice puppet'" -_QA_EXPECTED_LOSS = 0 +_QA_EXPECTED_LOSS = 3.98 +_QA_TARGET_START_INDEX = 12 +_QA_TARGET_END_INDEX = 13 # SequenceClassification docstring _CHECKPOINT_FOR_SEQ_CLASS = "lordtt13/emo-mobilebert" @@ -1351,6 +1353,8 @@ def __init__(self, config): checkpoint=_CHECKPOINT_FOR_QA, output_type=QuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC, + qa_target_start_index=_QA_TARGET_START_INDEX, + qa_target_end_index=_QA_TARGET_END_INDEX, expected_output=_QA_EXPECTED_OUTPUT, expected_loss=_QA_EXPECTED_LOSS, ) diff --git a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py index fca1365f7494..da0a7e21a65b 100644 --- a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py +++ b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py @@ -71,7 +71,9 @@ # QuestionAnswering docstring _CHECKPOINT_FOR_QA = "vumichien/mobilebert-uncased-squad-v2" _QA_EXPECTED_OUTPUT = "'a nice puppet'" -_QA_EXPECTED_LOSS = 0 +_QA_EXPECTED_LOSS = 3.98 +_QA_TARGET_START_INDEX = 12 +_QA_TARGET_END_INDEX = 13 # SequenceClassification docstring _CHECKPOINT_FOR_SEQ_CLASS = "vumichien/emo-mobilebert" @@ -1379,6 +1381,8 @@ def __init__(self, config, *inputs, **kwargs): checkpoint=_CHECKPOINT_FOR_QA, output_type=TFQuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC, + qa_target_start_index=_QA_TARGET_START_INDEX, + qa_target_end_index=_QA_TARGET_END_INDEX, expected_output=_QA_EXPECTED_OUTPUT, expected_loss=_QA_EXPECTED_LOSS, ) From 3d9e6611c2e5bb755e3924339d915dc2c03c0ca3 Mon Sep 17 00:00:00 2001 From: vumichien Date: Sat, 9 Apr 2022 11:37:43 +0900 Subject: [PATCH 09/13] change checkpoint for docs and reuse defined variable --- .../models/bert/modeling_tf_bert.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/bert/modeling_tf_bert.py b/src/transformers/models/bert/modeling_tf_bert.py index b996fdd896f4..b990f2851eed 100644 --- a/src/transformers/models/bert/modeling_tf_bert.py +++ b/src/transformers/models/bert/modeling_tf_bert.py @@ -65,7 +65,7 @@ logger = logging.get_logger(__name__) -_CHECKPOINT_FOR_DOC = "bert-base-cased" +_CHECKPOINT_FOR_DOC = "bert-base-uncased" _CONFIG_FOR_DOC = "BertConfig" _TOKENIZER_FOR_DOC = "BertTokenizer" @@ -1304,8 +1304,8 @@ def get_prefix_bias_name(self) -> str: checkpoint=_CHECKPOINT_FOR_DOC, output_type=TFMaskedLMOutput, config_class=_CONFIG_FOR_DOC, - expected_output="'P a r i s'", - expected_loss=0.81, + expected_output="'p a r i s'", + expected_loss=0.88, ) def call( self, @@ -1630,8 +1630,8 @@ def __init__(self, config: BertConfig, *inputs, **kwargs): checkpoint=_CHECKPOINT_FOR_SEQ_CLASS, output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, - expected_output="'LABEL_1'", - 
expected_loss=0.01, + expected_output=_SEQ_CLASS_EXPECTED_OUTPUT, + expected_loss=_SEQ_CLASS_EXPECTED_LOSS, ) def call( self, @@ -1859,8 +1859,8 @@ def __init__(self, config: BertConfig, *inputs, **kwargs): checkpoint=_CHECKPOINT_FOR_TOKEN_CLASS, output_type=TFTokenClassifierOutput, config_class=_CONFIG_FOR_DOC, - expected_output="['O', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'I-LOC', 'I-LOC']", - expected_loss=0.01, + expected_output=_TOKEN_CLASS_EXPECTED_OUTPUT, + expected_loss=_TOKEN_CLASS_EXPECTED_LOSS, ) def call( self, @@ -1951,8 +1951,8 @@ def __init__(self, config: BertConfig, *inputs, **kwargs): checkpoint=_CHECKPOINT_FOR_QA, output_type=TFQuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC, - expected_output="'a nice puppet'", - expected_loss=7.41, + expected_output=_QA_EXPECTED_OUTPUT, + expected_loss=_QA_EXPECTED_LOSS, ) def call( self, From 8524a1ec5d26b88a3a37a9130fd3fa9199f31f34 Mon Sep 17 00:00:00 2001 From: Minh Chien Vu <31467068+vumichien@users.noreply.github.com> Date: Mon, 11 Apr 2022 21:41:31 +0900 Subject: [PATCH 10/13] Update src/transformers/models/bert/modeling_tf_bert.py Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com> --- src/transformers/models/bert/modeling_tf_bert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/bert/modeling_tf_bert.py b/src/transformers/models/bert/modeling_tf_bert.py index b990f2851eed..7eb268bd17dd 100644 --- a/src/transformers/models/bert/modeling_tf_bert.py +++ b/src/transformers/models/bert/modeling_tf_bert.py @@ -1304,7 +1304,7 @@ def get_prefix_bias_name(self) -> str: checkpoint=_CHECKPOINT_FOR_DOC, output_type=TFMaskedLMOutput, config_class=_CONFIG_FOR_DOC, - expected_output="'p a r i s'", + expected_output="'paris'", expected_loss=0.88, ) def call( From 84f09538ee895740242def0c9434c88d8e9d7da5 Mon Sep 17 00:00:00 2001 From: Minh Chien Vu <31467068+vumichien@users.noreply.github.com> Date: Mon, 11 Apr 2022 21:44:00 +0900 Subject: [PATCH 11/13] Apply suggestions from code review Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com> --- src/transformers/models/bert/modeling_bert.py | 4 ++-- src/transformers/models/bert/modeling_tf_bert.py | 8 ++++---- .../models/mobilebert/modeling_mobilebert.py | 8 ++++---- .../models/mobilebert/modeling_tf_mobilebert.py | 10 +++++----- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index 749f56e023e7..ec54eff3162f 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -1523,7 +1523,7 @@ def __init__(self, config): @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_SEQ_CLASS, + checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION , output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, expected_output=_SEQ_CLASS_EXPECTED_OUTPUT, @@ -1724,7 +1724,7 @@ def __init__(self, config): @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_TOKEN_CLASS, + checkpoint=_CHECKPOINT_FOR_TOKEN_CLASSIFICATION , output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC, expected_output=_TOKEN_CLASS_EXPECTED_OUTPUT, diff --git 
a/src/transformers/models/bert/modeling_tf_bert.py b/src/transformers/models/bert/modeling_tf_bert.py index 7eb268bd17dd..b64e203253a3 100644 --- a/src/transformers/models/bert/modeling_tf_bert.py +++ b/src/transformers/models/bert/modeling_tf_bert.py @@ -70,7 +70,7 @@ _TOKENIZER_FOR_DOC = "BertTokenizer" # TokenClassification docstring -_CHECKPOINT_FOR_TOKEN_CLASS = "dbmdz/bert-large-cased-finetuned-conll03-english" +_CHECKPOINT_FOR_TOKEN_CLASSIFICATION = "dbmdz/bert-large-cased-finetuned-conll03-english" _TOKEN_CLASS_EXPECTED_OUTPUT = ( "['O', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'I-LOC', " "'I-LOC'] " ) @@ -84,7 +84,7 @@ _QA_TARGET_END_INDEX = 15 # SequenceClassification docstring -_CHECKPOINT_FOR_SEQ_CLASS = "ydshieh/bert-base-uncased-yelp-polarity" +_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION = "ydshieh/bert-base-uncased-yelp-polarity" _SEQ_CLASS_EXPECTED_OUTPUT = "'LABEL_1'" _SEQ_CLASS_EXPECTED_LOSS = 0.01 @@ -1627,7 +1627,7 @@ def __init__(self, config: BertConfig, *inputs, **kwargs): @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_SEQ_CLASS, + checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION, output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, expected_output=_SEQ_CLASS_EXPECTED_OUTPUT, @@ -1856,7 +1856,7 @@ def __init__(self, config: BertConfig, *inputs, **kwargs): @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_TOKEN_CLASS, + checkpoint=_CHECKPOINT_FOR_TOKEN_CLASSIFICATION, output_type=TFTokenClassifierOutput, config_class=_CONFIG_FOR_DOC, expected_output=_TOKEN_CLASS_EXPECTED_OUTPUT, diff --git a/src/transformers/models/mobilebert/modeling_mobilebert.py b/src/transformers/models/mobilebert/modeling_mobilebert.py index 3f2587a78e4c..4eb1fbf69d44 100644 --- a/src/transformers/models/mobilebert/modeling_mobilebert.py +++ b/src/transformers/models/mobilebert/modeling_mobilebert.py @@ -60,7 +60,7 @@ _TOKENIZER_FOR_DOC = "MobileBertTokenizer" # TokenClassification docstring -_CHECKPOINT_FOR_TOKEN_CLASS = "mrm8488/mobilebert-finetuned-ner" +_CHECKPOINT_FOR_TOKEN_CLASSIFICATION = "mrm8488/mobilebert-finetuned-ner" _TOKEN_CLASS_EXPECTED_OUTPUT = "['I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'I-LOC', 'I-LOC']" _TOKEN_CLASS_EXPECTED_LOSS = 0.03 @@ -72,7 +72,7 @@ _QA_TARGET_END_INDEX = 13 # SequenceClassification docstring -_CHECKPOINT_FOR_SEQ_CLASS = "lordtt13/emo-mobilebert" +_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION = "lordtt13/emo-mobilebert" _SEQ_CLASS_EXPECTED_OUTPUT = "'others'" _SEQ_CLASS_EXPECTED_LOSS = "4.72" @@ -1247,7 +1247,7 @@ def __init__(self, config): @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_SEQ_CLASS, + checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION, output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, expected_output=_SEQ_CLASS_EXPECTED_OUTPUT, @@ -1560,7 +1560,7 @@ def __init__(self, config): @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_TOKEN_CLASS, + checkpoint=_CHECKPOINT_FOR_TOKEN_CLASSIFICATION, 
output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC, expected_output=_TOKEN_CLASS_EXPECTED_OUTPUT, diff --git a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py index da0a7e21a65b..2e2282cce6bc 100644 --- a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py +++ b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py @@ -64,7 +64,7 @@ _TOKENIZER_FOR_DOC = "MobileBertTokenizer" # TokenClassification docstring -_CHECKPOINT_FOR_TOKEN_CLASS = "vumichien/mobilebert-finetuned-ner" +_CHECKPOINT_FOR_TOKEN_CLASSIFICATION = "vumichien/mobilebert-finetuned-ner" _TOKEN_CLASS_EXPECTED_OUTPUT = "['I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'I-LOC', 'I-LOC']" _TOKEN_CLASS_EXPECTED_LOSS = 0.03 @@ -76,7 +76,7 @@ _QA_TARGET_END_INDEX = 13 # SequenceClassification docstring -_CHECKPOINT_FOR_SEQ_CLASS = "vumichien/emo-mobilebert" +_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION = "vumichien/emo-mobilebert" _SEQ_CLASS_EXPECTED_OUTPUT = "'others'" _SEQ_CLASS_EXPECTED_LOSS = "4.72" @@ -1092,7 +1092,7 @@ def get_prefix_bias_name(self): checkpoint=_CHECKPOINT_FOR_DOC, output_type=TFMaskedLMOutput, config_class=_CONFIG_FOR_DOC, - expected_output="'p a r i s'", + expected_output="'paris'", expected_loss=0.57, ) def call( @@ -1284,7 +1284,7 @@ def __init__(self, config, *inputs, **kwargs): @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_SEQ_CLASS, + checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION, output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, expected_output=_SEQ_CLASS_EXPECTED_OUTPUT, @@ -1626,7 +1626,7 @@ def __init__(self, config, *inputs, **kwargs): @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_TOKEN_CLASS, + checkpoint=_CHECKPOINT_FOR_TOKEN_CLASSIFICATION, output_type=TFTokenClassifierOutput, config_class=_CONFIG_FOR_DOC, expected_output=_TOKEN_CLASS_EXPECTED_OUTPUT, From 426301929f394e67522758dfb5db2cdec5908db3 Mon Sep 17 00:00:00 2001 From: Minh Chien Vu <31467068+vumichien@users.noreply.github.com> Date: Mon, 11 Apr 2022 21:44:43 +0900 Subject: [PATCH 12/13] Apply suggestions from code review Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com> --- src/transformers/models/bert/modeling_bert.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index ec54eff3162f..254b69c5c66b 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -64,7 +64,7 @@ _TOKENIZER_FOR_DOC = "BertTokenizer" # TokenClassification docstring -_CHECKPOINT_FOR_TOKEN_CLASS = "dbmdz/bert-large-cased-finetuned-conll03-english" +_CHECKPOINT_FOR_TOKEN_CLASSIFICATION = "dbmdz/bert-large-cased-finetuned-conll03-english" _TOKEN_CLASS_EXPECTED_OUTPUT = ( "['O', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'I-LOC', " "'I-LOC'] " ) @@ -78,7 +78,7 @@ _QA_TARGET_END_INDEX = 15 # SequenceClassification docstring -_CHECKPOINT_FOR_SEQ_CLASS = "textattack/bert-base-uncased-yelp-polarity" +_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION = "textattack/bert-base-uncased-yelp-polarity" _SEQ_CLASS_EXPECTED_OUTPUT = "'LABEL_1'" 
_SEQ_CLASS_EXPECTED_LOSS = 0.01 From f8ea70cf31263d4847c37e1b2e0e07d97d262efb Mon Sep 17 00:00:00 2001 From: vumichien Date: Mon, 11 Apr 2022 22:23:58 +0900 Subject: [PATCH 13/13] make fixup --- src/transformers/models/bert/modeling_bert.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index 254b69c5c66b..20b65d0c0657 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -1523,7 +1523,7 @@ def __init__(self, config): @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION , + checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION, output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, expected_output=_SEQ_CLASS_EXPECTED_OUTPUT, @@ -1724,7 +1724,7 @@ def __init__(self, config): @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_TOKEN_CLASSIFICATION , + checkpoint=_CHECKPOINT_FOR_TOKEN_CLASSIFICATION, output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC, expected_output=_TOKEN_CLASS_EXPECTED_OUTPUT,
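Taken as a whole, the series replaces hand-written usage snippets with `add_code_sample_docstrings` arguments (`checkpoint`, `expected_output`, `expected_loss`, and for QA the target start/end indices) so that the generated examples double as doctests pinned to concrete checkpoints. The sketch below only illustrates the kind of check those values imply for the masked-LM case ('paris', loss ≈ 0.88, assuming the `bert-base-uncased` checkpoint); the prompt sentence, the label masking, and the rounding are assumptions of this sketch rather than text from the patch, and the real examples are produced by the library's docstring templates.

```python
# Minimal sketch of a manual check for the masked-LM doctest values wired in by
# this series (expected_output="'paris'", expected_loss=0.88). Assumes the
# bert-base-uncased checkpoint; not the exact code generated by
# add_code_sample_docstrings.
import torch
from transformers import BertForMaskedLM, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased")

inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# Decode the highest-scoring token at the [MASK] position.
mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
predicted_token_id = logits[0, mask_token_index].argmax(dim=-1)
print(tokenizer.decode(predicted_token_id))  # expected: paris

# Loss computed against labels that supervise only the masked position.
labels = tokenizer("The capital of France is Paris.", return_tensors="pt").input_ids
labels = torch.where(inputs.input_ids == tokenizer.mask_token_id, labels, -100)
outputs = model(**inputs, labels=labels)
print(round(outputs.loss.item(), 2))  # expected to be close to 0.88
```

The other expected values added in these patches (for example the NER tag sequence or the "'a nice puppet'" QA span) would be spot-checked the same way against their respective fine-tuned checkpoints.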