From 1f14e2592d1bc628853f9578303b42a8aa4c116a Mon Sep 17 00:00:00 2001
From: Karim Foda
Date: Sun, 27 Mar 2022 17:25:47 +0100
Subject: [PATCH 1/9] Add initial docstring changes

---
 .../models/longformer/modeling_longformer.py | 51 ++++++++++++-------
 utils/documentation_tests.txt                |  1 +
 2 files changed, 33 insertions(+), 19 deletions(-)

diff --git a/src/transformers/models/longformer/modeling_longformer.py b/src/transformers/models/longformer/modeling_longformer.py
index a2a4e94414a2..8d8ac292c8ed 100755
--- a/src/transformers/models/longformer/modeling_longformer.py
+++ b/src/transformers/models/longformer/modeling_longformer.py
@@ -1407,6 +1407,28 @@ def _set_gradient_checkpointing(self, module, value=False):
         module.gradient_checkpointing = value


+LONGFORMER_GENERATION_DOCSTRING = r"""
+    Mask filling example:
+
+    ```python
+    >>> from transformers import LongformerTokenizer, LongformerForMaskedLM
+
+    >>> tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
+    >>> model = LongformerForMaskedLM.from_pretrained("allenai/longformer-base-4096")
+
+    >>> TXT = "My friends are <mask> but they eat too many carbs." + " That's why I decide not to eat with them."*300
+    >>> input_ids = tokenizer([TXT], return_tensors="pt")["input_ids"]
+    >>> logits = model(input_ids).logits
+
+    >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
+    >>> probs = logits[0, masked_index].softmax(dim=0)
+    >>> values, predictions = probs.topk(5)
+
+    >>> tokenizer.decode(predictions).split()
+    ['healthy', 'skinny', 'thin', 'good', 'vegetarian']
+    ```
+"""
+
 LONGFORMER_START_DOCSTRING = r"""

     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
@@ -1636,20 +1658,9 @@ def forward(
         >>> SAMPLE_TEXT = " ".join(["Hello world! "] * 1000)  # long input document
         >>> input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0)  # batch of size 1

-        >>> attention_mask = torch.ones(
-        ...     input_ids.shape, dtype=torch.long, device=input_ids.device
-        >>> )  # initialize to local attention
-        >>> global_attention_mask = torch.zeros(
-        ...     input_ids.shape, dtype=torch.long, device=input_ids.device
-        >>> )  # initialize to global attention to be deactivated for all tokens
-        >>> global_attention_mask[
-        ...     :,
-        ...     [
-        ...         1,
-        ...         4,
-        ...         21,
-        ...     ],
-        >>> ] = 1  # Set global attention to random tokens for the sake of this example
+        >>> attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to local attention
+        >>> global_attention_mask = torch.zeros(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to global attention to be deactivated for all tokens
+        >>> global_attention_mask[:,[1,4,21,],] = 1 # Set global attention to random tokens for the sake of this example
         >>> # Usually, set global attention based on the task. 
For example, >>> # classification: the token >>> # QA: question tokens @@ -1852,9 +1863,11 @@ def __init__(self, config): @add_start_docstrings_to_model_forward(LONGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, + checkpoint="jpelhaw/longformer-base-plagiarism-detection", output_type=LongformerSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, + expected_output=[1,2], + expected_loss=0.08 ) def forward( self, @@ -2027,9 +2040,7 @@ def forward( >>> all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist()) >>> answer_tokens = all_tokens[torch.argmax(start_logits) : torch.argmax(end_logits) + 1] - >>> answer = tokenizer.decode( - ... tokenizer.convert_tokens_to_ids(answer_tokens) - >>> ) # remove space prepending space token + >>> answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens)) # remove space prepending space token ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -2118,9 +2129,11 @@ def __init__(self, config): @add_start_docstrings_to_model_forward(LONGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, + checkpoint="brad1141/Longformer-finetuned-norm", output_type=LongformerTokenClassifierOutput, config_class=_CONFIG_FOR_DOC, + expected_output="['Lead', 'Evidence', 'Lead', 'Evidence', 'Lead']", + expected_loss=0.01, ) def forward( self, diff --git a/utils/documentation_tests.txt b/utils/documentation_tests.txt index b8632f29f9f2..daf30224dece 100644 --- a/utils/documentation_tests.txt +++ b/utils/documentation_tests.txt @@ -10,6 +10,7 @@ src/transformers/models/convnext/modeling_convnext.py src/transformers/models/data2vec/modeling_data2vec_audio.py src/transformers/models/deit/modeling_deit.py src/transformers/models/hubert/modeling_hubert.py +src/transformers/models/longformer/modeling_longformer.py src/transformers/models/marian/modeling_marian.py src/transformers/models/mbart/modeling_mbart.py src/transformers/models/pegasus/modeling_pegasus.py From 996d0329214f012230018de0556680dc0a9c5bf1 Mon Sep 17 00:00:00 2001 From: Karim Foda Date: Sun, 27 Mar 2022 17:40:03 +0100 Subject: [PATCH 2/9] make fixup --- .../models/longformer/modeling_longformer.py | 27 ++++++++++++++----- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/longformer/modeling_longformer.py b/src/transformers/models/longformer/modeling_longformer.py index 8d8ac292c8ed..b922f63f9830 100755 --- a/src/transformers/models/longformer/modeling_longformer.py +++ b/src/transformers/models/longformer/modeling_longformer.py @@ -1416,7 +1416,7 @@ def _set_gradient_checkpointing(self, module, value=False): >>> tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096") >>> model = LongformerForMaskedLM.from_pretrained("allenai/longformer-base-4096") - >>> TXT = "My friends are but they eat too many carbs." + " That's why I decide not to eat with them."*300 + >>> TXT = "My friends are but they eat too many carbs." + " That's why I decide not to eat with them." * 300 >>> input_ids = tokenizer([TXT], return_tensors="pt")["input_ids"] >>> logits = model(input_ids).logits @@ -1658,9 +1658,20 @@ def forward( >>> SAMPLE_TEXT = " ".join(["Hello world! 
"] * 1000) # long input document >>> input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0) # batch of size 1 - >>> attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to local attention - >>> global_attention_mask = torch.zeros(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to global attention to be deactivated for all tokens - >>> global_attention_mask[:,[1,4,21,],] = 1 # Set global attention to random tokens for the sake of this example + >>> attention_mask = torch.ones( + ... input_ids.shape, dtype=torch.long, device=input_ids.device + >>> ) # initialize to local attention + >>> global_attention_mask = torch.zeros( + ... input_ids.shape, dtype=torch.long, device=input_ids.device + >>> ) # initialize to global attention to be deactivated for all tokens + >>> global_attention_mask[ + ... :, + ... [ + ... 1, + ... 4, + ... 21, + ... ], + >>> ] = 1 # Set global attention to random tokens for the sake of this example >>> # Usually, set global attention based on the task. For example, >>> # classification: the token >>> # QA: question tokens @@ -1866,8 +1877,8 @@ def __init__(self, config): checkpoint="jpelhaw/longformer-base-plagiarism-detection", output_type=LongformerSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, - expected_output=[1,2], - expected_loss=0.08 + expected_output=[1, 2], + expected_loss=0.08, ) def forward( self, @@ -2040,7 +2051,9 @@ def forward( >>> all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist()) >>> answer_tokens = all_tokens[torch.argmax(start_logits) : torch.argmax(end_logits) + 1] - >>> answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens)) # remove space prepending space token + >>> answer = tokenizer.decode( + ... 
tokenizer.convert_tokens_to_ids(answer_tokens) + >>> ) # remove space prepending space token ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict From c4b9a25f95f747e6e4b7cd29d80eec0fd4dc8fdc Mon Sep 17 00:00:00 2001 From: Karim Foda Date: Thu, 31 Mar 2022 15:15:49 +0100 Subject: [PATCH 3/9] Add TF doc changes --- .../models/longformer/modeling_tf_longformer.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/longformer/modeling_tf_longformer.py b/src/transformers/models/longformer/modeling_tf_longformer.py index 762f872ee709..a214b562dcca 100644 --- a/src/transformers/models/longformer/modeling_tf_longformer.py +++ b/src/transformers/models/longformer/modeling_tf_longformer.py @@ -2081,10 +2081,12 @@ def get_prefix_bias_name(self): @add_start_docstrings_to_model_forward(LONGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, + checkpoint="saibo/legal-longformer-base-4096", output_type=TFLongformerMaskedLMOutput, config_class=_CONFIG_FOR_DOC, mask="", + expected_output="' no'", + expected_loss=4.5, ) def call( self, @@ -2178,6 +2180,8 @@ def __init__(self, config, *inputs, **kwargs): checkpoint="allenai/longformer-large-4096-finetuned-triviaqa", output_type=TFLongformerQuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC, + expected_output="' puppet'", + expected_loss=0.96, ) def call( self, @@ -2325,6 +2329,8 @@ def __init__(self, config, *inputs, **kwargs): checkpoint=_CHECKPOINT_FOR_DOC, output_type=TFLongformerSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, + expected_output="LABEL_0", + expected_loss=0.58, ) def call( self, @@ -2565,6 +2571,8 @@ def __init__(self, config, *inputs, **kwargs): checkpoint=_CHECKPOINT_FOR_DOC, output_type=TFLongformerTokenClassifierOutput, config_class=_CONFIG_FOR_DOC, + expected_output="['LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_1', 'LABEL_1', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0']", + expected_loss=0.62, ) def call( self, From 9c811f7b057bb8d377e0e269cf39f7efac5bd9af Mon Sep 17 00:00:00 2001 From: Karim Foda Date: Tue, 3 May 2022 09:11:01 +0200 Subject: [PATCH 4/9] fix seq classifier output --- .../models/longformer/modeling_longformer.py | 52 ++++++------------- utils/documentation_tests.txt | 1 + 2 files changed, 17 insertions(+), 36 deletions(-) diff --git a/src/transformers/models/longformer/modeling_longformer.py b/src/transformers/models/longformer/modeling_longformer.py index 95a417a3b87e..194cfc66ca86 100755 --- a/src/transformers/models/longformer/modeling_longformer.py +++ b/src/transformers/models/longformer/modeling_longformer.py @@ -1407,28 +1407,6 @@ def _set_gradient_checkpointing(self, module, value=False): module.gradient_checkpointing = value -LONGFORMER_GENERATION_DOCSTRING = r""" - Mask filling example: - - ```python - >>> from transformers import LongformerTokenizer, LongformerForMaskedLM - - >>> tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096") - >>> model = LongformerForMaskedLM.from_pretrained("allenai/longformer-base-4096") - - >>> TXT = "My friends are but they eat too many carbs." + " That's why I decide not to eat with them." 
* 300 - >>> input_ids = tokenizer([TXT], return_tensors="pt")["input_ids"] - >>> logits = model(input_ids).logits - - >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() - >>> probs = logits[0, masked_index].softmax(dim=0) - >>> values, predictions = probs.topk(5) - - >>> tokenizer.decode(predictions).split() - ['healthy', 'skinny', 'thin', 'good', 'vegetarian'] - ``` -""" - LONGFORMER_START_DOCSTRING = r""" This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the @@ -1796,23 +1774,25 @@ def forward( Returns: - Examples: + Mask filling example: ```python - >>> import torch - >>> from transformers import LongformerForMaskedLM, LongformerTokenizer + >>> from transformers import LongformerTokenizer, LongformerForMaskedLM - >>> model = LongformerForMaskedLM.from_pretrained("allenai/longformer-base-4096") >>> tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096") - - >>> SAMPLE_TEXT = " ".join(["Hello world! "] * 1000) # long input document - >>> input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0) # batch of size 1 - - >>> attention_mask = None # default is local attention everywhere, which is a good choice for MaskedLM - >>> # check `LongformerModel.forward` for more details how to set *attention_mask* - >>> outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids) - >>> loss = outputs.loss - >>> prediction_logits = outputs.logits + >>> model = LongformerForMaskedLM.from_pretrained("allenai/longformer-base-4096") + + # Let's try a very long input. + >>> TXT = "My friends are but they eat too many carbs." + " That's why I decide not to eat with them." * 300 + >>> input_ids = tokenizer([TXT], return_tensors="pt")["input_ids"] + >>> logits = model(input_ids).logits + + >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() + >>> probs = logits[0, masked_index].softmax(dim=0) + >>> values, predictions = probs.topk(5) + + >>> tokenizer.decode(predictions).split() + ['healthy', 'skinny', 'thin', 'good', 'vegetarian'] ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1877,7 +1857,7 @@ def __init__(self, config): checkpoint="jpelhaw/longformer-base-plagiarism-detection", output_type=LongformerSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, - expected_output=[1, 2], + expected_output='ORIGINAL', expected_loss=0.08, ) def forward( diff --git a/utils/documentation_tests.txt b/utils/documentation_tests.txt index c1059d45168f..eca783ceb348 100644 --- a/utils/documentation_tests.txt +++ b/utils/documentation_tests.txt @@ -19,6 +19,7 @@ src/transformers/models/deit/modeling_deit.py src/transformers/models/glpn/modeling_glpn.py src/transformers/models/hubert/modeling_hubert.py src/transformers/models/longformer/modeling_longformer.py +src/transformers/models/longformer/modeling_tf_longformer.py src/transformers/models/marian/modeling_marian.py src/transformers/models/marian/modeling_marian.py src/transformers/models/mbart/modeling_mbart.py From 34809da689aded2fb5d1649d70798694ca15bd06 Mon Sep 17 00:00:00 2001 From: Karim Foda Date: Tue, 3 May 2022 10:25:31 +0200 Subject: [PATCH 5/9] fix quality errors --- src/transformers/models/longformer/modeling_longformer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/longformer/modeling_longformer.py b/src/transformers/models/longformer/modeling_longformer.py index 20868e5931c2..5b71bfe15ce2 100755 --- 
a/src/transformers/models/longformer/modeling_longformer.py +++ b/src/transformers/models/longformer/modeling_longformer.py @@ -1777,7 +1777,7 @@ def forward( >>> tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096") >>> model = LongformerForMaskedLM.from_pretrained("allenai/longformer-base-4096") - + # Let's try a very long input. >>> TXT = "My friends are but they eat too many carbs." + " That's why I decide not to eat with them." * 300 >>> input_ids = tokenizer([TXT], return_tensors="pt")["input_ids"] @@ -1853,7 +1853,7 @@ def __init__(self, config): checkpoint="jpelhaw/longformer-base-plagiarism-detection", output_type=LongformerSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, - expected_output='ORIGINAL', + expected_output="ORIGINAL", expected_loss=0.08, ) def forward( From 2da3790691856d2670018dc893a135ce8a614044 Mon Sep 17 00:00:00 2001 From: Karim Foda Date: Tue, 3 May 2022 20:41:36 +0200 Subject: [PATCH 6/9] t --- src/transformers/models/longformer/modeling_longformer.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/longformer/modeling_longformer.py b/src/transformers/models/longformer/modeling_longformer.py index 5b71bfe15ce2..241321b048a7 100755 --- a/src/transformers/models/longformer/modeling_longformer.py +++ b/src/transformers/models/longformer/modeling_longformer.py @@ -1777,9 +1777,12 @@ def forward( >>> tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096") >>> model = LongformerForMaskedLM.from_pretrained("allenai/longformer-base-4096") - # Let's try a very long input. - >>> TXT = "My friends are but they eat too many carbs." + " That's why I decide not to eat with them." * 300 + + >>> TXT = ( + ... "My friends are but they eat too many carbs." + ... + " That's why I decide not to eat with them." * 300 + ... 
) >>> input_ids = tokenizer([TXT], return_tensors="pt")["input_ids"] >>> logits = model(input_ids).logits From ca8f6dcc61771ab9f86aabd9f8f9ac5e9c3ff9df Mon Sep 17 00:00:00 2001 From: Karim Foda Date: Wed, 11 May 2022 16:39:58 -0700 Subject: [PATCH 7/9] swithc head to random init --- .../models/longformer/modeling_tf_longformer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/longformer/modeling_tf_longformer.py b/src/transformers/models/longformer/modeling_tf_longformer.py index d6141b0e7c10..031b837c74d3 100644 --- a/src/transformers/models/longformer/modeling_tf_longformer.py +++ b/src/transformers/models/longformer/modeling_tf_longformer.py @@ -2079,7 +2079,7 @@ def get_prefix_bias_name(self): @add_start_docstrings_to_model_forward(LONGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint="saibo/legal-longformer-base-4096", + checkpoint="allenai/longformer-base-4096", output_type=TFLongformerMaskedLMOutput, config_class=_CONFIG_FOR_DOC, mask="", @@ -2322,7 +2322,7 @@ def __init__(self, config, *inputs, **kwargs): @add_start_docstrings_to_model_forward(LONGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, + checkpoint="hf-internal-testing/tiny-random-longformer", output_type=TFLongformerSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, expected_output="LABEL_0", @@ -2562,7 +2562,7 @@ def __init__(self, config, *inputs, **kwargs): @add_start_docstrings_to_model_forward(LONGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, + checkpoint="hf-internal-testing/tiny-random-longformer", output_type=TFLongformerTokenClassifierOutput, config_class=_CONFIG_FOR_DOC, expected_output="['LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_1', 'LABEL_1', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0']", From 2858d815001a33ab82a574f9952d7ec2404df983 Mon Sep 17 00:00:00 2001 From: Karim Foda Date: Fri, 13 May 2022 12:59:42 -0700 Subject: [PATCH 8/9] Fix expected outputs --- .../models/longformer/modeling_longformer.py | 11 ++++++----- .../models/longformer/modeling_tf_longformer.py | 12 ++++++------ 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/longformer/modeling_longformer.py b/src/transformers/models/longformer/modeling_longformer.py index bbacdee99083..109e1b24064c 100755 --- a/src/transformers/models/longformer/modeling_longformer.py +++ b/src/transformers/models/longformer/modeling_longformer.py @@ -1777,7 +1777,8 @@ def forward( >>> tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096") >>> model = LongformerForMaskedLM.from_pretrained("allenai/longformer-base-4096") - # Let's try a very long input. + + Let's try a very long input. >>> TXT = ( ... "My friends are but they eat too many carbs." 
@@ -1856,8 +1857,8 @@ def __init__(self, config): checkpoint="jpelhaw/longformer-base-plagiarism-detection", output_type=LongformerSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, - expected_output="ORIGINAL", - expected_loss=0.08, + expected_output="'ORIGINAL'", + expected_loss=5.44, ) def forward( self, @@ -2124,8 +2125,8 @@ def __init__(self, config): checkpoint="brad1141/Longformer-finetuned-norm", output_type=LongformerTokenClassifierOutput, config_class=_CONFIG_FOR_DOC, - expected_output="['Lead', 'Evidence', 'Lead', 'Evidence', 'Lead']", - expected_loss=0.01, + expected_output="['Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence']", + expected_loss=0.63, ) def forward( self, diff --git a/src/transformers/models/longformer/modeling_tf_longformer.py b/src/transformers/models/longformer/modeling_tf_longformer.py index 031b837c74d3..8f654ea84b18 100644 --- a/src/transformers/models/longformer/modeling_tf_longformer.py +++ b/src/transformers/models/longformer/modeling_tf_longformer.py @@ -2083,8 +2083,8 @@ def get_prefix_bias_name(self): output_type=TFLongformerMaskedLMOutput, config_class=_CONFIG_FOR_DOC, mask="", - expected_output="' no'", - expected_loss=4.5, + expected_output="' Paris'", + expected_loss=0.44, ) def call( self, @@ -2325,8 +2325,8 @@ def __init__(self, config, *inputs, **kwargs): checkpoint="hf-internal-testing/tiny-random-longformer", output_type=TFLongformerSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, - expected_output="LABEL_0", - expected_loss=0.58, + expected_output="'LABEL_1'", + expected_loss=0.69, ) def call( self, @@ -2565,8 +2565,8 @@ def __init__(self, config, *inputs, **kwargs): checkpoint="hf-internal-testing/tiny-random-longformer", output_type=TFLongformerTokenClassifierOutput, config_class=_CONFIG_FOR_DOC, - expected_output="['LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_1', 'LABEL_1', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0']", - expected_loss=0.62, + expected_output="['LABEL_1', 'LABEL_1', 'LABEL_1', 'LABEL_1', 'LABEL_1', 'LABEL_1', 'LABEL_1', 'LABEL_1', 'LABEL_1', 'LABEL_1', 'LABEL_1', 'LABEL_1', 'LABEL_1', 'LABEL_1', 'LABEL_1', 'LABEL_1', 'LABEL_1', 'LABEL_1', 'LABEL_1', 'LABEL_1']", + expected_loss=0.59, ) def call( self, From f5f504039c8e6000834d93b465614ee014681a04 Mon Sep 17 00:00:00 2001 From: Karim Foda <35491698+KMFODA@users.noreply.github.com> Date: Mon, 16 May 2022 17:09:09 -0700 Subject: [PATCH 9/9] Update src/transformers/models/longformer/modeling_longformer.py Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com> --- src/transformers/models/longformer/modeling_longformer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/longformer/modeling_longformer.py b/src/transformers/models/longformer/modeling_longformer.py index 109e1b24064c..c35cf318c437 100755 --- a/src/transformers/models/longformer/modeling_longformer.py +++ b/src/transformers/models/longformer/modeling_longformer.py @@ -1777,9 +1777,11 @@ def forward( >>> tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096") >>> model = LongformerForMaskedLM.from_pretrained("allenai/longformer-base-4096") - + ``` + Let's try a very long input. + ```python >>> TXT = ( ... "My friends are but they eat too many carbs." ... + " That's why I decide not to eat with them." * 300
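
As an aside for reviewers: the `expected_output` and `expected_loss` values pinned throughout this series are meant to match what the doctest examples produce when run against the named Hub checkpoints. Below is a minimal sketch of that kind of local spot check; it is not part of the patch, it assumes the `jpelhaw/longformer-base-plagiarism-detection` checkpoint referenced above is reachable, and the input sentence is an arbitrary placeholder rather than the text used by the doctest templates.

```python
# Illustrative sketch only (not part of the patch): spot-check one of the
# pinned checkpoints by hand. The input sentence is arbitrary, so the printed
# label is only an example of the kind of value pinned in expected_output.
import torch
from transformers import LongformerForSequenceClassification, LongformerTokenizer

ckpt = "jpelhaw/longformer-base-plagiarism-detection"  # checkpoint referenced in this series
tokenizer = LongformerTokenizer.from_pretrained(ckpt)
model = LongformerForSequenceClassification.from_pretrained(ckpt)

inputs = tokenizer("The quick brown fox jumps over the lazy dog.", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

predicted_class_id = int(logits.argmax(dim=-1))
print(model.config.id2label[predicted_class_id])  # e.g. 'ORIGINAL' or 'PLAGIARISM' for this head
```

The same pattern, swapping in `LongformerForTokenClassification` or the corresponding TF classes, applies to the other checkpoints pinned in this series.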