From 0d9e46508f6173a4ad96a64ca662415fde19bdcb Mon Sep 17 00:00:00 2001 From: Yam0214 Date: Mon, 26 Sep 2022 10:32:19 +0000 Subject: [PATCH 01/10] complete t5 more output --- paddlenlp/transformers/model_outputs.py | 120 ++++++++++++++++++++++ paddlenlp/transformers/t5/modeling.py | 128 +++++++++++++++++++----- 2 files changed, 221 insertions(+), 27 deletions(-) diff --git a/paddlenlp/transformers/model_outputs.py b/paddlenlp/transformers/model_outputs.py index 528777d10e3a..6cf8a33f9a5f 100644 --- a/paddlenlp/transformers/model_outputs.py +++ b/paddlenlp/transformers/model_outputs.py @@ -724,3 +724,123 @@ class CausalLMOutputWithCrossAttentions(ModelOutput): hidden_states: Optional[Tuple[paddle.Tensor]] = None attentions: Optional[Tuple[paddle.Tensor]] = None cross_attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class Seq2SeqModelOutput(ModelOutput): + """ + Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential + decoding. + + Args: + last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + + If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, + hidden_size)` is output. + past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs. + decoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. 
+ encoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs. + encoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + last_hidden_state: paddle.Tensor = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + decoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None + decoder_attentions: Optional[Tuple[paddle.Tensor]] = None + cross_attentions: Optional[Tuple[paddle.Tensor]] = None + encoder_last_hidden_state: Optional[paddle.Tensor] = None + encoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None + encoder_attentions: Optional[Tuple[paddle.Tensor]] = None + +@dataclass +class Seq2SeqLMOutput(ModelOutput): + """ + Base class for sequence-to-sequence language models outputs. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss. + logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. 
+ + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + decoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None + decoder_attentions: Optional[Tuple[paddle.Tensor]] = None + cross_attentions: Optional[Tuple[paddle.Tensor]] = None + encoder_last_hidden_state: Optional[paddle.Tensor] = None + encoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None + encoder_attentions: Optional[Tuple[paddle.Tensor]] = None diff --git a/paddlenlp/transformers/t5/modeling.py b/paddlenlp/transformers/t5/modeling.py index db228d4cedd8..e44846c39b9f 100644 --- a/paddlenlp/transformers/t5/modeling.py +++ b/paddlenlp/transformers/t5/modeling.py @@ -26,6 +26,14 @@ from ..model_utils import PretrainedModel, register_base_model from ..nezha.modeling import ACT2FN +from ..model_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + Seq2SeqModelOutput, + Seq2SeqLMOutput, + BaseModelOutput, + ModelOutput, +) + __all__ = [ 'T5Model', "T5PretrainedModel", 'T5ForConditionalGeneration', @@ -944,7 +952,8 @@ def forward(self, cache=None, use_cache=False, output_attentions=False, - output_hidden_states=False): + output_hidden_states=False, + return_dict=False): assert input_ids is not None, "input_ids can not be None" input_shape = input_ids.shape input_ids = input_ids.reshape(shape=[-1, input_shape[-1]]) @@ -1051,13 +1060,22 @@ def forward(self, if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states, ) - return tuple(v for v in [ - hidden_states, - present_key_value_states, - all_hidden_states, - all_attentions, - all_cross_attentions, - ] if v is not None) + if not return_dict: + return tuple(v for v in [ + hidden_states, + present_key_value_states, + all_hidden_states, + all_attentions, + all_cross_attentions, + ] if v is not None) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=present_key_value_states, + hidden_states=all_hidden_states, + attentions=all_attentions, + cross_attentions=all_cross_attentions, + ) def get_extended_attention_mask(self, attention_mask, input_shape): if attention_mask.ndim == 3: @@ -1293,7 +1311,8 @@ def forward(self, cache=None, use_cache=True, output_attentions=False, - output_hidden_states=False): + 
output_hidden_states=False, + return_dict=False): r""" The T5Model forward method, overrides the `__call__()` special method. @@ -1343,6 +1362,11 @@ def forward(self, output_hidden_states (bool, optional): Whether or not to return the output of all hidden layers. Defaults to `False`. + return_dict (bool, optional): + Whether or not to return a class:`~paddlenlp.transformers.model_outputs.Seq2SeqModelOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.Seq2SeqModelOutput`. + Returns: tuple: Returns tuple (`last_hidden_state`, `cache`, `decoder_hidden_states`, `decoder_attentions`, @@ -1419,8 +1443,10 @@ def forward(self, input_ids=input_ids, attention_mask=attention_mask, output_attentions=output_attentions, - output_hidden_states=output_hidden_states) - + output_hidden_states=output_hidden_states, + return_dict=return_dict) + elif return_dict and not isinstance(encoder_output, ModelOutput): + encoder_output = convert_encoder_output(encoder_output) hidden_states = encoder_output[0] # Decode @@ -1432,10 +1458,23 @@ def forward(self, encoder_attention_mask=attention_mask, use_cache=use_cache, output_attentions=output_attentions, - output_hidden_states=output_hidden_states) - - return decoder_outputs + encoder_output - + output_hidden_states=output_hidden_states, + return_dict=return_dict) + + if not return_dict: + return decoder_outputs + encoder_output + + return Seq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_output.last_hidden_state, + encoder_hidden_states=encoder_output.hidden_states, + encoder_attentions=encoder_output.attentions, + ) + class T5ForConditionalGeneration(T5PretrainedModel): """ @@ -1490,7 +1529,8 @@ def forward(self, labels=None, use_cache=True, output_attentions=False, - output_hidden_states=False): + output_hidden_states=False, + return_dict=False): r""" Args: @@ -1518,6 +1558,8 @@ def forward(self, See :class:`T5Model`. output_hidden_states (bool, optional): See :class:`T5Model`. + return_dict (bool, optional): + See :class:`T5Model`. 
Returns: tuple: Returns tuple (`loss`, `logits`, `cache`, `decoder_hidden_states`, `decoder_attentions`, @@ -1581,12 +1623,13 @@ def forward(self, input_ids=input_ids, attention_mask=attention_mask, output_attentions=output_attentions, - output_hidden_states=output_hidden_states) + output_hidden_states=output_hidden_states, + return_dict=return_dict) - if isinstance(encoder_output, (tuple, list)): - hidden_states = encoder_output[0] - else: - hidden_states = encoder_output + # encoder_output could be a Tensor, tuple or ModelOutput + if isinstance(encoder_output, paddle.Tensor): + encoder_output = (encoder_output, ) + hidden_states = encoder_output[0] if labels is not None and decoder_input_ids is None: # get decoder inputs from shifting lm labels to the right @@ -1610,7 +1653,8 @@ def forward(self, encoder_attention_mask=attention_mask, use_cache=use_cache, output_attentions=output_attentions, - output_hidden_states=output_hidden_states) + output_hidden_states=output_hidden_states, + return_dict=return_dict) sequence_output = decoder_outputs[0] @@ -1630,12 +1674,28 @@ def forward(self, loss_fct = nn.CrossEntropyLoss(ignore_index=-100) loss = loss_fct(lm_logits.reshape(shape=[-1, lm_logits.shape[-1]]), labels.flatten()) + + if not return_dict: + # 元组相加 + output = (lm_logits, ) + decoder_outputs[1:] + encoder_output[0:] + + return ((loss, ) + output) if loss is not None else output + + if not isinstance(encoder_output, ModelOutput): + encoder_output = convert_encoder_output(encoder_output) + + return Seq2SeqLMOutput( + loss=loss, + logits=lm_logits, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_output.last_hidden_state, + encoder_hidden_states=encoder_output.hidden_states, + encoder_attentions=encoder_output.attentions, + ) - if not isinstance(encoder_output, (list, tuple)): - encoder_output = (encoder_output, ) - - output = (lm_logits, ) + decoder_outputs[1:] + encoder_output - return ((loss, ) + output) if loss is not None else output @staticmethod def prepare_input_ids_for_generation(bos_token_id, encoder_output=None): @@ -1817,6 +1877,7 @@ def forward( use_cache: Optional[bool] = False, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = False, ): encoder_outputs = self.encoder( input_ids=input_ids, @@ -1827,9 +1888,22 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + return_dict=return_dict ) return encoder_outputs T5EncoderModel.base_model_class = T5EncoderModel + + +def convert_encoder_output(encoder_output): + """ + Convert encoder_output which type is tuple to an instance of BaseModelOutput. 
+ args: encoder_output = (last_hidden_state, hidden_states, attentions) + """ + return BaseModelOutput( + last_hidden_state=encoder_output[0], + hidden_states=encoder_output[1] if len(encoder_output) > 1 else None, + attentions=encoder_output[2] if len(encoder_output) > 2 else None, + ) From a6d1261b9b0c0bf41b02681beb51c510b3c4c373 Mon Sep 17 00:00:00 2001 From: Yam0214 Date: Mon, 26 Sep 2022 13:05:43 +0000 Subject: [PATCH 02/10] check codestyle --- paddlenlp/transformers/t5/modeling.py | 32 +++++++++++++++------------ 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/paddlenlp/transformers/t5/modeling.py b/paddlenlp/transformers/t5/modeling.py index e44846c39b9f..554bfee151ee 100644 --- a/paddlenlp/transformers/t5/modeling.py +++ b/paddlenlp/transformers/t5/modeling.py @@ -34,7 +34,6 @@ ModelOutput, ) - __all__ = [ 'T5Model', "T5PretrainedModel", 'T5ForConditionalGeneration', 'T5EncoderModel' @@ -1075,7 +1074,7 @@ def forward(self, hidden_states=all_hidden_states, attentions=all_attentions, cross_attentions=all_cross_attentions, - ) + ) def get_extended_attention_mask(self, attention_mask, input_shape): if attention_mask.ndim == 3: @@ -1363,12 +1362,15 @@ def forward(self, Whether or not to return the output of all hidden layers. Defaults to `False`. return_dict (bool, optional): - Whether or not to return a class:`~paddlenlp.transformers.model_outputs.Seq2SeqModelOutput` if `return_dict=True`. - Otherwise it returns a tuple of tensors corresponding to ordered and - not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.Seq2SeqModelOutput`. + Whether or not to return a class:`~paddlenlp.transformers.model_outputs.Seq2SeqModelOutput`. If `False`, the output + will be a tuple of tensors. Defaults to `False`. Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.Seq2SeqModelOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.Seq2SeqModelOutput`. + tuple: Returns tuple (`last_hidden_state`, `cache`, `decoder_hidden_states`, `decoder_attentions`, `cross_attentions`, `encoder_last_hidden_state`, `encoder_hidden_states`, `encoder_attentions`) @@ -1474,7 +1476,7 @@ def forward(self, encoder_hidden_states=encoder_output.hidden_states, encoder_attentions=encoder_output.attentions, ) - + class T5ForConditionalGeneration(T5PretrainedModel): """ @@ -1562,6 +1564,10 @@ def forward(self, See :class:`T5Model`. Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.Seq2SeqLMOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.Seq2SeqLMOutput`. 
+ tuple: Returns tuple (`loss`, `logits`, `cache`, `decoder_hidden_states`, `decoder_attentions`, `cross_attentions`, `encoder_last_hidden_state`, `encoder_hidden_states`, `encoder_attentions`) @@ -1674,7 +1680,7 @@ def forward(self, loss_fct = nn.CrossEntropyLoss(ignore_index=-100) loss = loss_fct(lm_logits.reshape(shape=[-1, lm_logits.shape[-1]]), labels.flatten()) - + if not return_dict: # 元组相加 output = (lm_logits, ) + decoder_outputs[1:] + encoder_output[0:] @@ -1696,7 +1702,6 @@ def forward(self, encoder_attentions=encoder_output.attentions, ) - @staticmethod def prepare_input_ids_for_generation(bos_token_id, encoder_output=None): batch_size = 1 @@ -1888,8 +1893,7 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict - ) + return_dict=return_dict) return encoder_outputs @@ -1903,7 +1907,7 @@ def convert_encoder_output(encoder_output): args: encoder_output = (last_hidden_state, hidden_states, attentions) """ return BaseModelOutput( - last_hidden_state=encoder_output[0], - hidden_states=encoder_output[1] if len(encoder_output) > 1 else None, - attentions=encoder_output[2] if len(encoder_output) > 2 else None, - ) + last_hidden_state=encoder_output[0], + hidden_states=encoder_output[1] if len(encoder_output) > 1 else None, + attentions=encoder_output[2] if len(encoder_output) > 2 else None, + ) From 943c578a904037644d7e3c3e17bf4914185fa49e Mon Sep 17 00:00:00 2001 From: Yam0214 Date: Mon, 26 Sep 2022 13:35:39 +0000 Subject: [PATCH 03/10] modify description of function --- paddlenlp/transformers/t5/modeling.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/paddlenlp/transformers/t5/modeling.py b/paddlenlp/transformers/t5/modeling.py index 554bfee151ee..109e39a6e938 100644 --- a/paddlenlp/transformers/t5/modeling.py +++ b/paddlenlp/transformers/t5/modeling.py @@ -1904,7 +1904,11 @@ def forward( def convert_encoder_output(encoder_output): """ Convert encoder_output which type is tuple to an instance of BaseModelOutput. - args: encoder_output = (last_hidden_state, hidden_states, attentions) + + Args: + encoder_output (tuple or ModleOutput): + The output of the encoder, a tuple consists `last_hidden_state`, `hidden_states`(optional), `attentions`(optional). + The data type of `last_hidden_state` is float32 and its shape is [batch_size, sequence_length, hidden_size]. """ return BaseModelOutput( last_hidden_state=encoder_output[0], From 1146c7bfcc8c887889deabdd3ec6b291dd970339 Mon Sep 17 00:00:00 2001 From: Yam0214 Date: Mon, 26 Sep 2022 13:58:39 +0000 Subject: [PATCH 04/10] check codestyle again --- paddlenlp/transformers/t5/modeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddlenlp/transformers/t5/modeling.py b/paddlenlp/transformers/t5/modeling.py index 109e39a6e938..c961c481e961 100644 --- a/paddlenlp/transformers/t5/modeling.py +++ b/paddlenlp/transformers/t5/modeling.py @@ -1903,7 +1903,7 @@ def forward( def convert_encoder_output(encoder_output): """ - Convert encoder_output which type is tuple to an instance of BaseModelOutput. + Convert encoder_output from tuple to class:`~paddlenlp.transformers.model_outputs.Seq2SeqModelOutput`. 
Args: encoder_output (tuple or ModleOutput): From e796a1de07457b8ec2dd01372e69015988de1348 Mon Sep 17 00:00:00 2001 From: Yam0214 Date: Tue, 27 Sep 2022 09:02:44 +0000 Subject: [PATCH 05/10] rewrite the condition of converting encoder_output --- paddlenlp/transformers/t5/modeling.py | 18 ++++++++---------- tests/transformers/t5/test_modeling.py | 9 ++++++++- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/paddlenlp/transformers/t5/modeling.py b/paddlenlp/transformers/t5/modeling.py index c961c481e961..a9d12ff38e65 100644 --- a/paddlenlp/transformers/t5/modeling.py +++ b/paddlenlp/transformers/t5/modeling.py @@ -1447,7 +1447,7 @@ def forward(self, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict) - elif return_dict and not isinstance(encoder_output, ModelOutput): + elif return_dict and not isinstance(encoder_output, BaseModelOutput): encoder_output = convert_encoder_output(encoder_output) hidden_states = encoder_output[0] @@ -1631,10 +1631,12 @@ def forward(self, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict) + else: + if isinstance(encoder_output, paddle.Tensor): + encoder_output = (encoder_output, ) + if return_dict and not isinstance(encoder_output, BaseModelOutput): + encoder_output = convert_encoder_output(encoder_output) - # encoder_output could be a Tensor, tuple or ModelOutput - if isinstance(encoder_output, paddle.Tensor): - encoder_output = (encoder_output, ) hidden_states = encoder_output[0] if labels is not None and decoder_input_ids is None: @@ -1682,14 +1684,9 @@ def forward(self, labels.flatten()) if not return_dict: - # 元组相加 - output = (lm_logits, ) + decoder_outputs[1:] + encoder_output[0:] - + output = (lm_logits, ) + decoder_outputs[1:] + encoder_output return ((loss, ) + output) if loss is not None else output - if not isinstance(encoder_output, ModelOutput): - encoder_output = convert_encoder_output(encoder_output) - return Seq2SeqLMOutput( loss=loss, logits=lm_logits, @@ -1910,6 +1907,7 @@ def convert_encoder_output(encoder_output): The output of the encoder, a tuple consists `last_hidden_state`, `hidden_states`(optional), `attentions`(optional). The data type of `last_hidden_state` is float32 and its shape is [batch_size, sequence_length, hidden_size]. 
""" + # if isinstance(encoder_output, tuple) return BaseModelOutput( last_hidden_state=encoder_output[0], hidden_states=encoder_output[1] if len(encoder_output) > 1 else None, diff --git a/tests/transformers/t5/test_modeling.py b/tests/transformers/t5/test_modeling.py index d76e1705dbb0..4619850995db 100644 --- a/tests/transformers/t5/test_modeling.py +++ b/tests/transformers/t5/test_modeling.py @@ -18,6 +18,7 @@ import copy import tempfile import unittest +from parameterized import parameterized_class from tests.testing_utils import slow @@ -497,10 +498,16 @@ def prepare_config_and_inputs_for_common(self): return config, inputs_dict +@parameterized_class(("return_dict", "use_labels"), [ + [False, False], + [False, True], + [True, False], + [True, True], +]) class T5ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): base_model_class = T5Model - all_model_classes = (T5Model, T5ForConditionalGeneration, T5EncoderModel) + all_model_classes = (T5Model, T5ForConditionalGeneration) all_generative_model_classes = {T5ForConditionalGeneration: (T5Model, "t5")} all_parallelizable_model_classes = (T5Model, T5ForConditionalGeneration, T5EncoderModel) From b9cbe0ff5cde943d15fdaf7952d501b39cc865ac Mon Sep 17 00:00:00 2001 From: Yam0214 Date: Tue, 27 Sep 2022 09:35:50 +0000 Subject: [PATCH 06/10] check codestyle --- paddlenlp/transformers/model_outputs.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddlenlp/transformers/model_outputs.py b/paddlenlp/transformers/model_outputs.py index 6cf8a33f9a5f..cb99161b2ac8 100644 --- a/paddlenlp/transformers/model_outputs.py +++ b/paddlenlp/transformers/model_outputs.py @@ -784,7 +784,8 @@ class Seq2SeqModelOutput(ModelOutput): cross_attentions: Optional[Tuple[paddle.Tensor]] = None encoder_last_hidden_state: Optional[paddle.Tensor] = None encoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None - encoder_attentions: Optional[Tuple[paddle.Tensor]] = None + encoder_attentions: Optional[Tuple[paddle.Tensor]] = None + @dataclass class Seq2SeqLMOutput(ModelOutput): From cb6f77b9c05d9c1d00e3ad564b9f4e19c6e605b6 Mon Sep 17 00:00:00 2001 From: Yam0214 Date: Wed, 28 Sep 2022 09:32:28 +0000 Subject: [PATCH 07/10] modift the documents of two classes --- paddlenlp/transformers/model_outputs.py | 53 +++++++++++++++---------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/paddlenlp/transformers/model_outputs.py b/paddlenlp/transformers/model_outputs.py index cb99161b2ac8..64e95dcb3c3b 100644 --- a/paddlenlp/transformers/model_outputs.py +++ b/paddlenlp/transformers/model_outputs.py @@ -733,45 +733,51 @@ class Seq2SeqModelOutput(ModelOutput): decoding. Args: - last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the decoder of the model. + last_hidden_state (`paddle.Tensor`): + Sequence of hidden-states at the output of the last layer of the decoder of the model, whose shape is `(batch_size, Sequence_length, hidden_size)`. If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, hidden_size)` is output. 
- past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + past_key_values (`tuple(tuple(paddle.Tensor))`, optional): Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + Returned when `use_cache=True` is passed or when `config.use_cache=True`. Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. - decoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + decoder_hidden_states (`tuple(paddle.Tensor)`, optional): Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + Returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`. Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs. - decoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + decoder_attentions (`tuple(paddle.Tensor)`, optional): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. + Returned when `output_attentions=True` is passed or when `config.output_attentions=True`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the self-attention heads. - cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + cross_attentions (`tuple(paddle.Tensor)`, optional): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. + Returned when `output_attentions=True` is passed or when `config.output_attentions=True`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the weighted average in the cross-attention heads. - encoder_last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Sequence of hidden-states at the output of the last layer of the encoder of the model. - encoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + encoder_last_hidden_state (`paddle.Tensor`, optional): + Sequence of hidden-states at the output of the last layer of the encoder of the model whose shape is `(batch_size, sequence_length, hidden_size)`, + encoder_hidden_states (`tuple(paddle.Tensor)`, optional): Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + Returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`. Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs. 
- encoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + encoder_attentions (`tuple(paddle.Tensor)`, optional): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. + Returned when `output_attentions=True` is passed or when `config.output_attentions=True`. Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the self-attention heads. @@ -793,44 +799,49 @@ class Seq2SeqLMOutput(ModelOutput): Base class for sequence-to-sequence language models outputs. Args: - loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): - Language modeling loss. - logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + loss (`paddle.Tensor`, optional): + Language modeling loss whose shape is `(1,)`. Returned when `labels` is provided. + logits (`paddle.Tensor`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) whose shape is `(batch_size, sequence_length, config.vocab_size)`). + past_key_values (`tuple(tuple(paddle.Tensor))`, optional): Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + Returned when `use_cache=True` is passed or when `config.use_cache=True`. Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. - decoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + decoder_hidden_states (`tuple(paddle.Tensor)`, optional): Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + Returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`. Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. - decoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + decoder_attentions (`tuple(paddle.Tensor)`, optional): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. + Returned when `output_attentions=True` is passed or when `config.output_attentions=True`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the self-attention heads. - cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + cross_attentions (`tuple(paddle.Tensor)`, optional): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. 
+ Returned when `output_attentions=True` is passed or when `config.output_attentions=True`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the weighted average in the cross-attention heads. - encoder_last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_last_hidden_state (`paddle.Tensor`, optional): + Sequence of hidden-states at the output of the last layer of the encoder of the model whose shape is `(batch_size, sequence_length, hidden_size)`. encoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. - encoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + encoder_attentions (`tuple(paddle.Tensor)`, optional): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. + Returned when `output_attentions=True` is passed or when `config.output_attentions=True`. Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the self-attention heads. From f478886cb170967ca314a0b0b9e9bdb88f10b360 Mon Sep 17 00:00:00 2001 From: Yam0214 Date: Wed, 28 Sep 2022 12:42:46 +0000 Subject: [PATCH 08/10] modify document --- paddlenlp/transformers/t5/modeling.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddlenlp/transformers/t5/modeling.py b/paddlenlp/transformers/t5/modeling.py index cfe626b55fca..30bbfbfcbe9d 100644 --- a/paddlenlp/transformers/t5/modeling.py +++ b/paddlenlp/transformers/t5/modeling.py @@ -1561,7 +1561,8 @@ def forward(self, output_hidden_states (bool, optional): See :class:`T5Model`. return_dict (bool, optional): - See :class:`T5Model`. + Whether or not to return a class:`~paddlenlp.transformers.model_outputs.Seq2SeqLMOutput`. If `False`, the output + will be a tuple of tensors. Defaults to `False`. Returns: An instance of :class:`~paddlenlp.transformers.model_outputs.Seq2SeqLMOutput` if `return_dict=True`. 
From cf0f11fd4212c31ba3df372dbd05aa8e036c6b5c Mon Sep 17 00:00:00 2001 From: Yam0214 Date: Wed, 28 Sep 2022 12:46:38 +0000 Subject: [PATCH 09/10] add test case with use_labels=False or return_dict=True --- tests/transformers/t5/test_modeling.py | 134 ++++++++++++++++--------- 1 file changed, 89 insertions(+), 45 deletions(-) diff --git a/tests/transformers/t5/test_modeling.py b/tests/transformers/t5/test_modeling.py index 53ac3f1a89b7..637a4ae7cc9d 100644 --- a/tests/transformers/t5/test_modeling.py +++ b/tests/transformers/t5/test_modeling.py @@ -54,7 +54,6 @@ def __init__( # For common tests is_training=True, use_attention_mask=True, - use_labels=True, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, @@ -76,7 +75,6 @@ def __init__( self.seq_length = self.decoder_seq_length self.is_training = is_training self.use_attention_mask = use_attention_mask - self.use_labels = use_labels self.vocab_size = vocab_size self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers @@ -106,7 +104,7 @@ def prepare_config_and_inputs(self): [self.batch_size, self.decoder_seq_length], vocab_size=2) lm_labels = None - if self.use_labels: + if self.parent.use_labels: lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) @@ -166,6 +164,8 @@ def check_prepare_lm_labels_via_shift_left( decoder_attention_mask, lm_labels, ): + if not self.parent.use_labels: + return model = T5Model(**config) model.eval() @@ -214,13 +214,14 @@ def create_and_check_model( ): model = T5Model(**config) model.eval() - result = model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - ) - result = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) + result = model(input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + return_dict=self.parent.return_dict) + result = model(input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + return_dict=self.parent.return_dict) decoder_output = result[0] decoder_past = result[1] encoder_output = result[2] @@ -248,17 +249,22 @@ def create_and_check_with_lm_head( pretrained_model = T5Model(**config) model = T5ForConditionalGeneration(pretrained_model) model.eval() - outputs = model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - labels=lm_labels, - ) - self.parent.assertEqual(len(outputs), 4) - self.parent.assertEqual( - outputs[1].shape, - [self.batch_size, self.decoder_seq_length, self.vocab_size]) - self.parent.assertEqual(outputs[0].shape, [1]) + outputs = model(input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + labels=lm_labels, + return_dict=self.parent.return_dict) + self.parent.assertEqual(len(outputs), + 4 if self.parent.use_labels else 3) + if self.parent.use_labels: + self.parent.assertEqual( + outputs[1].shape, + [self.batch_size, self.decoder_seq_length, self.vocab_size]) + self.parent.assertEqual(outputs[0].shape, [1]) + else: + self.parent.assertEqual( + outputs[0].shape, + [self.batch_size, self.decoder_seq_length, self.vocab_size]) def create_and_check_decoder_model_past( self, @@ -272,14 +278,19 @@ def create_and_check_decoder_model_past( model = T5Model(**config).get_decoder() model.eval() # first forward pass - outputs = model(input_ids, use_cache=True) - outputs_use_cache_conf = model(input_ids) - 
outputs_no_past = model(input_ids, use_cache=False) + outputs = model(input_ids, + use_cache=True, + return_dict=self.parent.return_dict) + outputs_use_cache_conf = model(input_ids, + return_dict=self.parent.return_dict) + outputs_no_past = model(input_ids, + use_cache=False, + return_dict=self.parent.return_dict) self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf) + 1) self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) - output, past_key_values = outputs + output, past_key_values = outputs[:2] # create hypothetical next token and extent to next_input_ids next_tokens = ids_tensor([self.batch_size, 1], config["vocab_size"]) @@ -287,8 +298,11 @@ def create_and_check_decoder_model_past( # append to next input_ids and next_input_ids = paddle.concat([input_ids, next_tokens], axis=-1) - output_from_no_past = model(next_input_ids)[0] - output_from_past = model(next_tokens, cache=past_key_values)[0] + output_from_no_past = model(next_input_ids, + return_dict=self.parent.return_dict)[0] + output_from_past = model(next_tokens, + cache=past_key_values, + return_dict=self.parent.return_dict)[0] # select random slice random_slice_idx = ids_tensor([ @@ -327,7 +341,8 @@ def create_and_check_decoder_model_attention_mask_past( # first forward pass output, past_key_values = model(input_ids, attention_mask=attn_mask, - use_cache=True) + use_cache=True, + return_dict=self.parent.return_dict)[:2] # create hypothetical next token and extent to next_input_ids next_tokens = ids_tensor([self.batch_size, 1], config["vocab_size"]) @@ -349,11 +364,14 @@ def create_and_check_decoder_model_attention_mask_past( ) # get two different outputs - output_from_no_past = model(next_input_ids, attention_mask=attn_mask)[0] + output_from_no_past = model(next_input_ids, + attention_mask=attn_mask, + return_dict=self.parent.return_dict)[0] output_from_past = model(next_tokens, cache=past_key_values, attention_mask=paddle.ones( - (attn_mask.shape[0], 1), dtype="int64"))[0] + (attn_mask.shape[0], 1), dtype="int64"), + return_dict=self.parent.return_dict)[0] # select random slice random_slice_idx = ids_tensor([ @@ -385,9 +403,10 @@ def create_and_check_decoder_model_past_large_inputs( # first forward pass outputs = model(input_ids, attention_mask=attention_mask, - use_cache=True) + use_cache=True, + return_dict=self.parent.return_dict) - output, past_key_values = outputs + output, past_key_values = outputs[:2] # create hypothetical multiple next token and extent to next_input_ids next_tokens = ids_tensor([self.batch_size, 3], config["vocab_size"]) @@ -399,10 +418,12 @@ def create_and_check_decoder_model_past_large_inputs( axis=-1) output_from_no_past = model(next_input_ids, - attention_mask=next_attention_mask)[0] + attention_mask=next_attention_mask, + return_dict=self.parent.return_dict)[0] output_from_past = model(next_tokens, attention_mask=next_attention_mask, - cache=past_key_values)[0] + cache=past_key_values, + return_dict=self.parent.return_dict)[0] # select random slice random_slice_idx = ids_tensor([ @@ -506,6 +527,8 @@ def prepare_config_and_inputs_for_common(self): ]) class T5ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): base_model_class = T5Model + return_dict: bool = False + use_labels: bool = False all_model_classes = (T5Model, T5ForConditionalGeneration) all_generative_model_classes = {T5ForConditionalGeneration: (T5Model, "t5")} @@ -1108,7 +1131,15 @@ def test_translation_en_to_ro(self): self.assertEqual(translation, expected_translation) 
+@parameterized_class(("return_dict", "use_labels"), [ + [False, False], + [False, True], + [True, False], + [True, True], +]) class TestAsymmetricT5(unittest.TestCase): + return_dict = False + use_labels = False def build_model_and_check_forward_pass(self, **kwargs): tester = T5ModelTester(self, **kwargs) @@ -1123,18 +1154,31 @@ def build_model_and_check_forward_pass(self, **kwargs): pretrained_model = T5Model(**config) model = T5ForConditionalGeneration(pretrained_model) model.eval() - outputs = model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - labels=lm_labels, - ) + outputs = model(input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + labels=lm_labels, + return_dict=self.return_dict) # outputs = model(*inputs) - assert len(outputs) == 4 - assert outputs[1].shape == [ - tester.batch_size, tester.decoder_seq_length, tester.vocab_size - ] - assert outputs[0].shape == [1] + assert len(outputs) == (4 if self.use_labels else + 3), f"{type(outputs)}, {type(lm_labels)}" + # try: + # outputs[1].shape == [ + # tester.batch_size, tester.decoder_seq_length, tester.vocab_size + # ] + # except Exception as e: + # assert 1==0, f"use_labels:{self.use_labels}, return_dict:{self.return_dict},"\ + # f"{len(outputs)} " + f"{type(outputs)}" if isinstance(outputs, tuple) else f"{outputs.keys()}" + + if self.use_labels: + assert outputs[1].shape == [ + tester.batch_size, tester.decoder_seq_length, tester.vocab_size + ] + assert outputs[0].shape == [1] + else: + assert outputs[0].shape == [ + tester.batch_size, tester.decoder_seq_length, tester.vocab_size + ] return model def test_small_decoder(self): From abe565f81aceb86647bff17021cdae3f727851a6 Mon Sep 17 00:00:00 2001 From: Yam0214 Date: Thu, 29 Sep 2022 11:04:43 +0000 Subject: [PATCH 10/10] delete annotated code --- paddlenlp/transformers/t5/modeling.py | 1 - tests/transformers/t5/test_modeling.py | 7 ------- 2 files changed, 8 deletions(-) diff --git a/paddlenlp/transformers/t5/modeling.py b/paddlenlp/transformers/t5/modeling.py index 30bbfbfcbe9d..dcfd8c5c149c 100644 --- a/paddlenlp/transformers/t5/modeling.py +++ b/paddlenlp/transformers/t5/modeling.py @@ -1900,7 +1900,6 @@ def convert_encoder_output(encoder_output): The output of the encoder, a tuple consists `last_hidden_state`, `hidden_states`(optional), `attentions`(optional). The data type of `last_hidden_state` is float32 and its shape is [batch_size, sequence_length, hidden_size]. """ - # if isinstance(encoder_output, tuple) return BaseModelOutput( last_hidden_state=encoder_output[0], hidden_states=encoder_output[1] if len(encoder_output) > 1 else None, diff --git a/tests/transformers/t5/test_modeling.py b/tests/transformers/t5/test_modeling.py index 637a4ae7cc9d..32959ce49b21 100644 --- a/tests/transformers/t5/test_modeling.py +++ b/tests/transformers/t5/test_modeling.py @@ -1162,13 +1162,6 @@ def build_model_and_check_forward_pass(self, **kwargs): # outputs = model(*inputs) assert len(outputs) == (4 if self.use_labels else 3), f"{type(outputs)}, {type(lm_labels)}" - # try: - # outputs[1].shape == [ - # tester.batch_size, tester.decoder_seq_length, tester.vocab_size - # ] - # except Exception as e: - # assert 1==0, f"use_labels:{self.use_labels}, return_dict:{self.return_dict},"\ - # f"{len(outputs)} " + f"{type(outputs)}" if isinstance(outputs, tuple) else f"{outputs.keys()}" if self.use_labels: assert outputs[1].shape == [
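
A minimal usage sketch of the `return_dict` switch this series introduces (not part of any patch above). It assumes a PaddleNLP build with these commits applied; the "t5-small" checkpoint name and the example sentences are illustrative only.

    import paddle
    from paddlenlp.transformers import T5ForConditionalGeneration, T5Tokenizer

    # Illustrative checkpoint; any registered T5 weights behave the same way.
    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    model = T5ForConditionalGeneration.from_pretrained("t5-small")
    model.eval()

    input_ids = paddle.to_tensor(
        [tokenizer("translate English to German: The house is wonderful.")["input_ids"]])
    labels = paddle.to_tensor([tokenizer("Das Haus ist wunderbar.")["input_ids"]])

    with paddle.no_grad():
        # Default behaviour is unchanged: return_dict=False keeps the legacy tuple,
        # with the loss prepended when labels are given.
        loss, lm_logits = model(input_ids=input_ids, labels=labels)[:2]

        # return_dict=True returns a Seq2SeqLMOutput with named fields instead of
        # positional tuple entries.
        outputs = model(input_ids=input_ids, labels=labels, return_dict=True)

    print(float(outputs.loss), outputs.logits.shape)

T5Model behaves analogously, returning a Seq2SeqModelOutput when `return_dict=True`; a plain tuple or tensor passed as `encoder_output` is normalized through `convert_encoder_output` in that case, as patch 05 in this series arranges.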