From 0d9e46508f6173a4ad96a64ca662415fde19bdcb Mon Sep 17 00:00:00 2001 From: Yam0214 Date: Mon, 26 Sep 2022 10:32:19 +0000 Subject: [PATCH 01/10] complete t5 more output --- paddlenlp/transformers/model_outputs.py | 120 ++++++++++++++++++++++ paddlenlp/transformers/t5/modeling.py | 128 +++++++++++++++++++----- 2 files changed, 221 insertions(+), 27 deletions(-) diff --git a/paddlenlp/transformers/model_outputs.py b/paddlenlp/transformers/model_outputs.py index 528777d10e3a..6cf8a33f9a5f 100644 --- a/paddlenlp/transformers/model_outputs.py +++ b/paddlenlp/transformers/model_outputs.py @@ -724,3 +724,123 @@ class CausalLMOutputWithCrossAttentions(ModelOutput): hidden_states: Optional[Tuple[paddle.Tensor]] = None attentions: Optional[Tuple[paddle.Tensor]] = None cross_attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class Seq2SeqModelOutput(ModelOutput): + """ + Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential + decoding. + + Args: + last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + + If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, + hidden_size)` is output. + past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs. + decoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. 
+ encoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs. + encoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + last_hidden_state: paddle.Tensor = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + decoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None + decoder_attentions: Optional[Tuple[paddle.Tensor]] = None + cross_attentions: Optional[Tuple[paddle.Tensor]] = None + encoder_last_hidden_state: Optional[paddle.Tensor] = None + encoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None + encoder_attentions: Optional[Tuple[paddle.Tensor]] = None + +@dataclass +class Seq2SeqLMOutput(ModelOutput): + """ + Base class for sequence-to-sequence language models outputs. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss. + logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. 
+ + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + decoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None + decoder_attentions: Optional[Tuple[paddle.Tensor]] = None + cross_attentions: Optional[Tuple[paddle.Tensor]] = None + encoder_last_hidden_state: Optional[paddle.Tensor] = None + encoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None + encoder_attentions: Optional[Tuple[paddle.Tensor]] = None diff --git a/paddlenlp/transformers/t5/modeling.py b/paddlenlp/transformers/t5/modeling.py index db228d4cedd8..e44846c39b9f 100644 --- a/paddlenlp/transformers/t5/modeling.py +++ b/paddlenlp/transformers/t5/modeling.py @@ -26,6 +26,14 @@ from ..model_utils import PretrainedModel, register_base_model from ..nezha.modeling import ACT2FN +from ..model_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + Seq2SeqModelOutput, + Seq2SeqLMOutput, + BaseModelOutput, + ModelOutput, +) + __all__ = [ 'T5Model', "T5PretrainedModel", 'T5ForConditionalGeneration', @@ -944,7 +952,8 @@ def forward(self, cache=None, use_cache=False, output_attentions=False, - output_hidden_states=False): + output_hidden_states=False, + return_dict=False): assert input_ids is not None, "input_ids can not be None" input_shape = input_ids.shape input_ids = input_ids.reshape(shape=[-1, input_shape[-1]]) @@ -1051,13 +1060,22 @@ def forward(self, if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states, ) - return tuple(v for v in [ - hidden_states, - present_key_value_states, - all_hidden_states, - all_attentions, - all_cross_attentions, - ] if v is not None) + if not return_dict: + return tuple(v for v in [ + hidden_states, + present_key_value_states, + all_hidden_states, + all_attentions, + all_cross_attentions, + ] if v is not None) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=present_key_value_states, + hidden_states=all_hidden_states, + attentions=all_attentions, + cross_attentions=all_cross_attentions, + ) def get_extended_attention_mask(self, attention_mask, input_shape): if attention_mask.ndim == 3: @@ -1293,7 +1311,8 @@ def forward(self, cache=None, use_cache=True, output_attentions=False, - output_hidden_states=False): + 
output_hidden_states=False, + return_dict=False): r""" The T5Model forward method, overrides the `__call__()` special method. @@ -1343,6 +1362,11 @@ def forward(self, output_hidden_states (bool, optional): Whether or not to return the output of all hidden layers. Defaults to `False`. + return_dict (bool, optional): + Whether or not to return a class:`~paddlenlp.transformers.model_outputs.Seq2SeqModelOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.Seq2SeqModelOutput`. + Returns: tuple: Returns tuple (`last_hidden_state`, `cache`, `decoder_hidden_states`, `decoder_attentions`, @@ -1419,8 +1443,10 @@ def forward(self, input_ids=input_ids, attention_mask=attention_mask, output_attentions=output_attentions, - output_hidden_states=output_hidden_states) - + output_hidden_states=output_hidden_states, + return_dict=return_dict) + elif return_dict and not isinstance(encoder_output, ModelOutput): + encoder_output = convert_encoder_output(encoder_output) hidden_states = encoder_output[0] # Decode @@ -1432,10 +1458,23 @@ def forward(self, encoder_attention_mask=attention_mask, use_cache=use_cache, output_attentions=output_attentions, - output_hidden_states=output_hidden_states) - - return decoder_outputs + encoder_output - + output_hidden_states=output_hidden_states, + return_dict=return_dict) + + if not return_dict: + return decoder_outputs + encoder_output + + return Seq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_output.last_hidden_state, + encoder_hidden_states=encoder_output.hidden_states, + encoder_attentions=encoder_output.attentions, + ) + class T5ForConditionalGeneration(T5PretrainedModel): """ @@ -1490,7 +1529,8 @@ def forward(self, labels=None, use_cache=True, output_attentions=False, - output_hidden_states=False): + output_hidden_states=False, + return_dict=False): r""" Args: @@ -1518,6 +1558,8 @@ def forward(self, See :class:`T5Model`. output_hidden_states (bool, optional): See :class:`T5Model`. + return_dict (bool, optional): + See :class:`T5Model`. 
Returns: tuple: Returns tuple (`loss`, `logits`, `cache`, `decoder_hidden_states`, `decoder_attentions`, @@ -1581,12 +1623,13 @@ def forward(self, input_ids=input_ids, attention_mask=attention_mask, output_attentions=output_attentions, - output_hidden_states=output_hidden_states) + output_hidden_states=output_hidden_states, + return_dict=return_dict) - if isinstance(encoder_output, (tuple, list)): - hidden_states = encoder_output[0] - else: - hidden_states = encoder_output + # encoder_output could be a Tensor, tuple or ModelOutput + if isinstance(encoder_output, paddle.Tensor): + encoder_output = (encoder_output, ) + hidden_states = encoder_output[0] if labels is not None and decoder_input_ids is None: # get decoder inputs from shifting lm labels to the right @@ -1610,7 +1653,8 @@ def forward(self, encoder_attention_mask=attention_mask, use_cache=use_cache, output_attentions=output_attentions, - output_hidden_states=output_hidden_states) + output_hidden_states=output_hidden_states, + return_dict=return_dict) sequence_output = decoder_outputs[0] @@ -1630,12 +1674,28 @@ def forward(self, loss_fct = nn.CrossEntropyLoss(ignore_index=-100) loss = loss_fct(lm_logits.reshape(shape=[-1, lm_logits.shape[-1]]), labels.flatten()) + + if not return_dict: + # 元组相加 + output = (lm_logits, ) + decoder_outputs[1:] + encoder_output[0:] + + return ((loss, ) + output) if loss is not None else output + + if not isinstance(encoder_output, ModelOutput): + encoder_output = convert_encoder_output(encoder_output) + + return Seq2SeqLMOutput( + loss=loss, + logits=lm_logits, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_output.last_hidden_state, + encoder_hidden_states=encoder_output.hidden_states, + encoder_attentions=encoder_output.attentions, + ) - if not isinstance(encoder_output, (list, tuple)): - encoder_output = (encoder_output, ) - - output = (lm_logits, ) + decoder_outputs[1:] + encoder_output - return ((loss, ) + output) if loss is not None else output @staticmethod def prepare_input_ids_for_generation(bos_token_id, encoder_output=None): @@ -1817,6 +1877,7 @@ def forward( use_cache: Optional[bool] = False, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = False, ): encoder_outputs = self.encoder( input_ids=input_ids, @@ -1827,9 +1888,22 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, + return_dict=return_dict ) return encoder_outputs T5EncoderModel.base_model_class = T5EncoderModel + + +def convert_encoder_output(encoder_output): + """ + Convert encoder_output which type is tuple to an instance of BaseModelOutput. 
+ args: encoder_output = (last_hidden_state, hidden_states, attentions) + """ + return BaseModelOutput( + last_hidden_state=encoder_output[0], + hidden_states=encoder_output[1] if len(encoder_output) > 1 else None, + attentions=encoder_output[2] if len(encoder_output) > 2 else None, + ) From a6d1261b9b0c0bf41b02681beb51c510b3c4c373 Mon Sep 17 00:00:00 2001 From: Yam0214 Date: Mon, 26 Sep 2022 13:05:43 +0000 Subject: [PATCH 02/10] check codestyle --- paddlenlp/transformers/t5/modeling.py | 32 +++++++++++++++------------ 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/paddlenlp/transformers/t5/modeling.py b/paddlenlp/transformers/t5/modeling.py index e44846c39b9f..554bfee151ee 100644 --- a/paddlenlp/transformers/t5/modeling.py +++ b/paddlenlp/transformers/t5/modeling.py @@ -34,7 +34,6 @@ ModelOutput, ) - __all__ = [ 'T5Model', "T5PretrainedModel", 'T5ForConditionalGeneration', 'T5EncoderModel' @@ -1075,7 +1074,7 @@ def forward(self, hidden_states=all_hidden_states, attentions=all_attentions, cross_attentions=all_cross_attentions, - ) + ) def get_extended_attention_mask(self, attention_mask, input_shape): if attention_mask.ndim == 3: @@ -1363,12 +1362,15 @@ def forward(self, Whether or not to return the output of all hidden layers. Defaults to `False`. return_dict (bool, optional): - Whether or not to return a class:`~paddlenlp.transformers.model_outputs.Seq2SeqModelOutput` if `return_dict=True`. - Otherwise it returns a tuple of tensors corresponding to ordered and - not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.Seq2SeqModelOutput`. + Whether or not to return a class:`~paddlenlp.transformers.model_outputs.Seq2SeqModelOutput`. If `False`, the output + will be a tuple of tensors. Defaults to `False`. Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.Seq2SeqModelOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.Seq2SeqModelOutput`. + tuple: Returns tuple (`last_hidden_state`, `cache`, `decoder_hidden_states`, `decoder_attentions`, `cross_attentions`, `encoder_last_hidden_state`, `encoder_hidden_states`, `encoder_attentions`) @@ -1474,7 +1476,7 @@ def forward(self, encoder_hidden_states=encoder_output.hidden_states, encoder_attentions=encoder_output.attentions, ) - + class T5ForConditionalGeneration(T5PretrainedModel): """ @@ -1562,6 +1564,10 @@ def forward(self, See :class:`T5Model`. Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.Seq2SeqLMOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.Seq2SeqLMOutput`. 
+ tuple: Returns tuple (`loss`, `logits`, `cache`, `decoder_hidden_states`, `decoder_attentions`, `cross_attentions`, `encoder_last_hidden_state`, `encoder_hidden_states`, `encoder_attentions`) @@ -1674,7 +1680,7 @@ def forward(self, loss_fct = nn.CrossEntropyLoss(ignore_index=-100) loss = loss_fct(lm_logits.reshape(shape=[-1, lm_logits.shape[-1]]), labels.flatten()) - + if not return_dict: # 元组相加 output = (lm_logits, ) + decoder_outputs[1:] + encoder_output[0:] @@ -1696,7 +1702,6 @@ def forward(self, encoder_attentions=encoder_output.attentions, ) - @staticmethod def prepare_input_ids_for_generation(bos_token_id, encoder_output=None): batch_size = 1 @@ -1888,8 +1893,7 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict - ) + return_dict=return_dict) return encoder_outputs @@ -1903,7 +1907,7 @@ def convert_encoder_output(encoder_output): args: encoder_output = (last_hidden_state, hidden_states, attentions) """ return BaseModelOutput( - last_hidden_state=encoder_output[0], - hidden_states=encoder_output[1] if len(encoder_output) > 1 else None, - attentions=encoder_output[2] if len(encoder_output) > 2 else None, - ) + last_hidden_state=encoder_output[0], + hidden_states=encoder_output[1] if len(encoder_output) > 1 else None, + attentions=encoder_output[2] if len(encoder_output) > 2 else None, + ) From 943c578a904037644d7e3c3e17bf4914185fa49e Mon Sep 17 00:00:00 2001 From: Yam0214 Date: Mon, 26 Sep 2022 13:35:39 +0000 Subject: [PATCH 03/10] modify description of function --- paddlenlp/transformers/t5/modeling.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/paddlenlp/transformers/t5/modeling.py b/paddlenlp/transformers/t5/modeling.py index 554bfee151ee..109e39a6e938 100644 --- a/paddlenlp/transformers/t5/modeling.py +++ b/paddlenlp/transformers/t5/modeling.py @@ -1904,7 +1904,11 @@ def forward( def convert_encoder_output(encoder_output): """ Convert encoder_output which type is tuple to an instance of BaseModelOutput. - args: encoder_output = (last_hidden_state, hidden_states, attentions) + + Args: + encoder_output (tuple or ModleOutput): + The output of the encoder, a tuple consists `last_hidden_state`, `hidden_states`(optional), `attentions`(optional). + The data type of `last_hidden_state` is float32 and its shape is [batch_size, sequence_length, hidden_size]. """ return BaseModelOutput( last_hidden_state=encoder_output[0], From 1146c7bfcc8c887889deabdd3ec6b291dd970339 Mon Sep 17 00:00:00 2001 From: Yam0214 Date: Mon, 26 Sep 2022 13:58:39 +0000 Subject: [PATCH 04/10] check codestyle again --- paddlenlp/transformers/t5/modeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddlenlp/transformers/t5/modeling.py b/paddlenlp/transformers/t5/modeling.py index 109e39a6e938..c961c481e961 100644 --- a/paddlenlp/transformers/t5/modeling.py +++ b/paddlenlp/transformers/t5/modeling.py @@ -1903,7 +1903,7 @@ def forward( def convert_encoder_output(encoder_output): """ - Convert encoder_output which type is tuple to an instance of BaseModelOutput. + Convert encoder_output from tuple to class:`~paddlenlp.transformers.model_outputs.Seq2SeqModelOutput`. 
Args: encoder_output (tuple or ModleOutput): From e796a1de07457b8ec2dd01372e69015988de1348 Mon Sep 17 00:00:00 2001 From: Yam0214 Date: Tue, 27 Sep 2022 09:02:44 +0000 Subject: [PATCH 05/10] rewrite the condition of converting encoder_output --- paddlenlp/transformers/t5/modeling.py | 18 ++++++++---------- tests/transformers/t5/test_modeling.py | 9 ++++++++- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/paddlenlp/transformers/t5/modeling.py b/paddlenlp/transformers/t5/modeling.py index c961c481e961..a9d12ff38e65 100644 --- a/paddlenlp/transformers/t5/modeling.py +++ b/paddlenlp/transformers/t5/modeling.py @@ -1447,7 +1447,7 @@ def forward(self, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict) - elif return_dict and not isinstance(encoder_output, ModelOutput): + elif return_dict and not isinstance(encoder_output, BaseModelOutput): encoder_output = convert_encoder_output(encoder_output) hidden_states = encoder_output[0] @@ -1631,10 +1631,12 @@ def forward(self, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict) + else: + if isinstance(encoder_output, paddle.Tensor): + encoder_output = (encoder_output, ) + if return_dict and not isinstance(encoder_output, BaseModelOutput): + encoder_output = convert_encoder_output(encoder_output) - # encoder_output could be a Tensor, tuple or ModelOutput - if isinstance(encoder_output, paddle.Tensor): - encoder_output = (encoder_output, ) hidden_states = encoder_output[0] if labels is not None and decoder_input_ids is None: @@ -1682,14 +1684,9 @@ def forward(self, labels.flatten()) if not return_dict: - # 元组相加 - output = (lm_logits, ) + decoder_outputs[1:] + encoder_output[0:] - + output = (lm_logits, ) + decoder_outputs[1:] + encoder_output return ((loss, ) + output) if loss is not None else output - if not isinstance(encoder_output, ModelOutput): - encoder_output = convert_encoder_output(encoder_output) - return Seq2SeqLMOutput( loss=loss, logits=lm_logits, @@ -1910,6 +1907,7 @@ def convert_encoder_output(encoder_output): The output of the encoder, a tuple consists `last_hidden_state`, `hidden_states`(optional), `attentions`(optional). The data type of `last_hidden_state` is float32 and its shape is [batch_size, sequence_length, hidden_size]. 
""" + # if isinstance(encoder_output, tuple) return BaseModelOutput( last_hidden_state=encoder_output[0], hidden_states=encoder_output[1] if len(encoder_output) > 1 else None, diff --git a/tests/transformers/t5/test_modeling.py b/tests/transformers/t5/test_modeling.py index d76e1705dbb0..4619850995db 100644 --- a/tests/transformers/t5/test_modeling.py +++ b/tests/transformers/t5/test_modeling.py @@ -18,6 +18,7 @@ import copy import tempfile import unittest +from parameterized import parameterized_class from tests.testing_utils import slow @@ -497,10 +498,16 @@ def prepare_config_and_inputs_for_common(self): return config, inputs_dict +@parameterized_class(("return_dict", "use_labels"), [ + [False, False], + [False, True], + [True, False], + [True, True], +]) class T5ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): base_model_class = T5Model - all_model_classes = (T5Model, T5ForConditionalGeneration, T5EncoderModel) + all_model_classes = (T5Model, T5ForConditionalGeneration) all_generative_model_classes = {T5ForConditionalGeneration: (T5Model, "t5")} all_parallelizable_model_classes = (T5Model, T5ForConditionalGeneration, T5EncoderModel) From b9cbe0ff5cde943d15fdaf7952d501b39cc865ac Mon Sep 17 00:00:00 2001 From: Yam0214 Date: Tue, 27 Sep 2022 09:35:50 +0000 Subject: [PATCH 06/10] check codestyle --- paddlenlp/transformers/model_outputs.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddlenlp/transformers/model_outputs.py b/paddlenlp/transformers/model_outputs.py index 6cf8a33f9a5f..cb99161b2ac8 100644 --- a/paddlenlp/transformers/model_outputs.py +++ b/paddlenlp/transformers/model_outputs.py @@ -784,7 +784,8 @@ class Seq2SeqModelOutput(ModelOutput): cross_attentions: Optional[Tuple[paddle.Tensor]] = None encoder_last_hidden_state: Optional[paddle.Tensor] = None encoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None - encoder_attentions: Optional[Tuple[paddle.Tensor]] = None + encoder_attentions: Optional[Tuple[paddle.Tensor]] = None + @dataclass class Seq2SeqLMOutput(ModelOutput): From cb6f77b9c05d9c1d00e3ad564b9f4e19c6e605b6 Mon Sep 17 00:00:00 2001 From: Yam0214 Date: Wed, 28 Sep 2022 09:32:28 +0000 Subject: [PATCH 07/10] modift the documents of two classes --- paddlenlp/transformers/model_outputs.py | 53 +++++++++++++++---------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/paddlenlp/transformers/model_outputs.py b/paddlenlp/transformers/model_outputs.py index cb99161b2ac8..64e95dcb3c3b 100644 --- a/paddlenlp/transformers/model_outputs.py +++ b/paddlenlp/transformers/model_outputs.py @@ -733,45 +733,51 @@ class Seq2SeqModelOutput(ModelOutput): decoding. Args: - last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the decoder of the model. + last_hidden_state (`paddle.Tensor`): + Sequence of hidden-states at the output of the last layer of the decoder of the model, whose shape is `(batch_size, Sequence_length, hidden_size)`. If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, hidden_size)` is output. 
- past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + past_key_values (`tuple(tuple(paddle.Tensor))`, optional): Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + Returned when `use_cache=True` is passed or when `config.use_cache=True`. Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. - decoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + decoder_hidden_states (`tuple(paddle.Tensor)`, optional): Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + Returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`. Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs. - decoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + decoder_attentions (`tuple(paddle.Tensor)`, optional): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. + Returned when `output_attentions=True` is passed or when `config.output_attentions=True`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the self-attention heads. - cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + cross_attentions (`tuple(paddle.Tensor)`, optional): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. + Returned when `output_attentions=True` is passed or when `config.output_attentions=True`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the weighted average in the cross-attention heads. - encoder_last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Sequence of hidden-states at the output of the last layer of the encoder of the model. - encoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + encoder_last_hidden_state (`paddle.Tensor`, optional): + Sequence of hidden-states at the output of the last layer of the encoder of the model whose shape is `(batch_size, sequence_length, hidden_size)`, + encoder_hidden_states (`tuple(paddle.Tensor)`, optional): Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + Returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`. Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs. 
- encoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + encoder_attentions (`tuple(paddle.Tensor)`, optional): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. + Returned when `output_attentions=True` is passed or when `config.output_attentions=True`. Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the self-attention heads. @@ -793,44 +799,49 @@ class Seq2SeqLMOutput(ModelOutput): Base class for sequence-to-sequence language models outputs. Args: - loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): - Language modeling loss. - logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + loss (`paddle.Tensor`, optional): + Language modeling loss whose shape is `(1,)`. Returned when `labels` is provided. + logits (`paddle.Tensor`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) whose shape is `(batch_size, sequence_length, config.vocab_size)`). + past_key_values (`tuple(tuple(paddle.Tensor))`, optional): Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + Returned when `use_cache=True` is passed or when `config.use_cache=True`. Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. - decoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + decoder_hidden_states (`tuple(paddle.Tensor)`, optional): Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + Returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`. Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. - decoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + decoder_attentions (`tuple(paddle.Tensor)`, optional): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. + Returned when `output_attentions=True` is passed or when `config.output_attentions=True`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the self-attention heads. - cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + cross_attentions (`tuple(paddle.Tensor)`, optional): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. 
+ Returned when `output_attentions=True` is passed or when `config.output_attentions=True`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the weighted average in the cross-attention heads. - encoder_last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_last_hidden_state (`paddle.Tensor`, optional): + Sequence of hidden-states at the output of the last layer of the encoder of the model whose shape is `(batch_size, sequence_length, hidden_size)`. encoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. - encoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + encoder_attentions (`tuple(paddle.Tensor)`, optional): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. + Returned when `output_attentions=True` is passed or when `config.output_attentions=True`. Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the self-attention heads. From f478886cb170967ca314a0b0b9e9bdb88f10b360 Mon Sep 17 00:00:00 2001 From: Yam0214 Date: Wed, 28 Sep 2022 12:42:46 +0000 Subject: [PATCH 08/10] modify document --- paddlenlp/transformers/t5/modeling.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddlenlp/transformers/t5/modeling.py b/paddlenlp/transformers/t5/modeling.py index cfe626b55fca..30bbfbfcbe9d 100644 --- a/paddlenlp/transformers/t5/modeling.py +++ b/paddlenlp/transformers/t5/modeling.py @@ -1561,7 +1561,8 @@ def forward(self, output_hidden_states (bool, optional): See :class:`T5Model`. return_dict (bool, optional): - See :class:`T5Model`. + Whether or not to return a class:`~paddlenlp.transformers.model_outputs.Seq2SeqLMOutput`. If `False`, the output + will be a tuple of tensors. Defaults to `False`. Returns: An instance of :class:`~paddlenlp.transformers.model_outputs.Seq2SeqLMOutput` if `return_dict=True`. 
From cf0f11fd4212c31ba3df372dbd05aa8e036c6b5c Mon Sep 17 00:00:00 2001 From: Yam0214 Date: Wed, 28 Sep 2022 12:46:38 +0000 Subject: [PATCH 09/10] add test case with use_labels=False or return_dict=True --- tests/transformers/t5/test_modeling.py | 134 ++++++++++++++++--------- 1 file changed, 89 insertions(+), 45 deletions(-) diff --git a/tests/transformers/t5/test_modeling.py b/tests/transformers/t5/test_modeling.py index 53ac3f1a89b7..637a4ae7cc9d 100644 --- a/tests/transformers/t5/test_modeling.py +++ b/tests/transformers/t5/test_modeling.py @@ -54,7 +54,6 @@ def __init__( # For common tests is_training=True, use_attention_mask=True, - use_labels=True, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, @@ -76,7 +75,6 @@ def __init__( self.seq_length = self.decoder_seq_length self.is_training = is_training self.use_attention_mask = use_attention_mask - self.use_labels = use_labels self.vocab_size = vocab_size self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers @@ -106,7 +104,7 @@ def prepare_config_and_inputs(self): [self.batch_size, self.decoder_seq_length], vocab_size=2) lm_labels = None - if self.use_labels: + if self.parent.use_labels: lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) @@ -166,6 +164,8 @@ def check_prepare_lm_labels_via_shift_left( decoder_attention_mask, lm_labels, ): + if not self.parent.use_labels: + return model = T5Model(**config) model.eval() @@ -214,13 +214,14 @@ def create_and_check_model( ): model = T5Model(**config) model.eval() - result = model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - ) - result = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) + result = model(input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + return_dict=self.parent.return_dict) + result = model(input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + return_dict=self.parent.return_dict) decoder_output = result[0] decoder_past = result[1] encoder_output = result[2] @@ -248,17 +249,22 @@ def create_and_check_with_lm_head( pretrained_model = T5Model(**config) model = T5ForConditionalGeneration(pretrained_model) model.eval() - outputs = model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - labels=lm_labels, - ) - self.parent.assertEqual(len(outputs), 4) - self.parent.assertEqual( - outputs[1].shape, - [self.batch_size, self.decoder_seq_length, self.vocab_size]) - self.parent.assertEqual(outputs[0].shape, [1]) + outputs = model(input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + labels=lm_labels, + return_dict=self.parent.return_dict) + self.parent.assertEqual(len(outputs), + 4 if self.parent.use_labels else 3) + if self.parent.use_labels: + self.parent.assertEqual( + outputs[1].shape, + [self.batch_size, self.decoder_seq_length, self.vocab_size]) + self.parent.assertEqual(outputs[0].shape, [1]) + else: + self.parent.assertEqual( + outputs[0].shape, + [self.batch_size, self.decoder_seq_length, self.vocab_size]) def create_and_check_decoder_model_past( self, @@ -272,14 +278,19 @@ def create_and_check_decoder_model_past( model = T5Model(**config).get_decoder() model.eval() # first forward pass - outputs = model(input_ids, use_cache=True) - outputs_use_cache_conf = model(input_ids) - 
outputs_no_past = model(input_ids, use_cache=False) + outputs = model(input_ids, + use_cache=True, + return_dict=self.parent.return_dict) + outputs_use_cache_conf = model(input_ids, + return_dict=self.parent.return_dict) + outputs_no_past = model(input_ids, + use_cache=False, + return_dict=self.parent.return_dict) self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf) + 1) self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) - output, past_key_values = outputs + output, past_key_values = outputs[:2] # create hypothetical next token and extent to next_input_ids next_tokens = ids_tensor([self.batch_size, 1], config["vocab_size"]) @@ -287,8 +298,11 @@ def create_and_check_decoder_model_past( # append to next input_ids and next_input_ids = paddle.concat([input_ids, next_tokens], axis=-1) - output_from_no_past = model(next_input_ids)[0] - output_from_past = model(next_tokens, cache=past_key_values)[0] + output_from_no_past = model(next_input_ids, + return_dict=self.parent.return_dict)[0] + output_from_past = model(next_tokens, + cache=past_key_values, + return_dict=self.parent.return_dict)[0] # select random slice random_slice_idx = ids_tensor([ @@ -327,7 +341,8 @@ def create_and_check_decoder_model_attention_mask_past( # first forward pass output, past_key_values = model(input_ids, attention_mask=attn_mask, - use_cache=True) + use_cache=True, + return_dict=self.parent.return_dict)[:2] # create hypothetical next token and extent to next_input_ids next_tokens = ids_tensor([self.batch_size, 1], config["vocab_size"]) @@ -349,11 +364,14 @@ def create_and_check_decoder_model_attention_mask_past( ) # get two different outputs - output_from_no_past = model(next_input_ids, attention_mask=attn_mask)[0] + output_from_no_past = model(next_input_ids, + attention_mask=attn_mask, + return_dict=self.parent.return_dict)[0] output_from_past = model(next_tokens, cache=past_key_values, attention_mask=paddle.ones( - (attn_mask.shape[0], 1), dtype="int64"))[0] + (attn_mask.shape[0], 1), dtype="int64"), + return_dict=self.parent.return_dict)[0] # select random slice random_slice_idx = ids_tensor([ @@ -385,9 +403,10 @@ def create_and_check_decoder_model_past_large_inputs( # first forward pass outputs = model(input_ids, attention_mask=attention_mask, - use_cache=True) + use_cache=True, + return_dict=self.parent.return_dict) - output, past_key_values = outputs + output, past_key_values = outputs[:2] # create hypothetical multiple next token and extent to next_input_ids next_tokens = ids_tensor([self.batch_size, 3], config["vocab_size"]) @@ -399,10 +418,12 @@ def create_and_check_decoder_model_past_large_inputs( axis=-1) output_from_no_past = model(next_input_ids, - attention_mask=next_attention_mask)[0] + attention_mask=next_attention_mask, + return_dict=self.parent.return_dict)[0] output_from_past = model(next_tokens, attention_mask=next_attention_mask, - cache=past_key_values)[0] + cache=past_key_values, + return_dict=self.parent.return_dict)[0] # select random slice random_slice_idx = ids_tensor([ @@ -506,6 +527,8 @@ def prepare_config_and_inputs_for_common(self): ]) class T5ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): base_model_class = T5Model + return_dict: bool = False + use_labels: bool = False all_model_classes = (T5Model, T5ForConditionalGeneration) all_generative_model_classes = {T5ForConditionalGeneration: (T5Model, "t5")} @@ -1108,7 +1131,15 @@ def test_translation_en_to_ro(self): self.assertEqual(translation, expected_translation) 
+@parameterized_class(("return_dict", "use_labels"), [ + [False, False], + [False, True], + [True, False], + [True, True], +]) class TestAsymmetricT5(unittest.TestCase): + return_dict = False + use_labels = False def build_model_and_check_forward_pass(self, **kwargs): tester = T5ModelTester(self, **kwargs) @@ -1123,18 +1154,31 @@ def build_model_and_check_forward_pass(self, **kwargs): pretrained_model = T5Model(**config) model = T5ForConditionalGeneration(pretrained_model) model.eval() - outputs = model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - labels=lm_labels, - ) + outputs = model(input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + labels=lm_labels, + return_dict=self.return_dict) # outputs = model(*inputs) - assert len(outputs) == 4 - assert outputs[1].shape == [ - tester.batch_size, tester.decoder_seq_length, tester.vocab_size - ] - assert outputs[0].shape == [1] + assert len(outputs) == (4 if self.use_labels else + 3), f"{type(outputs)}, {type(lm_labels)}" + # try: + # outputs[1].shape == [ + # tester.batch_size, tester.decoder_seq_length, tester.vocab_size + # ] + # except Exception as e: + # assert 1==0, f"use_labels:{self.use_labels}, return_dict:{self.return_dict},"\ + # f"{len(outputs)} " + f"{type(outputs)}" if isinstance(outputs, tuple) else f"{outputs.keys()}" + + if self.use_labels: + assert outputs[1].shape == [ + tester.batch_size, tester.decoder_seq_length, tester.vocab_size + ] + assert outputs[0].shape == [1] + else: + assert outputs[0].shape == [ + tester.batch_size, tester.decoder_seq_length, tester.vocab_size + ] return model def test_small_decoder(self): From abe565f81aceb86647bff17021cdae3f727851a6 Mon Sep 17 00:00:00 2001 From: Yam0214 Date: Thu, 29 Sep 2022 11:04:43 +0000 Subject: [PATCH 10/10] delete annotated code --- paddlenlp/transformers/t5/modeling.py | 1 - tests/transformers/t5/test_modeling.py | 7 ------- 2 files changed, 8 deletions(-) diff --git a/paddlenlp/transformers/t5/modeling.py b/paddlenlp/transformers/t5/modeling.py index 30bbfbfcbe9d..dcfd8c5c149c 100644 --- a/paddlenlp/transformers/t5/modeling.py +++ b/paddlenlp/transformers/t5/modeling.py @@ -1900,7 +1900,6 @@ def convert_encoder_output(encoder_output): The output of the encoder, a tuple consists `last_hidden_state`, `hidden_states`(optional), `attentions`(optional). The data type of `last_hidden_state` is float32 and its shape is [batch_size, sequence_length, hidden_size]. """ - # if isinstance(encoder_output, tuple) return BaseModelOutput( last_hidden_state=encoder_output[0], hidden_states=encoder_output[1] if len(encoder_output) > 1 else None, diff --git a/tests/transformers/t5/test_modeling.py b/tests/transformers/t5/test_modeling.py index 637a4ae7cc9d..32959ce49b21 100644 --- a/tests/transformers/t5/test_modeling.py +++ b/tests/transformers/t5/test_modeling.py @@ -1162,13 +1162,6 @@ def build_model_and_check_forward_pass(self, **kwargs): # outputs = model(*inputs) assert len(outputs) == (4 if self.use_labels else 3), f"{type(outputs)}, {type(lm_labels)}" - # try: - # outputs[1].shape == [ - # tester.batch_size, tester.decoder_seq_length, tester.vocab_size - # ] - # except Exception as e: - # assert 1==0, f"use_labels:{self.use_labels}, return_dict:{self.return_dict},"\ - # f"{len(outputs)} " + f"{type(outputs)}" if isinstance(outputs, tuple) else f"{outputs.keys()}" if self.use_labels: assert outputs[1].shape == [
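
A minimal usage sketch of the `return_dict` switch this series introduces (not part of any patch above). It assumes a PaddleNLP build with these commits applied; the "t5-small" checkpoint name and the example sentences are illustrative only.

    import paddle
    from paddlenlp.transformers import T5ForConditionalGeneration, T5Tokenizer

    # Illustrative checkpoint; any registered T5 weights behave the same way.
    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    model = T5ForConditionalGeneration.from_pretrained("t5-small")
    model.eval()

    input_ids = paddle.to_tensor(
        [tokenizer("translate English to German: The house is wonderful.")["input_ids"]])
    labels = paddle.to_tensor([tokenizer("Das Haus ist wunderbar.")["input_ids"]])

    with paddle.no_grad():
        # Default behaviour is unchanged: return_dict=False keeps the legacy tuple,
        # with the loss prepended when labels are given.
        loss, lm_logits = model(input_ids=input_ids, labels=labels)[:2]

        # return_dict=True returns a Seq2SeqLMOutput with named fields instead of
        # positional tuple entries.
        outputs = model(input_ids=input_ids, labels=labels, return_dict=True)

    print(float(outputs.loss), outputs.logits.shape)

T5Model behaves analogously, returning a Seq2SeqModelOutput when `return_dict=True`; a plain tuple or tensor passed as `encoder_output` is normalized through `convert_encoder_output` in that case, as patch 05 in this series arranges.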