diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index 8f0a86ed3d..96ff551b2b 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -72,10 +72,7 @@ class Aimv2Output(ModelOutput): vision_model_output: BaseModelOutputWithPooling = None def to_tuple(self) -> tuple[Any]: - return tuple( - self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() - for k in self.keys() - ) + return tuple(v.to_tuple() if isinstance(v, ModelOutput) else v for v in self.values()) @use_kernel_forward_from_hub("RMSNorm") diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index 247b810237..3334052abf 100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -34,7 +34,7 @@ from ...processing_utils import Unpack from ...pytorch_utils import apply_chunking_to_forward from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, logging, torch_int -from ...utils.generic import is_flash_attention_requested +from ...utils.generic import check_model_inputs, is_flash_attention_requested from .configuration_altclip import AltCLIPConfig, AltCLIPTextConfig, AltCLIPVisionConfig @@ -85,10 +85,7 @@ class AltCLIPOutput(ModelOutput): vision_model_output: BaseModelOutputWithPooling = None def to_tuple(self) -> tuple[Any]: - return tuple( - self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() - for k in self.keys() - ) + return tuple(v.to_tuple() if isinstance(v, ModelOutput) else v for v in self.values()) # Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->AltRoberta @@ -482,7 +479,7 @@ def forward( hidden_states: torch.Tensor, attention_mask: torch.Tensor | None = None, causal_attention_mask: torch.Tensor | None = None, - output_attentions: bool | None = False, + **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor, torch.Tensor | None]: """Input shape: Batch x Time x Channel""" @@ -518,12 +515,11 @@ def forward( is_causal=self.is_causal, scaling=self.scale, dropout=0.0 if not self.training else self.dropout, + **kwargs, ) attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous() attn_output = self.out_proj(attn_output) - if not output_attentions: - attn_weights = None return attn_output, attn_weights @@ -557,18 +553,8 @@ def forward( hidden_states: torch.Tensor, attention_mask: torch.Tensor, causal_attention_mask: torch.Tensor, - output_attentions: bool | None = False, - ) -> tuple[torch.FloatTensor]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - `(config.encoder_attention_heads,)`. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. 
- """ + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.FloatTensor, torch.Tensor | None]: residual = hidden_states hidden_states = self.layer_norm1(hidden_states) @@ -576,7 +562,7 @@ def forward( hidden_states=hidden_states, attention_mask=attention_mask, causal_attention_mask=causal_attention_mask, - output_attentions=output_attentions, + **kwargs, ) hidden_states = residual + hidden_states @@ -585,12 +571,7 @@ def forward( hidden_states = self.mlp(hidden_states) hidden_states = residual + hidden_states - outputs = (hidden_states,) - - if output_attentions: - outputs += (attn_weights,) - - return outputs + return hidden_states, attn_weights class AltCLIPEncoder(nn.Module): @@ -608,16 +589,13 @@ def __init__(self, config: AltCLIPConfig): self.layers = nn.ModuleList([AltCLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False - @can_return_tuple def forward( self, inputs_embeds, attention_mask: torch.Tensor | None = None, causal_attention_mask: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - ) -> tuple | BaseModelOutput: + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -638,45 +616,20 @@ def forward( - 0 for tokens that are **masked**. [What are attention masks?](../glossary#attention-mask) - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
""" - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - encoder_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - hidden_states = inputs_embeds for idx, encoder_layer in enumerate(self.layers): - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) layer_outputs = encoder_layer( hidden_states, attention_mask, causal_attention_mask, - output_attentions=output_attentions, + **kwargs, ) hidden_states = layer_outputs[0] - if output_attentions: - all_attentions = all_attentions + (layer_outputs[1],) - - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - return BaseModelOutput( - last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + last_hidden_state=hidden_states, ) @@ -770,6 +723,10 @@ class AltCLIPPreTrainedModel(PreTrainedModel): base_model_prefix = "altclip" input_modalities = ("image", "text") supports_gradient_checkpointing = True + _can_record_outputs = { + "hidden_states": AltCLIPEncoderLayer, + "attentions": AltCLIPAttention, + } _no_split_module = [] @torch.no_grad() @@ -833,22 +790,13 @@ def __init__(self, config: AltCLIPVisionConfig): self.encoder = AltCLIPEncoder(config) self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) - @can_return_tuple @auto_docstring def forward( self, pixel_values: torch.FloatTensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, interpolate_pos_encoding: bool | None = False, - ) -> tuple | BaseModelOutputWithPooling: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPooling: if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -857,9 +805,7 @@ def forward( encoder_outputs = self.encoder( inputs_embeds=hidden_states, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=True, + **kwargs, ) last_hidden_state = encoder_outputs[0] @@ -869,8 +815,6 @@ def forward( return BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, ) @@ -888,16 +832,14 @@ def __init__(self, config: AltCLIPVisionConfig): def get_input_embeddings(self) -> nn.Module: return self.vision_model.embeddings.patch_embedding + @check_model_inputs(tie_last_hidden_states=False) @auto_docstring def forward( self, pixel_values: torch.FloatTensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, interpolate_pos_encoding: bool = False, - return_dict: bool | None = None, - **kwargs, - ) -> tuple | BaseModelOutputWithPooling: + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -920,14 +862,10 @@ def forward( >>> last_hidden_state = outputs.last_hidden_state >>> 
pooled_output = outputs.pooler_output # pooled CLS states ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - return self.vision_model( pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, + **kwargs, ) @@ -1194,7 +1132,7 @@ def get_text_features( return text_outputs - @can_return_tuple + @check_model_inputs(tie_last_hidden_states=False) @auto_docstring def get_image_features( self, @@ -1223,7 +1161,6 @@ def get_image_features( vision_outputs = self.vision_model( pixel_values=pixel_values, interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=True, **kwargs, ) pooled_output = vision_outputs.pooler_output @@ -1231,6 +1168,7 @@ def get_image_features( return vision_outputs + @can_return_tuple @auto_docstring def forward( self, @@ -1240,11 +1178,8 @@ def forward( position_ids: torch.LongTensor | None = None, token_type_ids: torch.Tensor | None = None, return_loss: bool | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, interpolate_pos_encoding: bool = False, - return_dict: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | AltCLIPOutput: r""" return_loss (`bool`, *optional*): @@ -1270,29 +1205,18 @@ def forward( >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities ```""" - # Use AltCLIP model's config for some fields (if specified) instead of those of vision & text components. - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - text_outputs = self.text_model( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) vision_outputs = self.vision_model( pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, + **kwargs, ) image_embeds = vision_outputs[1] @@ -1314,10 +1238,6 @@ def forward( if return_loss: loss = clip_loss(logits_per_text) - if not return_dict: - output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) - return ((loss,) + output) if loss is not None else output - return AltCLIPOutput( loss=loss, logits_per_image=logits_per_image, diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 64223c23e8..1a185b30af 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -175,10 +175,7 @@ class ClapOutput(ModelOutput): audio_model_output: BaseModelOutputWithPooling = None def to_tuple(self) -> tuple[Any]: - return tuple( - self[k] if k not in ["text_model_output", "audio_model_output"] else getattr(self, k).to_tuple() - for k in self.keys() - ) + return tuple(v.to_tuple() if isinstance(v, ModelOutput) else v for v in self.values()) # Adapted from 
transformers.models.swin.modeling_swin.SwinDropPath diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index 268661968b..e984833a13 100644 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -132,10 +132,7 @@ class CLIPOutput(ModelOutput): vision_model_output: BaseModelOutputWithPooling = None def to_tuple(self) -> tuple[Any]: - return tuple( - self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() - for k in self.keys() - ) + return tuple(v.to_tuple() if isinstance(v, ModelOutput) else v for v in self.values()) class CLIPVisionEmbeddings(nn.Module): diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index 9c93a5764b..034a14c320 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -29,8 +29,8 @@ from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack -from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, logging, torch_int -from ...utils.generic import is_flash_attention_requested +from ...utils import ModelOutput, TransformersKwargs, auto_docstring, logging, torch_int +from ...utils.generic import check_model_inputs, is_flash_attention_requested from .configuration_clipseg import CLIPSegConfig, CLIPSegTextConfig, CLIPSegVisionConfig @@ -82,10 +82,7 @@ class CLIPSegOutput(ModelOutput): vision_model_output: BaseModelOutputWithPooling = None def to_tuple(self) -> tuple[Any]: - return tuple( - self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() - for k in self.keys() - ) + return tuple(v.to_tuple() if isinstance(v, ModelOutput) else v for v in self.values()) @dataclass @@ -127,10 +124,7 @@ class CLIPSegImageSegmentationOutput(ModelOutput): decoder_output: CLIPSegDecoderOutput = None def to_tuple(self) -> tuple[Any]: - return tuple( - self[k] if k not in ["vision_model_output", "decoder_output"] else getattr(self, k).to_tuple() - for k in self.keys() - ) + return tuple(v.to_tuple() if isinstance(v, ModelOutput) else v for v in self.values()) class CLIPSegVisionEmbeddings(nn.Module): @@ -310,10 +304,12 @@ def forward( attention_mask: torch.Tensor | None = None, causal_attention_mask: torch.Tensor | None = None, output_attentions: bool | None = False, + **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor, torch.Tensor | None]: """Input shape: Batch x Time x Channel""" batch_size, seq_length, embed_dim = hidden_states.shape + output_attentions = kwargs.get("output_attentions", output_attentions) queries = self.q_proj(hidden_states) keys = self.k_proj(hidden_states) @@ -345,6 +341,7 @@ def forward( is_causal=self.is_causal, scaling=self.scale, dropout=0.0 if not self.training else self.dropout, + **kwargs, ) attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous() @@ -386,18 +383,8 @@ def forward( hidden_states: torch.Tensor, attention_mask: torch.Tensor, causal_attention_mask: torch.Tensor, - output_attentions: bool | None = False, - ) -> tuple[torch.FloatTensor]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`): attention mask of size - `(batch, 1, tgt_len, src_len)` 
where padding elements are indicated by very large negative values. - `(config.encoder_attention_heads,)`. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - """ + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.FloatTensor, torch.Tensor | None]: residual = hidden_states hidden_states = self.layer_norm1(hidden_states) @@ -405,7 +392,7 @@ def forward( hidden_states=hidden_states, attention_mask=attention_mask, causal_attention_mask=causal_attention_mask, - output_attentions=output_attentions, + **kwargs, ) hidden_states = residual + hidden_states @@ -414,12 +401,7 @@ def forward( hidden_states = self.mlp(hidden_states) hidden_states = residual + hidden_states - outputs = (hidden_states,) - - if output_attentions: - outputs += (attn_weights,) - - return outputs + return hidden_states, attn_weights @auto_docstring @@ -428,6 +410,10 @@ class CLIPSegPreTrainedModel(PreTrainedModel): base_model_prefix = "clip" input_modalities = ("image", "text") supports_gradient_checkpointing = True + _can_record_outputs = { + "hidden_states": CLIPSegEncoderLayer, + "attentions": CLIPSegAttention, + } @torch.no_grad() def _init_weights(self, module): @@ -490,16 +476,13 @@ def __init__(self, config: CLIPSegConfig): self.layers = nn.ModuleList([CLIPSegEncoderLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False - @can_return_tuple def forward( self, inputs_embeds, attention_mask: torch.Tensor | None = None, causal_attention_mask: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - ) -> tuple | BaseModelOutput: + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -520,44 +503,20 @@ def forward( - 0 for tokens that are **masked**. [What are attention masks?](../glossary#attention-mask) - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
""" - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - encoder_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - hidden_states = inputs_embeds for idx, encoder_layer in enumerate(self.layers): - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) layer_outputs = encoder_layer( hidden_states, attention_mask, causal_attention_mask, - output_attentions=output_attentions, + **kwargs, ) - hidden_states = layer_outputs[0] - if output_attentions: - all_attentions = all_attentions + (layer_outputs[1],) - - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) + hidden_states = layer_outputs[0] return BaseModelOutput( - last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + last_hidden_state=hidden_states, ) @@ -670,16 +629,14 @@ def get_input_embeddings(self) -> nn.Module: def set_input_embeddings(self, value): self.text_model.embeddings.token_embedding = value + @check_model_inputs @auto_docstring def forward( self, input_ids: torch.Tensor | None = None, attention_mask: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutputWithPooling: r""" Examples: @@ -700,9 +657,8 @@ def forward( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + return_dict=True, + **kwargs, ) @@ -772,6 +728,7 @@ def __init__(self, config: CLIPSegVisionConfig): def get_input_embeddings(self) -> nn.Module: return self.vision_model.embeddings.patch_embedding + @check_model_inputs(tie_last_hidden_states=False) @auto_docstring def forward( self, @@ -780,7 +737,7 @@ def forward( output_hidden_states: bool | None = None, interpolate_pos_encoding: bool | None = True, return_dict: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutputWithPooling: r""" Examples: @@ -810,6 +767,7 @@ def forward( output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, + **kwargs, ) @@ -853,7 +811,7 @@ def __init__(self, config: CLIPSegConfig): # Initialize weights and apply final processing self.post_init() - @can_return_tuple + @check_model_inputs @auto_docstring def get_text_features( self, @@ -880,7 +838,6 @@ def get_text_features( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, - return_dict=True, **kwargs, ) pooled_output = text_outputs.pooler_output @@ -888,7 +845,7 @@ def get_text_features( return text_outputs - @can_return_tuple + @check_model_inputs(tie_last_hidden_states=False) @auto_docstring def get_image_features( self, @@ -918,7 +875,6 @@ def get_image_features( vision_outputs: BaseModelOutputWithPooling = self.vision_model( pixel_values=pixel_values, interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=True, **kwargs, ) pooled_output = vision_outputs.pooler_output @@ -926,6 +882,7 @@ def get_image_features( return vision_outputs + 
@check_model_inputs @auto_docstring def forward( self, @@ -934,11 +891,8 @@ def forward( attention_mask: torch.Tensor | None = None, position_ids: torch.LongTensor | None = None, return_loss: bool | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, interpolate_pos_encoding: bool = True, - return_dict: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | CLIPSegOutput: r""" return_loss (`bool`, *optional*): @@ -966,35 +920,22 @@ def forward( >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities ```""" - # Use CLIPSEG model's config for some fields (if specified) instead of those of vision & text components. - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - vision_outputs = self.vision_model( + vision_outputs = self.get_image_features( pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, + **kwargs, ) - text_outputs = self.text_model( + text_outputs = self.get_text_features( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) - image_embeds = vision_outputs[1] - image_embeds = self.visual_projection(image_embeds) + image_embeds = vision_outputs.pooler_output - text_embeds = text_outputs[1] - text_embeds = self.text_projection(text_embeds) + text_embeds = text_outputs.pooler_output # normalized features image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True) @@ -1009,10 +950,6 @@ def forward( if return_loss: loss = clipseg_loss(logits_per_text) - if not return_dict: - output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) - return ((loss,) + output) if loss is not None else output - return CLIPSegOutput( loss=loss, logits_per_image=logits_per_image, @@ -1288,16 +1225,16 @@ def forward( # step 1: forward the query images through the frozen CLIP vision encoder with torch.no_grad(): - vision_outputs = self.clip.vision_model( + vision_outputs = self.clip.get_image_features( pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=True, # we need the intermediate hidden states interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, + output_attentions=output_attentions, + output_hidden_states=True, # required for extract_layers + return_dict=True, ) - pooled_output = self.clip.visual_projection(vision_outputs[1]) + pooled_output = vision_outputs.pooler_output - hidden_states = vision_outputs.hidden_states if return_dict else vision_outputs[2] + hidden_states = vision_outputs.hidden_states # we add +1 here as the hidden states also include the initial embeddings activations = [hidden_states[i + 1] for i in self.extract_layers] @@ -1352,6 +1289,10 @@ def forward( loss = loss_fn(logits, labels) if not return_dict: + if isinstance(vision_outputs, ModelOutput): + vision_outputs = vision_outputs.to_tuple() + if 
isinstance(decoder_outputs, ModelOutput): + decoder_outputs = decoder_outputs.to_tuple() output = (logits, conditional_embeddings, pooled_output, vision_outputs, decoder_outputs) return ((loss,) + output) if loss is not None else output diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py index 5cb977b451..62f9e66b1b 100644 --- a/src/transformers/models/git/modeling_git.py +++ b/src/transformers/models/git/modeling_git.py @@ -35,15 +35,16 @@ CausalLMOutputWithPast, ) from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack from ...pytorch_utils import apply_chunking_to_forward from ...utils import ( ModelOutput, + TransformersKwargs, auto_docstring, - can_return_tuple, logging, torch_int, ) -from ...utils.generic import is_flash_attention_requested +from ...utils.generic import check_model_inputs, is_flash_attention_requested from .configuration_git import GitConfig, GitVisionConfig @@ -656,7 +657,7 @@ def forward( hidden_states: torch.Tensor, attention_mask: torch.Tensor | None = None, causal_attention_mask: torch.Tensor | None = None, - output_attentions: bool | None = False, + **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor, torch.Tensor | None]: """Input shape: Batch x Time x Channel""" @@ -692,12 +693,11 @@ def forward( is_causal=self.is_causal, scaling=self.scale, dropout=0.0 if not self.training else self.dropout, + **kwargs, ) attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous() attn_output = self.out_proj(attn_output) - if not output_attentions: - attn_weights = None return attn_output, attn_weights @@ -716,18 +716,8 @@ def forward( hidden_states: torch.Tensor, attention_mask: torch.Tensor, causal_attention_mask: torch.Tensor, - output_attentions: bool | None = False, - ) -> tuple[torch.FloatTensor]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - `(config.encoder_attention_heads,)`. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. 
- """ + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.FloatTensor, torch.Tensor | None]: residual = hidden_states hidden_states = self.layer_norm1(hidden_states) @@ -735,7 +725,7 @@ def forward( hidden_states=hidden_states, attention_mask=attention_mask, causal_attention_mask=causal_attention_mask, - output_attentions=output_attentions, + **kwargs, ) hidden_states = residual + hidden_states @@ -744,12 +734,7 @@ def forward( hidden_states = self.mlp(hidden_states) hidden_states = residual + hidden_states - outputs = (hidden_states,) - - if output_attentions: - outputs += (attn_weights,) - - return outputs + return hidden_states, attn_weights # Copied from transformers.models.altclip.modeling_altclip.AltCLIPEncoder with AltCLIP->GitVision, CLIPConfig @@ -768,16 +753,13 @@ def __init__(self, config: GitVisionConfig): self.layers = nn.ModuleList([GitVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False - @can_return_tuple def forward( self, inputs_embeds, attention_mask: torch.Tensor | None = None, causal_attention_mask: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - ) -> tuple | BaseModelOutput: + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -798,45 +780,20 @@ def forward( - 0 for tokens that are **masked**. [What are attention masks?](../glossary#attention-mask) - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
""" - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - encoder_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - hidden_states = inputs_embeds for idx, encoder_layer in enumerate(self.layers): - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) layer_outputs = encoder_layer( hidden_states, attention_mask, causal_attention_mask, - output_attentions=output_attentions, + **kwargs, ) hidden_states = layer_outputs[0] - if output_attentions: - all_attentions = all_attentions + (layer_outputs[1],) - - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - return BaseModelOutput( - last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + last_hidden_state=hidden_states, ) @@ -856,17 +813,9 @@ def __init__(self, config: GitVisionConfig): def forward( self, pixel_values: torch.FloatTensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, interpolate_pos_encoding: bool | None = False, - return_dict: bool | None = None, - ) -> tuple | BaseModelOutput: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutput: if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -875,22 +824,15 @@ def forward( encoder_outputs = self.encoder( inputs_embeds=hidden_states, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) - last_hidden_state = encoder_outputs[0] + last_hidden_state = encoder_outputs.last_hidden_state last_hidden_state = self.post_layernorm(last_hidden_state) - if not return_dict: - return (last_hidden_state,) + encoder_outputs[1:] - return BaseModelOutput( last_hidden_state=last_hidden_state, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, ) @@ -903,6 +845,10 @@ class GitVisionModel(GitPreTrainedModel): config: GitVisionConfig main_input_name = "pixel_values" input_modalities = ("image",) + _can_record_outputs = { + "hidden_states": GitVisionEncoderLayer, + "attentions": GitVisionAttention, + } # Copied from transformers.models.clip.modeling_clip.CLIPVisionModel.__init__ with CLIP->Git def __init__(self, config: GitVisionConfig): @@ -914,15 +860,13 @@ def __init__(self, config: GitVisionConfig): def get_input_embeddings(self) -> nn.Module: return self.vision_model.embeddings.patch_embedding + @check_model_inputs(tie_last_hidden_states=False) @auto_docstring def forward( self, pixel_values: torch.FloatTensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, interpolate_pos_encoding: bool = False, - return_dict: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutput: r""" Examples: @@ -945,14 +889,10 @@ def forward( >>> outputs = model(**inputs) >>> last_hidden_state = 
outputs.last_hidden_state ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - return self.vision_model( pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, + **kwargs, ) diff --git a/src/transformers/models/groupvit/modeling_groupvit.py b/src/transformers/models/groupvit/modeling_groupvit.py index a0e83e5fbf..c1cec77e8f 100644 --- a/src/transformers/models/groupvit/modeling_groupvit.py +++ b/src/transformers/models/groupvit/modeling_groupvit.py @@ -706,18 +706,8 @@ def forward( hidden_states: torch.Tensor, attention_mask: torch.Tensor, causal_attention_mask: torch.Tensor, - output_attentions: bool | None = False, - ) -> tuple[torch.FloatTensor]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - `(config.encoder_attention_heads,)`. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - """ + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.FloatTensor, torch.Tensor | None]: residual = hidden_states hidden_states = self.layer_norm1(hidden_states) @@ -725,7 +715,7 @@ def forward( hidden_states=hidden_states, attention_mask=attention_mask, causal_attention_mask=causal_attention_mask, - output_attentions=output_attentions, + **kwargs, ) hidden_states = residual + hidden_states @@ -734,12 +724,7 @@ def forward( hidden_states = self.mlp(hidden_states) hidden_states = residual + hidden_states - outputs = (hidden_states,) - - if output_attentions: - outputs += (attn_weights,) - - return outputs + return hidden_states, attn_weights @auto_docstring diff --git a/src/transformers/models/idefics/vision.py b/src/transformers/models/idefics/vision.py index 91bdb78c3b..6891eea8c4 100644 --- a/src/transformers/models/idefics/vision.py +++ b/src/transformers/models/idefics/vision.py @@ -24,9 +24,10 @@ from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling from ...modeling_utils import ALL_ATTENTION_FUNCTIONS +from ...processing_utils import Unpack from ...utils import ( ModelOutput, - can_return_tuple, + TransformersKwargs, logging, ) from ...utils.generic import is_flash_attention_requested @@ -217,7 +218,7 @@ def forward( hidden_states: torch.Tensor, attention_mask: torch.Tensor | None = None, causal_attention_mask: torch.Tensor | None = None, - output_attentions: bool | None = False, + **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor, torch.Tensor | None]: """Input shape: Batch x Time x Channel""" @@ -253,11 +254,12 @@ def forward( is_causal=self.is_causal, scaling=self.scale, dropout=0.0 if not self.training else self.dropout, + **kwargs, ) attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous() attn_output = self.out_proj(attn_output) - if not output_attentions: + if not kwargs.get("output_attentions", False): attn_weights = None return attn_output, attn_weights @@ -293,18 +295,8 @@ def forward( hidden_states: torch.Tensor, attention_mask: torch.Tensor, causal_attention_mask: torch.Tensor, - output_attentions: bool | None = False, - ) -> 
tuple[torch.FloatTensor]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - `(config.encoder_attention_heads,)`. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - """ + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.FloatTensor, torch.Tensor | None]: residual = hidden_states hidden_states = self.layer_norm1(hidden_states) @@ -312,7 +304,7 @@ def forward( hidden_states=hidden_states, attention_mask=attention_mask, causal_attention_mask=causal_attention_mask, - output_attentions=output_attentions, + **kwargs, ) hidden_states = residual + hidden_states @@ -321,12 +313,7 @@ def forward( hidden_states = self.mlp(hidden_states) hidden_states = residual + hidden_states - outputs = (hidden_states,) - - if output_attentions: - outputs += (attn_weights,) - - return outputs + return hidden_states, attn_weights # Copied from transformers.models.altclip.modeling_altclip.AltCLIPEncoder with AltCLIP->IdeficsVision @@ -345,16 +332,13 @@ def __init__(self, config: IdeficsVisionConfig): self.layers = nn.ModuleList([IdeficsVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False - @can_return_tuple def forward( self, inputs_embeds, attention_mask: torch.Tensor | None = None, causal_attention_mask: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - ) -> tuple | BaseModelOutput: + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -375,45 +359,20 @@ def forward( - 0 for tokens that are **masked**. [What are attention masks?](../glossary#attention-mask) - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
""" - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - encoder_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - hidden_states = inputs_embeds for idx, encoder_layer in enumerate(self.layers): - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) layer_outputs = encoder_layer( hidden_states, attention_mask, causal_attention_mask, - output_attentions=output_attentions, + **kwargs, ) hidden_states = layer_outputs[0] - if output_attentions: - all_attentions = all_attentions + (layer_outputs[1],) - - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - return BaseModelOutput( - last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + last_hidden_state=hidden_states, ) diff --git a/src/transformers/models/kosmos2/modeling_kosmos2.py b/src/transformers/models/kosmos2/modeling_kosmos2.py index d42b15d9d1..a75e6625d2 100644 --- a/src/transformers/models/kosmos2/modeling_kosmos2.py +++ b/src/transformers/models/kosmos2/modeling_kosmos2.py @@ -37,7 +37,7 @@ from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, logging, torch_int -from ...utils.generic import is_flash_attention_requested +from ...utils.generic import check_model_inputs, is_flash_attention_requested from .configuration_kosmos2 import Kosmos2Config, Kosmos2TextConfig, Kosmos2VisionConfig @@ -312,7 +312,7 @@ def forward( hidden_states: torch.Tensor, attention_mask: torch.Tensor | None = None, causal_attention_mask: torch.Tensor | None = None, - output_attentions: bool | None = False, + **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor, torch.Tensor | None]: """Input shape: Batch x Time x Channel""" @@ -348,12 +348,11 @@ def forward( is_causal=self.is_causal, scaling=self.scale, dropout=0.0 if not self.training else self.dropout, + **kwargs, ) attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous() attn_output = self.out_proj(attn_output) - if not output_attentions: - attn_weights = None return attn_output, attn_weights @@ -388,18 +387,8 @@ def forward( hidden_states: torch.Tensor, attention_mask: torch.Tensor, causal_attention_mask: torch.Tensor, - output_attentions: bool | None = False, - ) -> tuple[torch.FloatTensor]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - `(config.encoder_attention_heads,)`. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. 
- """ + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.FloatTensor, torch.Tensor | None]: residual = hidden_states hidden_states = self.layer_norm1(hidden_states) @@ -407,7 +396,7 @@ def forward( hidden_states=hidden_states, attention_mask=attention_mask, causal_attention_mask=causal_attention_mask, - output_attentions=output_attentions, + **kwargs, ) hidden_states = residual + hidden_states @@ -416,12 +405,7 @@ def forward( hidden_states = self.mlp(hidden_states) hidden_states = residual + hidden_states - outputs = (hidden_states,) - - if output_attentions: - outputs += (attn_weights,) - - return outputs + return hidden_states, attn_weights class Kosmos2VisionEncoder(nn.Module): @@ -439,16 +423,13 @@ def __init__(self, config: Kosmos2VisionConfig): self.layers = nn.ModuleList([Kosmos2VisionEncoderLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False - @can_return_tuple def forward( self, inputs_embeds, attention_mask: torch.Tensor | None = None, causal_attention_mask: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - ) -> tuple | BaseModelOutput: + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -469,45 +450,20 @@ def forward( - 0 for tokens that are **masked**. [What are attention masks?](../glossary#attention-mask) - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
""" - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - encoder_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - hidden_states = inputs_embeds for idx, encoder_layer in enumerate(self.layers): - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) layer_outputs = encoder_layer( hidden_states, attention_mask, causal_attention_mask, - output_attentions=output_attentions, + **kwargs, ) hidden_states = layer_outputs[0] - if output_attentions: - all_attentions = all_attentions + (layer_outputs[1],) - - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - return BaseModelOutputWithProjectionAttentions( - last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + last_hidden_state=hidden_states, ) @@ -527,17 +483,9 @@ def __init__(self, config: Kosmos2VisionConfig): def forward( self, pixel_values: torch.FloatTensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, interpolate_pos_encoding: bool = False, - return_dict: bool | None = None, - ) -> tuple | BaseModelOutputWithPooling: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPooling: if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -546,23 +494,16 @@ def forward( encoder_outputs = self.encoder( inputs_embeds=hidden_states, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) last_hidden_state = encoder_outputs[0] pooled_output = last_hidden_state[:, 0, :] pooled_output = self.post_layernorm(pooled_output) - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, ) @@ -1136,6 +1077,10 @@ class Kosmos2PreTrainedModel(PreTrainedModel): _supports_attention_backend = True _supports_flash_attn = True _supports_sdpa = True + _can_record_outputs = { + "hidden_states": Kosmos2VisionEncoderLayer, + "attentions": Kosmos2VisionAttention, + } @torch.no_grad() def _init_weights(self, module: nn.Module): @@ -1213,22 +1158,18 @@ def __init__(self, config: Kosmos2VisionConfig): def get_input_embeddings(self) -> nn.Module: return self.model.embeddings.patch_embedding + @check_model_inputs(tie_last_hidden_states=False) @auto_docstring def forward( self, pixel_values: torch.FloatTensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, interpolate_pos_encoding: bool = False, - return_dict: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutputWithProjectionAttentions: return self.model( pixel_values=pixel_values, - output_attentions=output_attentions, - 
output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, + **kwargs, ) @@ -1507,7 +1448,7 @@ def get_input_embeddings(self) -> nn.Module: def set_input_embeddings(self, value): self.text_model.model.embed_tokens = value - @can_return_tuple + @check_model_inputs(tie_last_hidden_states=False) @auto_docstring def get_image_features( self, diff --git a/src/transformers/models/metaclip_2/modeling_metaclip_2.py b/src/transformers/models/metaclip_2/modeling_metaclip_2.py index fb7a9f87e0..7b0e436599 100644 --- a/src/transformers/models/metaclip_2/modeling_metaclip_2.py +++ b/src/transformers/models/metaclip_2/modeling_metaclip_2.py @@ -686,10 +686,7 @@ class MetaClip2Output(ModelOutput): vision_model_output: BaseModelOutputWithPooling = None def to_tuple(self) -> tuple[Any]: - return tuple( - self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() - for k in self.keys() - ) + return tuple(v.to_tuple() if isinstance(v, ModelOutput) else v for v in self.values()) # contrastive loss function, adapted from diff --git a/src/transformers/models/owlv2/modeling_owlv2.py b/src/transformers/models/owlv2/modeling_owlv2.py index 3079bb3505..38ff9a553a 100644 --- a/src/transformers/models/owlv2/modeling_owlv2.py +++ b/src/transformers/models/owlv2/modeling_owlv2.py @@ -416,10 +416,12 @@ def forward( attention_mask: torch.Tensor | None = None, causal_attention_mask: torch.Tensor | None = None, output_attentions: bool | None = False, + **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]: """Input shape: Batch x Time x Channel""" bsz, tgt_len, embed_dim = hidden_states.size() + output_attentions = kwargs.get("output_attentions", output_attentions) # get query proj query_states = self.q_proj(hidden_states) * self.scale @@ -523,18 +525,8 @@ def forward( hidden_states: torch.Tensor, attention_mask: torch.Tensor, causal_attention_mask: torch.Tensor, - output_attentions: bool | None = False, - ) -> tuple[torch.FloatTensor]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - `(config.encoder_attention_heads,)`. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. 
- """ + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.FloatTensor, torch.Tensor | None]: residual = hidden_states hidden_states = self.layer_norm1(hidden_states) @@ -542,7 +534,7 @@ def forward( hidden_states=hidden_states, attention_mask=attention_mask, causal_attention_mask=causal_attention_mask, - output_attentions=output_attentions, + **kwargs, ) hidden_states = residual + hidden_states @@ -551,12 +543,7 @@ def forward( hidden_states = self.mlp(hidden_states) hidden_states = residual + hidden_states - outputs = (hidden_states,) - - if output_attentions: - outputs += (attn_weights,) - - return outputs + return hidden_states, attn_weights @auto_docstring diff --git a/src/transformers/models/owlvit/modeling_owlvit.py b/src/transformers/models/owlvit/modeling_owlvit.py index 4d2c96f793..ca88f21a6e 100644 --- a/src/transformers/models/owlvit/modeling_owlvit.py +++ b/src/transformers/models/owlvit/modeling_owlvit.py @@ -404,10 +404,12 @@ def forward( attention_mask: torch.Tensor | None = None, causal_attention_mask: torch.Tensor | None = None, output_attentions: bool | None = False, + **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]: """Input shape: Batch x Time x Channel""" bsz, tgt_len, embed_dim = hidden_states.size() + output_attentions = kwargs.get("output_attentions", output_attentions) # get query proj query_states = self.q_proj(hidden_states) * self.scale @@ -511,18 +513,8 @@ def forward( hidden_states: torch.Tensor, attention_mask: torch.Tensor, causal_attention_mask: torch.Tensor, - output_attentions: bool | None = False, - ) -> tuple[torch.FloatTensor]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - `(config.encoder_attention_heads,)`. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. 
- """ + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.FloatTensor, torch.Tensor | None]: residual = hidden_states hidden_states = self.layer_norm1(hidden_states) @@ -530,7 +522,7 @@ def forward( hidden_states=hidden_states, attention_mask=attention_mask, causal_attention_mask=causal_attention_mask, - output_attentions=output_attentions, + **kwargs, ) hidden_states = residual + hidden_states @@ -539,12 +531,7 @@ def forward( hidden_states = self.mlp(hidden_states) hidden_states = residual + hidden_states - outputs = (hidden_states,) - - if output_attentions: - outputs += (attn_weights,) - - return outputs + return hidden_states, attn_weights @auto_docstring diff --git a/src/transformers/models/siglip/modeling_siglip.py b/src/transformers/models/siglip/modeling_siglip.py index bf91bfe8b5..81b2de4c07 100644 --- a/src/transformers/models/siglip/modeling_siglip.py +++ b/src/transformers/models/siglip/modeling_siglip.py @@ -141,10 +141,7 @@ class SiglipOutput(ModelOutput): vision_model_output: BaseModelOutputWithPooling = None def to_tuple(self) -> tuple[Any]: - return tuple( - self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() - for k in self.keys() - ) + return tuple(v.to_tuple() if isinstance(v, ModelOutput) else v for v in self.values()) class SiglipVisionEmbeddings(nn.Module): diff --git a/src/transformers/models/siglip2/modeling_siglip2.py b/src/transformers/models/siglip2/modeling_siglip2.py index f39a1de270..405733513f 100644 --- a/src/transformers/models/siglip2/modeling_siglip2.py +++ b/src/transformers/models/siglip2/modeling_siglip2.py @@ -108,10 +108,7 @@ class Siglip2Output(ModelOutput): vision_model_output: BaseModelOutputWithPooling = None def to_tuple(self) -> tuple[Any]: - return tuple( - self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() - for k in self.keys() - ) + return tuple(v.to_tuple() if isinstance(v, ModelOutput) else v for v in self.values()) class Siglip2VisionEmbeddings(nn.Module): @@ -544,25 +541,19 @@ def __init__(self, config: Siglip2VisionConfig): self.post_init() + @check_model_inputs(tie_last_hidden_states=False) @auto_docstring def forward( self, pixel_values: torch.FloatTensor, attention_mask: torch.Tensor, spatial_shapes: torch.LongTensor, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> BaseModelOutputWithPooling: r""" spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`): Tensor containing the spatial dimensions (height, width) of the input images. 
""" - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - hidden_states = self.embeddings(pixel_values, spatial_shapes) if attention_mask is not None and not is_flash_attention_requested(self.config): @@ -574,8 +565,7 @@ def forward( encoder_outputs: BaseModelOutput = self.encoder( inputs_embeds=hidden_states, attention_mask=encoder_attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + **kwargs, ) last_hidden_state = encoder_outputs.last_hidden_state @@ -874,7 +864,7 @@ def get_text_features( **kwargs, ) - @can_return_tuple + @check_model_inputs(tie_last_hidden_states=False) @auto_docstring def get_image_features( self, @@ -916,7 +906,7 @@ def get_image_features( ) # NOTE: Siglip2Model uses Pretrained backbones, so we don't need to add `check_model_inputs` here - @can_return_tuple + @check_model_inputs(tie_last_hidden_states=False) @auto_docstring def forward( self, @@ -927,9 +917,7 @@ def forward( attention_mask: torch.Tensor | None = None, position_ids: torch.LongTensor | None = None, return_loss: bool | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> Siglip2Output: r""" pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*): @@ -968,26 +956,18 @@ def forward( 31.9% that image 0 is 'a photo of 2 cats' ``` """ - # Use Siglip2 model's config for some fields (if specified) instead of those of vision & text components. - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - vision_outputs: BaseModelOutputWithPooling = self.vision_model( pixel_values=pixel_values, attention_mask=pixel_attention_mask, spatial_shapes=spatial_shapes, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + **kwargs, ) text_outputs: BaseModelOutputWithPooling = self.text_model( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + **kwargs, ) image_embeds = vision_outputs.pooler_output @@ -1059,7 +1039,7 @@ def get_input_embeddings(self) -> nn.Module: def set_input_embeddings(self, value: nn.Module): self.vision_model.embeddings.patch_embedding = value - @check_model_inputs + @check_model_inputs(tie_last_hidden_states=False) @auto_docstring def forward( self, @@ -1067,9 +1047,7 @@ def forward( pixel_attention_mask: torch.Tensor | None = None, spatial_shapes: torch.LongTensor | None = None, labels: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> ImageClassifierOutput: r""" pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*): @@ -1109,17 +1087,11 @@ def forward( Predicted class: LABEL_1 ``` """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - outputs: 
BaseModelOutputWithPooling = self.vision_model( pixel_values, attention_mask=pixel_attention_mask, spatial_shapes=spatial_shapes, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + **kwargs, ) sequence_output = outputs.last_hidden_state diff --git a/src/transformers/models/siglip2/modular_siglip2.py b/src/transformers/models/siglip2/modular_siglip2.py index 91ab930357..17e66cf5ea 100644 --- a/src/transformers/models/siglip2/modular_siglip2.py +++ b/src/transformers/models/siglip2/modular_siglip2.py @@ -37,7 +37,7 @@ from ...modeling_attn_mask_utils import _prepare_4d_attention_mask from ...processing_utils import Unpack -from ...utils import TransformersKwargs, auto_docstring, can_return_tuple +from ...utils import TransformersKwargs, auto_docstring from ...utils.generic import check_model_inputs, is_flash_attention_requested @@ -280,24 +280,19 @@ def __init__(self, config: Siglip2VisionConfig): super().__init__(config) # Update: add `spatial_shapes` and `attention_mask` + @check_model_inputs(tie_last_hidden_states=False) + @auto_docstring def forward( self, pixel_values: torch.FloatTensor, attention_mask: torch.Tensor, spatial_shapes: torch.LongTensor, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> BaseModelOutputWithPooling: r""" spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`): Tensor containing the spatial dimensions (height, width) of the input images. """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - hidden_states = self.embeddings(pixel_values, spatial_shapes) if attention_mask is not None and not is_flash_attention_requested(self.config): @@ -309,8 +304,7 @@ def forward( encoder_outputs: BaseModelOutput = self.encoder( inputs_embeds=hidden_states, attention_mask=encoder_attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + **kwargs, ) last_hidden_state = encoder_outputs.last_hidden_state @@ -402,7 +396,7 @@ def forward( class Siglip2Model(SiglipModel): # Update: add `spatial_shapes` and `pixel_attention_mask` - @can_return_tuple + @check_model_inputs(tie_last_hidden_states=False) @auto_docstring def get_image_features( self, @@ -444,6 +438,8 @@ def get_image_features( ) # Update: add `spatial_shapes` and `pixel_attention_mask` + @check_model_inputs(tie_last_hidden_states=False) + @auto_docstring def forward( self, input_ids: torch.LongTensor | None = None, @@ -453,9 +449,7 @@ def forward( attention_mask: torch.Tensor | None = None, position_ids: torch.LongTensor | None = None, return_loss: bool | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> Siglip2Output: r""" pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*): @@ -494,26 +488,18 @@ def forward( 31.9% that image 0 is 'a photo of 2 cats' ``` """ - # Use Siglip2 model's config for some fields (if specified) instead of those of vision & text components. 
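The ladders deleted around here (`output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions`, and likewise for `output_hidden_states` and `return_dict`) are now handled once by the `check_model_inputs` decorator instead of in every `forward`. Judging from its use throughout this diff, the decorator resolves the flags from the config at the outermost call and lets `**kwargs` flow untouched to submodels. A toy sketch of just that default-resolution half (deliberately not named after the real decorator):

import functools


def resolve_output_flags(fn):
    """Fill output_* kwargs from the model config once, at the top-level forward."""

    @functools.wraps(fn)
    def wrapper(self, *args, **kwargs):
        for flag in ("output_attentions", "output_hidden_states"):
            if kwargs.get(flag) is None:
                kwargs[flag] = getattr(self.config, flag, False)
        return fn(self, *args, **kwargs)

    return wrapper


class ToyConfig:
    output_attentions = False
    output_hidden_states = True


class ToyModel:
    config = ToyConfig()

    @resolve_output_flags
    def forward(self, x, **kwargs):
        # Submodules just receive **kwargs unchanged; no per-forward
        # `flag if flag is not None else self.config.flag` ladders remain.
        return x, kwargs


print(ToyModel().forward(1))
# (1, {'output_attentions': False, 'output_hidden_states': True})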
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - vision_outputs: BaseModelOutputWithPooling = self.vision_model( pixel_values=pixel_values, attention_mask=pixel_attention_mask, spatial_shapes=spatial_shapes, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + **kwargs, ) text_outputs: BaseModelOutputWithPooling = self.text_model( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + **kwargs, ) image_embeds = vision_outputs.pooler_output @@ -553,15 +539,15 @@ def forward( class Siglip2ForImageClassification(SiglipForImageClassification): # Update: add `spatial_shapes` and `pixel_attention_mask` + @check_model_inputs(tie_last_hidden_states=False) + @auto_docstring def forward( self, pixel_values: torch.Tensor | None = None, pixel_attention_mask: torch.Tensor | None = None, spatial_shapes: torch.LongTensor | None = None, labels: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> ImageClassifierOutput: r""" pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*): @@ -601,17 +587,11 @@ def forward( Predicted class: LABEL_1 ``` """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - outputs: BaseModelOutputWithPooling = self.vision_model( pixel_values, attention_mask=pixel_attention_mask, spatial_shapes=spatial_shapes, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + **kwargs, ) sequence_output = outputs.last_hidden_state diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py index ff29a9f502..fac81ba81a 100644 --- a/src/transformers/models/vipllava/modeling_vipllava.py +++ b/src/transformers/models/vipllava/modeling_vipllava.py @@ -29,7 +29,7 @@ from ...modeling_outputs import BaseModelOutputWithPast, BaseModelOutputWithPooling, ModelOutput from ...modeling_utils import PreTrainedModel from ...processing_utils import Unpack -from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, torch_compilable_check +from ...utils import TransformersKwargs, auto_docstring, torch_compilable_check from ...utils.generic import check_model_inputs from ..auto import AutoModel from .configuration_vipllava import VipLlavaConfig @@ -150,7 +150,7 @@ def get_input_embeddings(self): def set_input_embeddings(self, value): self.language_model.set_input_embeddings(value) - @can_return_tuple + @check_model_inputs(tie_last_hidden_states=False) @auto_docstring( custom_intro="Obtains image last hidden states from the vision tower and apply multimodal projection." 
) @@ -158,7 +158,6 @@ def get_image_features( self, pixel_values: torch.FloatTensor, vision_feature_layers: int | list[int] | None = None, - output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutputWithPooling: r""" @@ -171,9 +170,10 @@ def get_image_features( vision_feature_layers = ( vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers ) + # We need hidden states to select intermediate vision features by layer index below. + kwargs["output_hidden_states"] = True image_outputs = self.vision_tower( pixel_values, - output_hidden_states=True, # Ignore arg on purpose return_dict=True, **kwargs, ) @@ -215,6 +215,7 @@ def get_placeholder_mask( ) return special_image_mask + @check_model_inputs(tie_last_hidden_states=False) @auto_docstring def forward( self, @@ -226,22 +227,14 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, vision_feature_layers: int | list[int] | None = None, use_cache: bool | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, - **lm_kwargs, + **lm_kwargs: Unpack[TransformersKwargs], ) -> tuple | VipLlavaModelOutputWithPast: r""" vision_feature_layers (`Union[int, list[int]]`, *optional*): The vision feature layer, or the list of indexes of the layers to select the vision feature. """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict vision_feature_layers = ( vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers ) @@ -268,8 +261,6 @@ def forward( past_key_values=past_key_values, inputs_embeds=inputs_embeds, use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, return_dict=True, cache_position=cache_position, **lm_kwargs, @@ -282,7 +273,7 @@ def forward( attentions=outputs.attentions, image_hidden_states=image_features if pixel_values is not None else None, ) - return output if return_dict else output.to_tuple() + return output @auto_docstring( @@ -333,7 +324,6 @@ def get_image_features( ) @check_model_inputs(tie_last_hidden_states=False) - @auto_docstring def forward( self, input_ids: torch.LongTensor | None = None, @@ -345,12 +335,9 @@ def forward( vision_feature_layers: int | list[int] | None = None, labels: torch.LongTensor | None = None, use_cache: bool | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, - **lm_kwargs, + **lm_kwargs: Unpack[TransformersKwargs], ) -> tuple | VipLlavaCausalLMOutputWithPast: r""" vision_feature_layers (`Union[int, list[int]]`, *optional*): @@ -388,11 +375,6 @@ def forward( The image features a brown and white cat sitting on a green surface, with a red ball in its ```""" - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else 
self.config.use_return_dict vision_feature_layers = ( vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers ) @@ -406,8 +388,6 @@ def forward( inputs_embeds=inputs_embeds, use_cache=use_cache, vision_feature_layers=vision_feature_layers, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, return_dict=True, cache_position=cache_position, **lm_kwargs, diff --git a/src/transformers/models/vipllava/modular_vipllava.py b/src/transformers/models/vipllava/modular_vipllava.py index eceda8058e..2a46f02118 100644 --- a/src/transformers/models/vipllava/modular_vipllava.py +++ b/src/transformers/models/vipllava/modular_vipllava.py @@ -27,7 +27,8 @@ from ...cache_utils import Cache from ...modeling_outputs import BaseModelOutputWithPooling from ...processing_utils import Unpack -from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging +from ...utils import TransformersKwargs, auto_docstring, logging +from ...utils.generic import check_model_inputs from .configuration_vipllava import VipLlavaConfig @@ -71,7 +72,7 @@ class VipLlavaPreTrainedModel(LlavaPreTrainedModel): class VipLlavaModel(LlavaModel): - @can_return_tuple + @check_model_inputs(tie_last_hidden_states=False) @auto_docstring( custom_intro="Obtains image last hidden states from the vision tower and apply multimodal projection." ) @@ -79,7 +80,6 @@ def get_image_features( self, pixel_values: torch.FloatTensor, vision_feature_layers: int | list[int] | None = None, - output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutputWithPooling: r""" @@ -92,9 +92,10 @@ def get_image_features( vision_feature_layers = ( vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers ) + # We need hidden states to select intermediate vision features by layer index below. + kwargs["output_hidden_states"] = True image_outputs = self.vision_tower( pixel_values, - output_hidden_states=True, # Ignore arg on purpose return_dict=True, **kwargs, ) @@ -112,6 +113,7 @@ def get_image_features( return image_outputs + @check_model_inputs(tie_last_hidden_states=False) @auto_docstring def forward( self, @@ -123,22 +125,14 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, vision_feature_layers: int | list[int] | None = None, use_cache: bool | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, - **lm_kwargs, + **lm_kwargs: Unpack[TransformersKwargs], ) -> tuple | VipLlavaModelOutputWithPast: r""" vision_feature_layers (`Union[int, list[int]]`, *optional*): The vision feature layer, or the list of indexes of the layers to select the vision feature. 
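As the new inline comment above says, VipLlava selects intermediate vision features by layer index, which is why `get_image_features` now forces `kwargs["output_hidden_states"] = True` before calling the vision tower. A standalone sketch of the selection step (shapes are hypothetical, and the real code may additionally trim special tokens before concatenating):

import torch


def select_vision_features(
    hidden_states: tuple[torch.Tensor, ...],
    vision_feature_layers: int | list[int],
) -> torch.Tensor:
    """Index the tower's per-layer states; an int picks one layer, a list
    concatenates several along the hidden dimension."""
    if isinstance(vision_feature_layers, int):
        return hidden_states[vision_feature_layers]
    return torch.cat([hidden_states[i] for i in vision_feature_layers], dim=-1)


# Hypothetical shapes: five recorded layers of (batch=1, seq=4, hidden=8).
states = tuple(torch.randn(1, 4, 8) for _ in range(5))
print(select_vision_features(states, -2).shape)        # torch.Size([1, 4, 8])
print(select_vision_features(states, [-2, -5]).shape)  # torch.Size([1, 4, 16])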
""" - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict vision_feature_layers = ( vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers ) @@ -165,8 +159,6 @@ def forward( past_key_values=past_key_values, inputs_embeds=inputs_embeds, use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, return_dict=True, cache_position=cache_position, **lm_kwargs, @@ -179,7 +171,7 @@ def forward( attentions=outputs.attentions, image_hidden_states=image_features if pixel_values is not None else None, ) - return output if return_dict else output.to_tuple() + return output class VipLlavaForConditionalGeneration(LlavaForConditionalGeneration): @@ -201,6 +193,7 @@ def get_image_features( pixel_values=pixel_values, vision_feature_layers=vision_feature_layers, **kwargs ) + @check_model_inputs(tie_last_hidden_states=False) def forward( self, input_ids: torch.LongTensor | None = None, @@ -212,12 +205,9 @@ def forward( vision_feature_layers: int | list[int] | None = None, labels: torch.LongTensor | None = None, use_cache: bool | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, - **lm_kwargs, + **lm_kwargs: Unpack[TransformersKwargs], ) -> tuple | VipLlavaCausalLMOutputWithPast: r""" vision_feature_layers (`Union[int, list[int]]`, *optional*): @@ -255,11 +245,6 @@ def forward( The image features a brown and white cat sitting on a green surface, with a red ball in its ```""" - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict vision_feature_layers = ( vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers ) @@ -273,8 +258,6 @@ def forward( inputs_embeds=inputs_embeds, use_cache=use_cache, vision_feature_layers=vision_feature_layers, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, return_dict=True, cache_position=cache_position, **lm_kwargs, diff --git a/src/transformers/models/vjepa2/modeling_vjepa2.py b/src/transformers/models/vjepa2/modeling_vjepa2.py index 332e021080..26247bab4b 100644 --- a/src/transformers/models/vjepa2/modeling_vjepa2.py +++ b/src/transformers/models/vjepa2/modeling_vjepa2.py @@ -22,7 +22,9 @@ from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutput, ImageClassifierOutput from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel -from ...utils import ModelOutput, auto_docstring, can_return_tuple, logging +from ...processing_utils import Unpack +from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, logging +from ...utils.generic import OutputRecorder, check_model_inputs from .configuration_vjepa2 import VJEPA2Config @@ -294,8 +296,7 @@ def forward( self, hidden_states, position_mask: torch.Tensor | None = None, - 
output_attentions: bool = False, - ) -> tuple[torch.Tensor, torch.Tensor] | tuple[torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: batch_size, seq_length, _ = hidden_states.shape query_layer = ( self.query(hidden_states) @@ -335,9 +336,7 @@ def forward( new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) context_layer = self.proj(context_layer.reshape(new_context_layer_shape)) - outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) - - return outputs + return context_layer, attention_probs # Adapted from transformers.models.beit.modeling_dinov2.drop_path @@ -414,17 +413,15 @@ def forward( self, hidden_states: torch.Tensor, position_mask: torch.Tensor | None = None, - output_attentions: bool = False, + **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor, ...]: # Self-Attention residual = hidden_states hidden_states = self.norm1(hidden_states) - self_attention_outputs = self.attention( + attention_output, attn_weights = self.attention( hidden_states, position_mask=position_mask, # position mask for context/target selection - output_attentions=output_attentions, ) - attention_output = self_attention_outputs[0] hidden_states = self.drop_path(attention_output) + residual # MLP @@ -434,10 +431,7 @@ def forward( hidden_states = self.drop_path(hidden_states) + residual # Add self attentions if we output attention weights - outputs = self_attention_outputs[1:] - outputs = (hidden_states,) + outputs - - return outputs + return hidden_states, attn_weights class VJEPA2Encoder(nn.Module): @@ -465,38 +459,21 @@ def __init__(self, config: VJEPA2Config): self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.gradient_checkpointing = False - @can_return_tuple def forward( self, pixel_values_videos: torch.Tensor | None = None, - output_attentions: bool = False, - output_hidden_states: bool = False, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> BaseModelOutput: - all_hidden_states = () if output_hidden_states else None - all_self_attentions = () if output_attentions else None - hidden_states = self.embeddings(pixel_values_videos) for i, layer_module in enumerate(self.layer): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - layer_outputs = layer_module(hidden_states, None, output_attentions) + layer_outputs = layer_module(hidden_states, None, **kwargs) hidden_states = layer_outputs[0] - if output_attentions: - all_self_attentions = all_self_attentions + (layer_outputs[1],) - hidden_states = self.layernorm(hidden_states) - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - return BaseModelOutput( last_hidden_state=hidden_states, - hidden_states=all_hidden_states, - attentions=all_self_attentions, ) @@ -635,19 +612,13 @@ def unsort_tokens(self, hidden_states, argsort): hidden_states = torch.gather(hidden_states, dim=1, index=reverse_argsort) return hidden_states - @can_return_tuple def forward( self, encoder_hidden_states: torch.Tensor, context_mask: list[torch.Tensor], target_mask: list[torch.Tensor], - output_attentions: bool = False, - output_hidden_states: bool = False, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> BaseModelOutput: - all_hidden_states = () if output_hidden_states else None - all_self_attentions = () if output_attentions else None - # mask out the encoder hidden states # this is implemented here as in VJEPA training a separate encoder is used for target encoder_hidden_states = 
apply_masks(encoder_hidden_states, context_mask) @@ -659,18 +630,9 @@ def forward( hidden_states, position_masks = self.sort_tokens(hidden_states, position_masks, argsort) for i, layer_module in enumerate(self.layer): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - layer_outputs = layer_module(hidden_states, position_masks, output_attentions) + layer_outputs = layer_module(hidden_states, position_masks, **kwargs) hidden_states = layer_outputs[0] - if output_attentions: - all_self_attentions = all_self_attentions + (layer_outputs[1],) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - hidden_states = self.layernorm(hidden_states) # unsort and extract the predicted tokens hidden_states = self.unsort_tokens(hidden_states, argsort) @@ -680,8 +642,6 @@ def forward( return BaseModelOutput( last_hidden_state=hidden_states, - hidden_states=all_hidden_states, - attentions=all_self_attentions, ) @@ -939,6 +899,10 @@ class VJEPA2PreTrainedModel(PreTrainedModel): ] _supports_sdpa = True _supports_flash_attn = True + _can_record_outputs = { + "hidden_states": OutputRecorder(VJEPA2Layer, layer_name="encoder.layer"), + "attentions": OutputRecorder(VJEPA2RopeAttention, index=1, layer_name="encoder.layer"), + } @torch.no_grad() def _init_weights(self, module): @@ -982,7 +946,7 @@ def __init__(self, config: VJEPA2Config): def get_input_embeddings(self) -> VJEPA2PatchEmbeddings3D: return self.encoder.embeddings.patch_embeddings - @can_return_tuple + @check_model_inputs(tie_last_hidden_states=False) @auto_docstring def forward( self, @@ -990,9 +954,7 @@ def forward( context_mask: list[torch.Tensor] | None = None, target_mask: list[torch.Tensor] | None = None, skip_predictor: bool = False, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> VJEPA2WithMaskedInputModelOutput: r""" context_mask (`torch.Tensor` with shape `[batch_size, patch_size, 1]`, *optional*): @@ -1006,18 +968,12 @@ def forward( skip_predictor (bool): flag to skip the predictor forward, useful if you just need the encoder outputs """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - if pixel_values_videos is None: raise ValueError("You have to specify pixel_values_videos") encoder_outputs: BaseModelOutput = self.encoder( pixel_values_videos=pixel_values_videos, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + **kwargs, ) sequence_output = encoder_outputs.last_hidden_state @@ -1032,8 +988,7 @@ def forward( encoder_hidden_states=sequence_output, context_mask=context_mask, target_mask=target_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + **kwargs, ) predictor_output = VJEPA2WithMaskedInputPredictorOutput( last_hidden_state=predictor_outputs.last_hidden_state, @@ -1084,9 +1039,7 @@ def forward( self, pixel_values_videos: torch.Tensor, labels: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | ImageClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): @@ -1126,8 +1079,7 @@ def forward( outputs = self.vjepa2( pixel_values_videos=pixel_values_videos, 
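The `_can_record_outputs` map added above is what lets `check_model_inputs` repopulate `hidden_states` and `attentions` after the manual accumulation loops were deleted from the encoder and predictor: outputs are captured as the named module classes run, and `index=1` selects the attention weights from each layer's `(hidden_states, attn_weights)` return. A toy forward-hook version of the idea (not the real `OutputRecorder` API):

import torch
from torch import nn


class ToyLayer(nn.Module):
    def forward(self, x):
        return x + 1, torch.full((1,), float(x.sum()))  # (hidden_states, attn_weights)


class ToyEncoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.ModuleList([ToyLayer(), ToyLayer()])

    def forward(self, x):
        for layer in self.layers:
            x, _ = layer(x)  # attention weights are *not* threaded through returns
        return x


def record(model: nn.Module, target: type[nn.Module], index: int) -> list[torch.Tensor]:
    """Hook every submodule of class `target`, keeping element `index` of its
    output tuple; index=1 picks up the attention weights."""
    recorded: list[torch.Tensor] = []
    for module in model.modules():
        if isinstance(module, target):
            module.register_forward_hook(lambda m, args, out: recorded.append(out[index]))
    return recorded


model = ToyEncoder()
attentions = record(model, ToyLayer, index=1)
model(torch.zeros(3))
print(len(attentions))  # 2 -- one tensor per layer, collected without any flags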
skip_predictor=True, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + **kwargs, ) last_hidden_state = outputs.last_hidden_state diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index f08c4a695e..0e8d8c8d9c 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -32,11 +32,10 @@ ModelOutput, TransformersKwargs, auto_docstring, - can_return_tuple, logging, torch_int, ) -from ...utils.generic import is_flash_attention_requested +from ...utils.generic import OutputRecorder, check_model_inputs, is_flash_attention_requested from .configuration_x_clip import XCLIPConfig, XCLIPTextConfig, XCLIPVisionConfig @@ -276,7 +275,7 @@ def forward( hidden_states: torch.Tensor, attention_mask: torch.Tensor | None = None, causal_attention_mask: torch.Tensor | None = None, - output_attentions: bool | None = False, + **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor, torch.Tensor | None]: """Input shape: Batch x Time x Channel""" @@ -312,13 +311,11 @@ def forward( is_causal=self.is_causal, scaling=self.scale, dropout=0.0 if not self.training else self.dropout, + **kwargs, ) attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous() attn_output = self.out_proj(attn_output) - if not output_attentions: - attn_weights = None - return attn_output, attn_weights @@ -352,18 +349,8 @@ def forward( hidden_states: torch.Tensor, attention_mask: torch.Tensor, causal_attention_mask: torch.Tensor, - output_attentions: bool | None = False, - ) -> tuple[torch.FloatTensor]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - `(config.encoder_attention_heads,)`. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - """ + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.FloatTensor, torch.Tensor | None]: residual = hidden_states hidden_states = self.layer_norm1(hidden_states) @@ -371,7 +358,7 @@ def forward( hidden_states=hidden_states, attention_mask=attention_mask, causal_attention_mask=causal_attention_mask, - output_attentions=output_attentions, + **kwargs, ) hidden_states = residual + hidden_states @@ -380,12 +367,7 @@ def forward( hidden_states = self.mlp(hidden_states) hidden_states = residual + hidden_states - outputs = (hidden_states,) - - if output_attentions: - outputs += (attn_weights,) - - return outputs + return hidden_states, attn_weights # Copied from transformers.models.beit.modeling_beit.drop_path @@ -445,29 +427,14 @@ def forward( hidden_states: torch.Tensor, attention_mask: torch.Tensor, causal_attention_mask: torch.Tensor, - output_attentions: bool | None = False, - ) -> tuple[torch.FloatTensor]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - `(config.encoder_attention_heads,)`. - causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Causal mask for the text model. 
Mask values selected in `[0, 1]`: - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - [What are attention masks?](../glossary#attention-mask) - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - """ + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.FloatTensor, torch.Tensor | None]: batch_time, seq_length, hidden_size = hidden_states.size() batch_size = batch_time // self.num_frames msg_token = self.message_fc(hidden_states[:, 0, :]) msg_token = msg_token.view(batch_size, self.num_frames, hidden_size) - msg_token = msg_token + self.drop_path(self.message_attn(self.message_ln(msg_token))[0]) + msg_token = msg_token + self.drop_path(self.message_attn(self.message_ln(msg_token), **kwargs)[0]) # add dummy sequence dimension msg_token = msg_token.view(-1, 1, hidden_size) @@ -480,7 +447,7 @@ def forward( hidden_states=hidden_states, attention_mask=attention_mask, causal_attention_mask=causal_attention_mask, - output_attentions=output_attentions, + **kwargs, ) hidden_states = residual + hidden_states @@ -491,12 +458,7 @@ def forward( hidden_states = self.mlp(hidden_states) hidden_states = residual + hidden_states - outputs = (hidden_states,) - - if output_attentions: - outputs += (attn_weights,) - - return outputs + return hidden_states, attn_weights @auto_docstring @@ -505,6 +467,13 @@ class XCLIPPreTrainedModel(PreTrainedModel): base_model_prefix = "x_clip" input_modalities = ("image", "text") supports_gradient_checkpointing = True + _can_record_outputs = { + "hidden_states": [ + OutputRecorder(XCLIPEncoderLayer, layer_name="text_model"), + XCLIPVisionEncoderLayer, + ], + "attentions": OutputRecorder(GradientCheckpointingLayer, layer_name="model", index=1), + } @torch.no_grad() def _init_weights(self, module): @@ -573,16 +542,13 @@ def __init__(self, config: XCLIPConfig): self.layers = nn.ModuleList([XCLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False - @can_return_tuple def forward( self, inputs_embeds, attention_mask: torch.Tensor | None = None, causal_attention_mask: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - ) -> tuple | BaseModelOutput: + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -603,45 +569,20 @@ def forward( - 0 for tokens that are **masked**. [What are attention masks?](../glossary#attention-mask) - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
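X-CLIP has two encoder stacks plus the message and MIT attention, so its recorder map above scopes entries by submodule path; on my reading, `layer_name` restricts a recorder to modules whose qualified name falls under that attribute. A toy name-scoped variant of the hook pattern, under that assumption:

import torch
from torch import nn


class ToyLayer(nn.Module):
    def forward(self, x):
        return x * 2


class TwoTower(nn.Module):
    def __init__(self):
        super().__init__()
        self.text_model = nn.ModuleList([ToyLayer(), ToyLayer()])
        self.vision_model = nn.ModuleList([ToyLayer()])

    def forward(self, x):
        for layer in [*self.text_model, *self.vision_model]:
            x = layer(x)
        return x


def record_scoped(model: nn.Module, target: type[nn.Module], prefix: str) -> list:
    """Only hook `target` submodules whose qualified name starts with `prefix`,
    mirroring what layer_name="text_model" appears to select above."""
    recorded = []
    for name, module in model.named_modules():
        if isinstance(module, target) and name.startswith(prefix):
            module.register_forward_hook(lambda m, args, out: recorded.append(out))
    return recorded


model = TwoTower()
text_states = record_scoped(model, ToyLayer, prefix="text_model")
model(torch.ones(1))
print(len(text_states))  # 2 -- the vision layer is ignored by this recorder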
""" - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - encoder_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - hidden_states = inputs_embeds for idx, encoder_layer in enumerate(self.layers): - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) layer_outputs = encoder_layer( hidden_states, attention_mask, causal_attention_mask, - output_attentions=output_attentions, + **kwargs, ) hidden_states = layer_outputs[0] - if output_attentions: - all_attentions = all_attentions + (layer_outputs[1],) - - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - return BaseModelOutput( - last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + last_hidden_state=hidden_states, ) @@ -660,16 +601,8 @@ def forward( input_ids: torch.Tensor | None = None, attention_mask: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - ) -> tuple | BaseModelOutputWithPooling: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPooling: if input_ids is None: raise ValueError("You have to specify either input_ids") @@ -692,9 +625,7 @@ def forward( inputs_embeds=hidden_states, attention_mask=attention_mask, causal_attention_mask=causal_attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) last_hidden_state = encoder_outputs[0] @@ -704,14 +635,9 @@ def forward( # take features from the eot embedding (eot_token is the highest number in each sequence) pooled_output = last_hidden_state[torch.arange(last_hidden_state.shape[0]), input_ids.argmax(dim=-1)] - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, ) @@ -731,16 +657,14 @@ def get_input_embeddings(self) -> nn.Module: def set_input_embeddings(self, value): self.text_model.embeddings.token_embedding = value + @check_model_inputs(tie_last_hidden_states=False) @auto_docstring def forward( self, input_ids: torch.Tensor | None = None, attention_mask: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutputWithPooling: r""" Examples: @@ -761,9 +685,7 @@ def forward( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) @@ -787,9 +709,7 @@ 
def forward( inputs_embeds, attention_mask: torch.Tensor | None = None, causal_attention_mask: torch.Tensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutput: r""" Args: @@ -811,48 +731,18 @@ def forward( - 0 for tokens that are **masked**. [What are attention masks?](../glossary#attention-mask) - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - encoder_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - hidden_states = inputs_embeds for idx, encoder_layer in enumerate(self.layers): - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) layer_outputs = encoder_layer( hidden_states, attention_mask, causal_attention_mask, - output_attentions=output_attentions, + **kwargs, ) - hidden_states = layer_outputs[0] - if output_attentions: - all_attentions = all_attentions + (layer_outputs[1],) - - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) - return BaseModelOutput( - last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions - ) + return BaseModelOutput(last_hidden_state=hidden_states) class XCLIPVisionTransformer(nn.Module): @@ -874,39 +764,24 @@ def __init__(self, config: XCLIPVisionConfig): def forward( self, pixel_values: torch.FloatTensor, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, interpolate_pos_encoding: bool = False, - return_dict: bool | None = None, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutputWithPooling: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) hidden_states = self.pre_layernorm(hidden_states) encoder_outputs = self.encoder( inputs_embeds=hidden_states, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) last_hidden_state = encoder_outputs[0] pooled_output = last_hidden_state[:, 0, :] pooled_output = self.post_layernorm(pooled_output) - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, pooler_output=pooled_output, - 
hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, ) @@ -924,14 +799,12 @@ def __init__(self, config: XCLIPVisionConfig): def get_input_embeddings(self) -> nn.Module: return self.vision_model.embeddings.patch_embedding + @check_model_inputs(tie_last_hidden_states=False) @auto_docstring def forward( self, pixel_values: torch.FloatTensor | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutputWithPooling: r""" Examples: @@ -1009,9 +882,7 @@ def forward( ```""" return self.vision_model( pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) @@ -1029,9 +900,7 @@ def __init__(self, config: XCLIPVisionConfig): def forward( self, hidden_states, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutput: residual = hidden_states @@ -1040,9 +909,7 @@ def forward( encoder_outputs = self.encoder( inputs_embeds=hidden_states, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) last_hidden_state = encoder_outputs[0] @@ -1050,14 +917,9 @@ def forward( pooled_output = last_hidden_state.mean(dim=1, keepdim=False) - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, ) @@ -1203,7 +1065,7 @@ def __init__(self, config: XCLIPConfig): # Initialize weights and apply final processing self.post_init() - @can_return_tuple + @check_model_inputs(tie_last_hidden_states=False) @auto_docstring def get_text_features( self, @@ -1230,7 +1092,6 @@ def get_text_features( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, - return_dict=True, **kwargs, ) pooled_output = text_outputs.pooler_output @@ -1238,7 +1099,7 @@ def get_text_features( return text_outputs - @can_return_tuple + @check_model_inputs(tie_last_hidden_states=False) @auto_docstring def get_video_features( self, @@ -1318,18 +1179,17 @@ def get_video_features( batch_size, num_frames, num_channels, height, width = pixel_values.shape pixel_values = pixel_values.reshape(-1, num_channels, height, width) - video_outputs: BaseModelOutputWithPooling = self.vision_model( - pixel_values=pixel_values, return_dict=True, **kwargs - ) + video_outputs: BaseModelOutputWithPooling = self.vision_model(pixel_values=pixel_values, **kwargs) video_embeds = video_outputs.pooler_output video_embeds = self.visual_projection(video_embeds) cls_features = video_embeds.view(batch_size, num_frames, -1) - mit_outputs: BaseModelOutputWithPooling = self.mit(cls_features, return_dict=True, **kwargs) + mit_outputs: BaseModelOutputWithPooling = self.mit(cls_features, **kwargs) video_outputs.pooler_output = mit_outputs.pooler_output return video_outputs + @check_model_inputs(tie_last_hidden_states=False) @auto_docstring def forward( self, @@ -1338,11 +1198,8 @@ def forward( attention_mask: torch.Tensor | None = None, position_ids: torch.LongTensor | None = None, return_loss: bool | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | 
None = None, interpolate_pos_encoding: bool = False, - return_dict: bool | None = None, - **kwargs, + **kwargs: Unpack[TransformersKwargs], ) -> tuple | XCLIPOutput: r""" return_loss (`bool`, *optional*): @@ -1429,22 +1286,13 @@ def forward( >>> print(probs) tensor([[1.9496e-04, 9.9960e-01, 2.0825e-04]]) ```""" - # Use X_CLIP model's config for some fields (if specified) instead of those of vision & text components. - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - batch_size, num_frames, num_channels, height, width = pixel_values.shape pixel_values = pixel_values.reshape(-1, num_channels, height, width) vision_outputs = self.vision_model( pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, + **kwargs, ) video_embeds = vision_outputs[1] @@ -1454,9 +1302,7 @@ def forward( mit_outputs = self.mit( cls_features, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) video_embeds = mit_outputs[1] @@ -1470,9 +1316,7 @@ def forward( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + **kwargs, ) text_embeds = text_outputs[1] @@ -1494,10 +1338,6 @@ def forward( if return_loss: loss = x_clip_loss(logits_per_text) - if not return_dict: - output = (logits_per_video, logits_per_text, text_embeds, video_embeds, text_outputs, vision_outputs) - return ((loss,) + output) if loss is not None else output - return XCLIPOutput( loss=loss, logits_per_video=logits_per_video, diff --git a/tests/models/phi4_multimodal/test_feature_extraction_phi4_multimodal.py b/tests/models/phi4_multimodal/test_feature_extraction_phi4_multimodal.py index 8d235b5199..2999d033a2 100644 --- a/tests/models/phi4_multimodal/test_feature_extraction_phi4_multimodal.py +++ b/tests/models/phi4_multimodal/test_feature_extraction_phi4_multimodal.py @@ -23,7 +23,7 @@ from datasets import load_dataset from transformers import Phi4MultimodalFeatureExtractor -from transformers.testing_utils import check_json_file_has_correct_format, require_torch +from transformers.testing_utils import check_json_file_has_correct_format, require_numba, require_torch from transformers.utils.import_utils import is_torch_available from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin @@ -212,6 +212,7 @@ def _load_datasamples(self, num_samples): return [x["array"] for x in speech_samples] @require_torch + @require_numba def test_torch_integration(self): # fmt: off EXPECTED_INPUT_FEATURES = torch.tensor( @@ -234,6 +235,7 @@ def test_torch_integration(self): @unittest.mock.patch( "transformers.models.phi4_multimodal.feature_extraction_phi4_multimodal.is_torch_available", lambda: False ) + @require_numba def test_numpy_integration(self): # fmt: off EXPECTED_INPUT_FEATURES = np.array( @@ -254,6 +256,7 @@ def test_numpy_integration(self): self.assertTrue(np.allclose(input_features[0, 0, :30], EXPECTED_INPUT_FEATURES, atol=1e-4)) @require_torch + @require_numba def test_torch_integration_batch(self): # fmt: off 
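On the test side, the Phi4Multimodal feature-extraction integration tests are now gated on `require_numba` (imported from `transformers.testing_utils` in the hunk above), so environments without the optional numba dependency skip instead of failing. A generic sketch of that skip pattern, as a toy decorator rather than the real helper:

import importlib.util
import unittest


def require_numba_sketch(test_item):
    """Skip (rather than error) when the optional dependency is absent."""
    has_numba = importlib.util.find_spec("numba") is not None
    return unittest.skipUnless(has_numba, "test requires numba")(test_item)


class ToyFeatureExtractionTest(unittest.TestCase):
    @require_numba_sketch
    def test_runs_only_with_numba(self):
        import numba  # safe: the test is skipped when numba is not installed

        self.assertTrue(callable(numba.njit))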
EXPECTED_INPUT_FEATURES = torch.tensor(