From dd123592d984dd82edc74c6e56e5c06561ffb346 Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Tue, 30 Sep 2025 14:04:39 +0200
Subject: [PATCH 1/2] Add missing ModelOutput subclass return type hints

---
 src/transformers/models/dac/modeling_dac.py | 8 ++++----
 .../models/deepseek_vl/modeling_deepseek_vl.py | 4 ++--
 .../deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py | 4 ++--
 .../deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py | 4 ++--
 src/transformers/models/deprecated/mmbt/modeling_mmbt.py | 4 +++-
 .../models/grounding_dino/modeling_grounding_dino.py | 6 +++---
 src/transformers/models/janus/modeling_janus.py | 4 ++--
 src/transformers/models/janus/modular_janus.py | 4 ++--
 .../models/maskformer/modeling_maskformer_swin.py | 6 +++---
 .../mm_grounding_dino/modeling_mm_grounding_dino.py | 6 +++---
 src/transformers/models/tvp/modeling_tvp.py | 8 ++++----
 src/transformers/models/udop/modeling_udop.py | 6 +++---
 12 files changed, 33 insertions(+), 31 deletions(-)

diff --git a/src/transformers/models/dac/modeling_dac.py b/src/transformers/models/dac/modeling_dac.py
index e97c8183651e..a759170efb00 100644
--- a/src/transformers/models/dac/modeling_dac.py
+++ b/src/transformers/models/dac/modeling_dac.py
@@ -16,7 +16,7 @@
 
 import math
 from dataclasses import dataclass
-from typing import Optional
+from typing import Optional, Union
 
 import numpy as np
 import torch
@@ -579,7 +579,7 @@ def encode(
         input_values: torch.Tensor,
         n_quantizers: Optional[int] = None,
         return_dict: Optional[bool] = None,
-    ):
+    ) -> Union[tuple, DacEncoderOutput]:
         r"""
         input_values (`torch.Tensor of shape `(batch_size, 1, time_steps)`):
             Input audio data to encode,
@@ -606,7 +606,7 @@ def decode(
         quantized_representation: Optional[torch.Tensor] = None,
         audio_codes: Optional[torch.Tensor] = None,
         return_dict: Optional[bool] = None,
-    ):
+    ) -> Union[tuple, DacDecoderOutput]:
         r"""
         quantized_representation (torch.Tensor of shape `(batch_size, dimension, time_steps)`, *optional*):
             Quantized continuous representation of input.
@@ -639,7 +639,7 @@ def forward(
         input_values: torch.Tensor,
         n_quantizers: Optional[int] = None,
         return_dict: Optional[bool] = None,
-    ):
+    ) -> Union[tuple, DacOutput]:
         r"""
         input_values (`torch.Tensor` of shape `(batch_size, 1, time_steps)`):
             Audio data to encode.
diff --git a/src/transformers/models/deepseek_vl/modeling_deepseek_vl.py b/src/transformers/models/deepseek_vl/modeling_deepseek_vl.py
index 22d8e0928a6e..a797c49a65a5 100644
--- a/src/transformers/models/deepseek_vl/modeling_deepseek_vl.py
+++ b/src/transformers/models/deepseek_vl/modeling_deepseek_vl.py
@@ -208,7 +208,7 @@ def forward(
         use_cache: Optional[bool] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         **kwargs,
-    ):
+    ) -> DeepseekVLBaseModelOutputWithPast:
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError(
                 "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
@@ -282,7 +282,7 @@ def forward(
         use_cache: Optional[bool] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ):
+    ) -> DeepseekVLCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py
index cae509e14d64..0b698ee3362b 100644
--- a/src/transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py
+++ b/src/transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py
@@ -316,7 +316,7 @@ def forward(
         use_cache: Optional[bool] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         **kwargs,
-    ):
+    ) -> DeepseekVLHybridBaseModelOutputWithPast:
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError(
                 "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
@@ -428,7 +428,7 @@ def forward(
         use_cache: Optional[bool] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ):
+    ) -> DeepseekVLHybridCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py
index e9808b02ce34..2f646050fc25 100644
--- a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py
+++ b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py
@@ -302,7 +302,7 @@ def forward(
         use_cache: Optional[bool] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         **kwargs,
-    ):
+    ) -> DeepseekVLHybridBaseModelOutputWithPast:
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError(
                 "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
@@ -366,7 +366,7 @@ def forward(
         use_cache: Optional[bool] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ):
+    ) -> DeepseekVLHybridCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/deprecated/mmbt/modeling_mmbt.py b/src/transformers/models/deprecated/mmbt/modeling_mmbt.py
index 45ae577f7fce..a257ee64fbfe 100644
--- a/src/transformers/models/deprecated/mmbt/modeling_mmbt.py
+++ b/src/transformers/models/deprecated/mmbt/modeling_mmbt.py
@@ -15,6 +15,8 @@
 # limitations under the License.
 """PyTorch MMBT model."""
 
+from typing import Union
+
 import torch
 from torch import nn
 from torch.nn import CrossEntropyLoss, MSELoss
@@ -204,7 +206,7 @@ def forward(
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
-    ):
+    ) -> Union[tuple, BaseModelOutputWithPooling]:
         r"""
         Returns:
 
diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py
index 594524c8dd1c..bcc8bdcc76c7 100644
--- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py
+++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py
@@ -1506,7 +1506,7 @@ def forward(
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
-    ):
+    ) -> Union[tuple, GroundingDinoEncoderOutput]:
         r"""
         Args:
             vision_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
@@ -1660,7 +1660,7 @@ def forward(
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
-    ):
+    ) -> Union[tuple, GroundingDinoDecoderOutput]:
         r"""
         Args:
             inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
@@ -2052,7 +2052,7 @@ def forward(
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
-    ):
+    ) -> Union[tuple, GroundingDinoModelOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`):
             Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
diff --git a/src/transformers/models/janus/modeling_janus.py b/src/transformers/models/janus/modeling_janus.py
index 94e1c6288bd3..a0a98910f0e9 100644
--- a/src/transformers/models/janus/modeling_janus.py
+++ b/src/transformers/models/janus/modeling_janus.py
@@ -1126,7 +1126,7 @@ def forward(
         use_cache: Optional[bool] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         **kwargs,
-    ):
+    ) -> JanusBaseModelOutputWithPast:
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError(
                 "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
@@ -1202,7 +1202,7 @@ def forward(
         use_cache: Optional[bool] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ):
+    ) -> JanusCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/janus/modular_janus.py b/src/transformers/models/janus/modular_janus.py
index a8e24a86b0d3..85f80bdf456a 100644
--- a/src/transformers/models/janus/modular_janus.py
+++ b/src/transformers/models/janus/modular_janus.py
@@ -941,7 +941,7 @@ def forward(
         use_cache: Optional[bool] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         **kwargs,
-    ):
+    ) -> JanusBaseModelOutputWithPast:
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError(
                 "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
@@ -1017,7 +1017,7 @@ def forward(
         use_cache: Optional[bool] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ):
+    ) -> JanusCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/maskformer/modeling_maskformer_swin.py b/src/transformers/models/maskformer/modeling_maskformer_swin.py
index 2de478440414..c4ef364d6831 100644
--- a/src/transformers/models/maskformer/modeling_maskformer_swin.py
+++ b/src/transformers/models/maskformer/modeling_maskformer_swin.py
@@ -19,7 +19,7 @@
 import collections.abc
 import math
 from dataclasses import dataclass
-from typing import Optional
+from typing import Optional, Union
 
 import torch
 from torch import Tensor, nn
@@ -687,7 +687,7 @@ def forward(
         output_attentions=False,
         output_hidden_states=False,
         return_dict=True,
-    ):
+    ) -> Union[tuple, MaskFormerSwinBaseModelOutput]:
         all_hidden_states = () if output_hidden_states else None
         all_input_dimensions = ()
         all_self_attentions = () if output_attentions else None
@@ -783,7 +783,7 @@ def forward(
         output_hidden_states=None,
         interpolate_pos_encoding=False,
         return_dict=None,
-    ):
+    ) -> Union[tuple, MaskFormerSwinModelOutputWithPooling]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py b/src/transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py
index b27d6ac42a3a..7c94e8ddc324 100644
--- a/src/transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py
+++ b/src/transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py
@@ -1176,7 +1176,7 @@ def forward(
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
-    ):
+    ) -> Union[tuple, MMGroundingDinoEncoderOutput]:
         r"""
         Args:
             vision_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
@@ -1472,7 +1472,7 @@ def forward(
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
-    ):
+    ) -> Union[tuple, MMGroundingDinoDecoderOutput]:
         r"""
         Args:
             inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
@@ -1947,7 +1947,7 @@ def forward(
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
-    ):
+    ) -> Union[tuple, MMGroundingDinoModelOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`):
             Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
diff --git a/src/transformers/models/tvp/modeling_tvp.py b/src/transformers/models/tvp/modeling_tvp.py
index dcbd220331f9..65b5328e97a4 100644
--- a/src/transformers/models/tvp/modeling_tvp.py
+++ b/src/transformers/models/tvp/modeling_tvp.py
@@ -16,7 +16,7 @@
 
 import math
 from dataclasses import dataclass
-from typing import Optional
+from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -494,7 +494,7 @@ def forward(
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
-    ):
+    ) -> Union[tuple, BaseModelOutput]:
         return_dict = return_dict if return_dict is not None else self.config.return_dict
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -759,7 +759,7 @@ def forward(
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         interpolate_pos_encoding: bool = False,
-    ):
+    ) -> Union[tuple, BaseModelOutputWithPooling]:
         r"""
         Examples:
         ```python
@@ -862,7 +862,7 @@ def forward(
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         interpolate_pos_encoding: bool = False,
-    ):
+    ) -> Union[tuple, TvpVideoGroundingOutput]:
         r"""
         labels (`torch.FloatTensor` of shape `(batch_size, 3)`, *optional*):
             The labels contains duration, start time, and end time of the video corresponding to the text.
diff --git a/src/transformers/models/udop/modeling_udop.py b/src/transformers/models/udop/modeling_udop.py
index 22f45731030e..d6affc169495 100644
--- a/src/transformers/models/udop/modeling_udop.py
+++ b/src/transformers/models/udop/modeling_udop.py
@@ -1155,7 +1155,7 @@ def forward(
         output_hidden_states=None,
         return_dict=None,
         cache_position=None,
-    ):
+    ) -> Union[tuple, BaseModelOutputWithAttentionMask]:
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -1535,7 +1535,7 @@ def forward(
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
-    ) -> tuple[Tensor, ...]:
+    ) -> Union[tuple, Seq2SeqModelOutput]:
         r"""
         bbox (`torch.LongTensor` of shape `({0}, 4)`, *optional*):
             Bounding boxes of each input sequence tokens. Selected in the range `[0,
@@ -1732,7 +1732,7 @@ def forward(
         return_dict: Optional[bool] = None,
         labels: Optional[Tensor] = None,
         cache_position: Optional[torch.LongTensor] = None,
-    ) -> tuple[Tensor, ...]:
+    ) -> Union[tuple, Seq2SeqLMOutput]:
         r"""
         bbox (`torch.LongTensor` of shape `({0}, 4)`, *optional*):
             Bounding boxes of each input sequence tokens. Selected in the range `[0,

From b7b235a5e4193dbd294dc362c59a7f9885b8b098 Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Tue, 30 Sep 2025 14:32:18 +0200
Subject: [PATCH 2/2] Fix incorrect type hint: FlavaOutput (nn.Module
 subclass) -> FlavaModelOutput

---
 src/transformers/models/flava/modeling_flava.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/flava/modeling_flava.py b/src/transformers/models/flava/modeling_flava.py
index c48f2ca1279f..292ce3395734 100644
--- a/src/transformers/models/flava/modeling_flava.py
+++ b/src/transformers/models/flava/modeling_flava.py
@@ -1191,7 +1191,7 @@ def forward(
         output_attentions: Optional[bool] = None,
         output_hidden_states: bool = True,
         return_dict: Optional[bool] = None,
-    ) -> Union[tuple, FlavaOutput]:
+    ) -> Union[tuple, FlavaModelOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, image_num_patches + text_seq_len)`):
             Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
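
Note on the convention behind patch 1/2: these `forward`/`encode`/`decode` methods return a plain tuple when `return_dict=False` and a `ModelOutput` subclass otherwise, so `Union[tuple, <ModelOutput subclass>]` is the annotation that matches both paths. Below is a minimal sketch of that pattern, not code from the patch: `ToyEncoderOutput` and `ToyEncoder` are made-up names, and the `return_dict` handling is simplified (real models fall back to `config.use_return_dict` when it is `None`).

```python
from dataclasses import dataclass
from typing import Optional, Union

import torch
from transformers.utils import ModelOutput


@dataclass
class ToyEncoderOutput(ModelOutput):
    """Illustrative ModelOutput subclass; not a real transformers class."""

    audio_codes: Optional[torch.Tensor] = None
    quantized_representation: Optional[torch.Tensor] = None


class ToyEncoder(torch.nn.Module):
    def forward(
        self,
        input_values: torch.Tensor,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, ToyEncoderOutput]:
        # Stand-in computation; a real codec would quantize here.
        audio_codes = input_values.argmax(dim=-1)
        quantized_representation = input_values

        if not return_dict:
            # Legacy path: a plain tuple, the `tuple` half of the annotation.
            return (audio_codes, quantized_representation)
        # ModelOutput path: supports attribute, key, and index access.
        return ToyEncoderOutput(
            audio_codes=audio_codes,
            quantized_representation=quantized_representation,
        )


out = ToyEncoder()(torch.randn(2, 8), return_dict=True)
assert out.audio_codes is out["audio_codes"] is out[0]
```

Patch 2/2 is the same convention applied in reverse: `FlavaOutput` is an `nn.Module` layer class, so it can never be the returned value, while `FlavaModelOutput` is the `ModelOutput` dataclass that the `return_dict=True` path actually produces.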