diff --git a/src/transformers/models/aria/modeling_aria.py b/src/transformers/models/aria/modeling_aria.py index 76f3da90b1da..3c0abffe2430 100644 --- a/src/transformers/models/aria/modeling_aria.py +++ b/src/transformers/models/aria/modeling_aria.py @@ -596,7 +596,7 @@ def _init_weights(self, module): @auto_docstring class AriaPreTrainedModel(PreTrainedModel): config: AriaConfig - base_model_prefix = "" + base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["AriaDecoderLayer"] _skip_keys_device_placement = ["past_key_values"] @@ -893,6 +893,10 @@ class AriaModelOutputWithPast(BaseModelOutputWithPast): """ ) class AriaModel(AriaPreTrainedModel): + _checkpoint_conversion_mapping = { + r"^language_model.model": "language_model", + } + def __init__(self, config: AriaConfig): super().__init__(config) self.vision_tower = AutoModel.from_config(config.vision_config) diff --git a/src/transformers/models/aria/modular_aria.py b/src/transformers/models/aria/modular_aria.py index 28b62c390f7d..0afe2ae23e98 100644 --- a/src/transformers/models/aria/modular_aria.py +++ b/src/transformers/models/aria/modular_aria.py @@ -1206,7 +1206,7 @@ def _init_weights(self, module): class AriaPreTrainedModel(LlamaPreTrainedModel): config: AriaConfig - base_model_prefix = "" + base_model_prefix = "model" _can_compile_fullgraph = False # MoE models don't work with torch.compile (dynamic slicing) _supports_attention_backend = True diff --git a/src/transformers/models/aya_vision/modeling_aya_vision.py b/src/transformers/models/aya_vision/modeling_aya_vision.py index 6e57e0a04178..734dff382416 100644 --- a/src/transformers/models/aya_vision/modeling_aya_vision.py +++ b/src/transformers/models/aya_vision/modeling_aya_vision.py @@ -90,6 +90,7 @@ def pixel_shuffle(self, image_features): # B, S, D @auto_docstring class AyaVisionPreTrainedModel(PreTrainedModel): config: AyaVisionConfig + base_model_prefix = "model" input_modalities = ["image", "text"] supports_gradient_checkpointing = True _skip_keys_device_placement = "past_key_values" @@ -162,6 +163,10 @@ class AyaVisionModelOutputWithPast(BaseModelOutputWithPast): """ ) class AyaVisionModel(AyaVisionPreTrainedModel): + _checkpoint_conversion_mapping = { + r"^language_model.model": "language_model", + } + def __init__(self, config: AyaVisionConfig): super().__init__(config) self.vision_tower = AutoModel.from_config(config.vision_config) diff --git a/src/transformers/models/blt/modeling_blt.py b/src/transformers/models/blt/modeling_blt.py index ae40c5e75ab9..d4b19101c861 100644 --- a/src/transformers/models/blt/modeling_blt.py +++ b/src/transformers/models/blt/modeling_blt.py @@ -430,6 +430,7 @@ def forward( @auto_docstring class BltPreTrainedModel(PreTrainedModel): config: BltConfig + base_model_prefix = "model" input_modalities = ["image", "text"] supports_gradient_checkpointing = True _no_split_modules = ["BltTransformerLayer"] diff --git a/src/transformers/models/clvp/modeling_clvp.py b/src/transformers/models/clvp/modeling_clvp.py index 6c8171547aa3..835275f4e20c 100644 --- a/src/transformers/models/clvp/modeling_clvp.py +++ b/src/transformers/models/clvp/modeling_clvp.py @@ -778,7 +778,7 @@ def forward( @auto_docstring class ClvpPreTrainedModel(PreTrainedModel): config: ClvpConfig - base_model_prefix = "clvp" + base_model_prefix = "model" supports_gradient_checkpointing = True _skip_keys_device_placement = "past_key_values" diff --git a/src/transformers/models/cohere2_vision/modeling_cohere2_vision.py 
b/src/transformers/models/cohere2_vision/modeling_cohere2_vision.py index f3b6e8a8aff4..083fde1f9197 100644 --- a/src/transformers/models/cohere2_vision/modeling_cohere2_vision.py +++ b/src/transformers/models/cohere2_vision/modeling_cohere2_vision.py @@ -129,6 +129,7 @@ class Cohere2VisionCausalLMOutputWithPast(ModelOutput): @auto_docstring class Cohere2VisionPreTrainedModel(PreTrainedModel): config: Cohere2VisionConfig + base_model_prefix = "model" input_modalities = ["image", "text"] supports_gradient_checkpointing = True _skip_keys_device_placement = "past_key_values" @@ -142,7 +143,6 @@ class Cohere2VisionPreTrainedModel(PreTrainedModel): "hidden_states": "DecoderLayer", "attentions": "Attention", } - base_model_prefix = "model" @auto_docstring( diff --git a/src/transformers/models/emu3/modeling_emu3.py b/src/transformers/models/emu3/modeling_emu3.py index bf0bb66c8880..7d900cf4a27a 100644 --- a/src/transformers/models/emu3/modeling_emu3.py +++ b/src/transformers/models/emu3/modeling_emu3.py @@ -1490,7 +1490,6 @@ def forward( class Emu3ForConditionalGeneration(Emu3PreTrainedModel, GenerationMixin): - base_model_prefix = "" output_modalities = ["image", "text"] _tied_weights_keys = {"lm_head.weight": "model.text_model.embed_tokens.weight"} _checkpoint_conversion_mapping = { diff --git a/src/transformers/models/emu3/modular_emu3.py b/src/transformers/models/emu3/modular_emu3.py index a70fd220b582..1ef5e23a4436 100644 --- a/src/transformers/models/emu3/modular_emu3.py +++ b/src/transformers/models/emu3/modular_emu3.py @@ -1044,7 +1044,6 @@ def forward( class Emu3ForConditionalGeneration(Emu3PreTrainedModel, GenerationMixin): - base_model_prefix = "" output_modalities = ["image", "text"] _tied_weights_keys = {"lm_head.weight": "model.text_model.embed_tokens.weight"} _checkpoint_conversion_mapping = { diff --git a/src/transformers/models/flava/modeling_flava.py b/src/transformers/models/flava/modeling_flava.py index effb5111cf96..d5f263137b3d 100644 --- a/src/transformers/models/flava/modeling_flava.py +++ b/src/transformers/models/flava/modeling_flava.py @@ -1298,7 +1298,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: """ ) class FlavaImageCodebook(FlavaPreTrainedModel): - base_model_prefix = "" + base_model_prefix = "model" config: FlavaImageCodebookConfig main_input_name = "pixel_values" input_modalities = "image" diff --git a/src/transformers/models/florence2/modeling_florence2.py b/src/transformers/models/florence2/modeling_florence2.py index 34aa2f8f454d..4d6d38c5db29 100644 --- a/src/transformers/models/florence2/modeling_florence2.py +++ b/src/transformers/models/florence2/modeling_florence2.py @@ -615,6 +615,7 @@ class Florence2Seq2SeqLMOutput(Seq2SeqLMOutput): @auto_docstring class Florence2PreTrainedModel(PreTrainedModel): config: Florence2Config + base_model_prefix = "model" input_modalities = ["image", "text"] supports_gradient_checkpointing = True _skip_keys_device_placement = "past_key_values" @@ -627,7 +628,6 @@ class Florence2PreTrainedModel(PreTrainedModel): _supports_attention_backend = False config_class = Florence2Config - base_model_prefix = "model" @auto_docstring( diff --git a/src/transformers/models/gemma3/modeling_gemma3.py b/src/transformers/models/gemma3/modeling_gemma3.py index b7aef2390f20..c623324226ac 100644 --- a/src/transformers/models/gemma3/modeling_gemma3.py +++ b/src/transformers/models/gemma3/modeling_gemma3.py @@ -446,7 +446,7 @@ def forward( @auto_docstring class Gemma3PreTrainedModel(PreTrainedModel): config: Gemma3Config - 
base_model_prefix = "" + base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = [ "Gemma3DecoderLayer", @@ -632,7 +632,6 @@ class Gemma3ForCausalLM(Gemma3PreTrainedModel, GenerationMixin): _tp_plan = {"lm_head": "colwise_rep"} _pp_plan = {"lm_head": (["hidden_states"], ["logits"])} config: Gemma3TextConfig - base_model_prefix = "model" def __init__(self, config: Gemma3TextConfig): super().__init__(config) diff --git a/src/transformers/models/gemma3/modular_gemma3.py b/src/transformers/models/gemma3/modular_gemma3.py index 47e1b49ac7bb..a1b0a57928d1 100644 --- a/src/transformers/models/gemma3/modular_gemma3.py +++ b/src/transformers/models/gemma3/modular_gemma3.py @@ -561,7 +561,7 @@ def forward( class Gemma3PreTrainedModel(Gemma2PreTrainedModel): - base_model_prefix = "" + base_model_prefix = "model" input_modalities = ["image", "text"] _no_split_modules = [ "Gemma3DecoderLayer", @@ -717,7 +717,6 @@ def forward( class Gemma3ForCausalLM(Gemma2ForCausalLM): config: Gemma3TextConfig - base_model_prefix = "model" def __init__(self, config: Gemma3TextConfig): super().__init__(config) diff --git a/src/transformers/models/gemma3n/modeling_gemma3n.py b/src/transformers/models/gemma3n/modeling_gemma3n.py index 5111a69cebfe..0b3088aadec7 100644 --- a/src/transformers/models/gemma3n/modeling_gemma3n.py +++ b/src/transformers/models/gemma3n/modeling_gemma3n.py @@ -1939,7 +1939,6 @@ class Gemma3nForCausalLM(Gemma3nPreTrainedModel, GenerationMixin): _tp_plan = {"lm_head": "colwise_rep"} _pp_plan = {"lm_head": (["hidden_states"], ["logits"])} config: Gemma3nTextConfig - base_model_prefix = "model" _checkpoint_conversion_mapping = {"model.language_model": "model"} def __init__(self, config: Gemma3nTextConfig): @@ -2349,7 +2348,6 @@ def get_audio_features( class Gemma3nForConditionalGeneration(Gemma3nPreTrainedModel, GenerationMixin): _checkpoint_conversion_mapping = {} _tied_weights_keys = {"lm_head.weight": "model.language_model.embed_tokens.weight"} - base_model_prefix = "model" def __init__(self, config: Gemma3nConfig): super().__init__(config) diff --git a/src/transformers/models/gemma3n/modular_gemma3n.py b/src/transformers/models/gemma3n/modular_gemma3n.py index 375bd93f2723..02cf9b9f4833 100644 --- a/src/transformers/models/gemma3n/modular_gemma3n.py +++ b/src/transformers/models/gemma3n/modular_gemma3n.py @@ -2116,7 +2116,6 @@ def forward( @auto_docstring(custom_intro="The base Gemma 3n language model with a language modeling head.") class Gemma3nForCausalLM(Gemma3ForCausalLM): _checkpoint_conversion_mapping = {"model.language_model": "model"} - base_model_prefix = "model" class Gemma3nMultimodalEmbedder(nn.Module): @@ -2421,7 +2420,6 @@ def get_audio_features( ) class Gemma3nForConditionalGeneration(PaliGemmaForConditionalGeneration): _checkpoint_conversion_mapping = {} - base_model_prefix = "model" @property def audio_tower(self): diff --git a/src/transformers/models/glm46v/modeling_glm46v.py b/src/transformers/models/glm46v/modeling_glm46v.py index 7fc18482c4da..8c20786c955e 100644 --- a/src/transformers/models/glm46v/modeling_glm46v.py +++ b/src/transformers/models/glm46v/modeling_glm46v.py @@ -78,7 +78,7 @@ class Glm46VModelOutputWithPast(ModelOutput): @auto_docstring class Glm46VModel(Glm46VPreTrainedModel): - base_model_prefix = "" + base_model_prefix = "model" _checkpoint_conversion_mapping = {} # Reference: fix gemma3 grad acc #37208 accepts_loss_kwargs = False @@ -583,6 +583,8 @@ def forward( The temporal, height and width of feature shape of each image in 
LLM. video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*): The temporal, height and width of feature shape of each video in LLM. + rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*): + The rope index difference between sequence length and multimodal rope. Example: diff --git a/src/transformers/models/glm4v/modeling_glm4v.py b/src/transformers/models/glm4v/modeling_glm4v.py index 47ad72ac96ce..9942770d70e3 100644 --- a/src/transformers/models/glm4v/modeling_glm4v.py +++ b/src/transformers/models/glm4v/modeling_glm4v.py @@ -926,7 +926,7 @@ def forward( @auto_docstring class Glm4vModel(Glm4vPreTrainedModel): - base_model_prefix = "" + base_model_prefix = "model" _checkpoint_conversion_mapping = {} # Reference: fix gemma3 grad acc #37208 accepts_loss_kwargs = False @@ -1431,6 +1431,8 @@ def forward( The temporal, height and width of feature shape of each image in LLM. video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*): The temporal, height and width of feature shape of each video in LLM. + rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*): + The rope index difference between sequence length and multimodal rope. Example: diff --git a/src/transformers/models/glm4v/modular_glm4v.py b/src/transformers/models/glm4v/modular_glm4v.py index 2df8b6f9d04a..85cd3ed44085 100644 --- a/src/transformers/models/glm4v/modular_glm4v.py +++ b/src/transformers/models/glm4v/modular_glm4v.py @@ -1350,6 +1350,8 @@ def forward( The temporal, height and width of feature shape of each image in LLM. video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*): The temporal, height and width of feature shape of each video in LLM. + rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*): + The rope index difference between sequence length and multimodal rope. Example: diff --git a/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py b/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py index 631505562bc6..a212aed44e81 100644 --- a/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py @@ -537,7 +537,7 @@ def forward( @auto_docstring class Glm4vMoePreTrainedModel(PreTrainedModel): config: Glm4vMoeConfig - base_model_prefix = "" + base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["Glm4vMoeTextDecoderLayer", "Glm4vMoeVisionBlock"] _skip_keys_device_placement = "past_key_values" @@ -1090,7 +1090,7 @@ def forward( @auto_docstring class Glm4vMoeModel(Glm4vMoePreTrainedModel): - base_model_prefix = "" + base_model_prefix = "model" _checkpoint_conversion_mapping = {} # Reference: fix gemma3 grad acc #37208 accepts_loss_kwargs = False @@ -1648,6 +1648,8 @@ def forward( The temporal, height and width of feature shape of each image in LLM. video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*): The temporal, height and width of feature shape of each video in LLM. + rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*): + The rope index difference between sequence length and multimodal rope. 
Example: diff --git a/src/transformers/models/glm4v_moe/modular_glm4v_moe.py b/src/transformers/models/glm4v_moe/modular_glm4v_moe.py index c69ca8439315..67419f310521 100644 --- a/src/transformers/models/glm4v_moe/modular_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/modular_glm4v_moe.py @@ -475,7 +475,7 @@ def __init__(self, config: Glm4vMoeTextConfig, layer_idx: int): class Glm4vMoePreTrainedModel(Glm4MoePreTrainedModel): config: Glm4vMoeConfig - base_model_prefix = "" + base_model_prefix = "model" input_modalities = ["text", "image", "video"] _no_split_modules = ["Glm4vMoeTextDecoderLayer", "Glm4vMoeVisionBlock"] _skip_keys_device_placement = "past_key_values" diff --git a/src/transformers/models/got_ocr2/modeling_got_ocr2.py b/src/transformers/models/got_ocr2/modeling_got_ocr2.py index ad07a2d61e6d..3ac6bd177220 100644 --- a/src/transformers/models/got_ocr2/modeling_got_ocr2.py +++ b/src/transformers/models/got_ocr2/modeling_got_ocr2.py @@ -277,6 +277,7 @@ def forward(self, hidden_states: torch.Tensor) -> tuple[torch.FloatTensor]: @auto_docstring class GotOcr2PreTrainedModel(PreTrainedModel): config: GotOcr2Config + base_model_prefix = "model" input_modalities = ["image", "text"] supports_gradient_checkpointing = True _skip_keys_device_placement = "past_key_values" @@ -532,6 +533,10 @@ class GotOcr2ModelOutputWithPast(BaseModelOutputWithPast): """ ) class GotOcr2Model(GotOcr2PreTrainedModel): + _checkpoint_conversion_mapping = { + r"^language_model.model": "language_model", + } + def __init__(self, config: GotOcr2Config): super().__init__(config) self.vision_tower = GotOcr2VisionEncoder(config.vision_config) diff --git a/src/transformers/models/internvl/modeling_internvl.py b/src/transformers/models/internvl/modeling_internvl.py index 0ec57c60a20c..6e586ce999d5 100644 --- a/src/transformers/models/internvl/modeling_internvl.py +++ b/src/transformers/models/internvl/modeling_internvl.py @@ -473,6 +473,7 @@ def forward( @auto_docstring class InternVLPreTrainedModel(PreTrainedModel): config: InternVLConfig + base_model_prefix = "model" input_modalities = ["image", "text", "video"] supports_gradient_checkpointing = True _skip_keys_device_placement = "past_key_values" @@ -530,6 +531,10 @@ class InternVLModelOutputWithPast(BaseModelOutputWithPast): """ ) class InternVLModel(InternVLPreTrainedModel): + _checkpoint_conversion_mapping = { + r"^language_model.model": "language_model", + } + def __init__(self, config: InternVLConfig): super().__init__(config) self.vision_tower = AutoModel.from_config(config.vision_config) diff --git a/src/transformers/models/jetmoe/modeling_jetmoe.py b/src/transformers/models/jetmoe/modeling_jetmoe.py index afa139d66f49..b102a111e10f 100644 --- a/src/transformers/models/jetmoe/modeling_jetmoe.py +++ b/src/transformers/models/jetmoe/modeling_jetmoe.py @@ -568,7 +568,7 @@ def forward( @auto_docstring class JetMoePreTrainedModel(PreTrainedModel): config: JetMoeConfig - base_model_prefix = "transformer" + base_model_prefix = "model" supports_gradient_checkpointing = False _no_split_modules = ["JetMoeDecoderLayer"] _skip_keys_device_placement = ["past_key_values"] diff --git a/src/transformers/models/jetmoe/modular_jetmoe.py b/src/transformers/models/jetmoe/modular_jetmoe.py index 41babe708c63..db8c3e1059c0 100644 --- a/src/transformers/models/jetmoe/modular_jetmoe.py +++ b/src/transformers/models/jetmoe/modular_jetmoe.py @@ -429,7 +429,7 @@ class JetMoePreTrainedModel(MixtralPreTrainedModel): "attentions": OutputRecorder(JetMoeAttention, index=1), } config: 
JetMoeConfig - base_model_prefix = "transformer" + base_model_prefix = "model" supports_gradient_checkpointing = False _no_split_modules = ["JetMoeDecoderLayer"] _skip_keys_device_placement = ["past_key_values"] diff --git a/src/transformers/models/lfm2_vl/modeling_lfm2_vl.py b/src/transformers/models/lfm2_vl/modeling_lfm2_vl.py index 34d35d7cda8a..27ba1ece7af4 100755 --- a/src/transformers/models/lfm2_vl/modeling_lfm2_vl.py +++ b/src/transformers/models/lfm2_vl/modeling_lfm2_vl.py @@ -76,6 +76,7 @@ def pixel_unshuffle(self, hidden_states: torch.Tensor): @auto_docstring class Lfm2VlPreTrainedModel(PreTrainedModel): config: Lfm2VlConfig + base_model_prefix = "model" input_modalities = ["image", "text"] supports_gradient_checkpointing = True _skip_keys_device_placement = "past_key_values" @@ -85,7 +86,6 @@ class Lfm2VlPreTrainedModel(PreTrainedModel): _can_compile_fullgraph = False _supports_flex_attn = True _supports_attention_backend = True - base_model_prefix = "model" @dataclass diff --git a/src/transformers/models/llama4/modeling_llama4.py b/src/transformers/models/llama4/modeling_llama4.py index 355a4f145779..231e04c8eba2 100644 --- a/src/transformers/models/llama4/modeling_llama4.py +++ b/src/transformers/models/llama4/modeling_llama4.py @@ -1166,7 +1166,7 @@ def forward( class Llama4ForConditionalGeneration(Llama4PreTrainedModel, GenerationMixin): _no_split_modules = ["Llama4TextDecoderLayer", "Llama4VisionEncoderLayer"] _tp_plan = {} - base_model_prefix = "" + base_model_prefix = "model" config: Llama4Config def __init__(self, config: Llama4Config): diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py index 0541b9176502..1f8a2a9645ea 100644 --- a/src/transformers/models/llava/modeling_llava.py +++ b/src/transformers/models/llava/modeling_llava.py @@ -110,6 +110,7 @@ def forward(self, image_features): @auto_docstring class LlavaPreTrainedModel(PreTrainedModel): config: LlavaConfig + base_model_prefix = "model" input_modalities = ["image", "text"] supports_gradient_checkpointing = True _skip_keys_device_placement = "past_key_values" @@ -128,6 +129,10 @@ class LlavaPreTrainedModel(PreTrainedModel): """ ) class LlavaModel(LlavaPreTrainedModel): + _checkpoint_conversion_mapping = { + r"^language_model.model": "language_model", + } + def __init__(self, config: LlavaConfig): super().__init__(config) self.vision_tower = AutoModel.from_config(config.vision_config) diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py index cd155dbce452..a83821d98f96 100644 --- a/src/transformers/models/llava_next/modeling_llava_next.py +++ b/src/transformers/models/llava_next/modeling_llava_next.py @@ -223,7 +223,7 @@ def forward(self, image_features): @auto_docstring class LlavaNextPreTrainedModel(PreTrainedModel): config: LlavaNextConfig - base_model_prefix = "" + base_model_prefix = "model" input_modalities = ["image", "text"] supports_gradient_checkpointing = True _no_split_modules = ["LlamaDecoderLayer"] diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index 34feba5e18c7..7f6fbffaec07 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -164,7 +164,7 @@ def forward(self, image_features): @auto_docstring class LlavaNextVideoPreTrainedModel(PreTrainedModel): 
config: LlavaNextVideoConfig - base_model_prefix = "" + base_model_prefix = "model" input_modalities = ["image", "video", "text"] supports_gradient_checkpointing = True _no_split_modules = ["LlamaDecoderLayer"] diff --git a/src/transformers/models/llava_onevision/modeling_llava_onevision.py b/src/transformers/models/llava_onevision/modeling_llava_onevision.py index 1bd91aee4715..efe4f6fb1ba6 100644 --- a/src/transformers/models/llava_onevision/modeling_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modeling_llava_onevision.py @@ -105,7 +105,7 @@ class LlavaOnevisionCausalLMOutputWithPast(ModelOutput): @auto_docstring class LlavaOnevisionPreTrainedModel(PreTrainedModel): config: LlavaOnevisionConfig - base_model_prefix = "" + base_model_prefix = "model" input_modalities = ["image", "video", "text"] supports_gradient_checkpointing = True _no_split_modules = ["LlamaDecoderLayer"] diff --git a/src/transformers/models/mistral3/modeling_mistral3.py b/src/transformers/models/mistral3/modeling_mistral3.py index 935279fe6485..6a67f21f216f 100644 --- a/src/transformers/models/mistral3/modeling_mistral3.py +++ b/src/transformers/models/mistral3/modeling_mistral3.py @@ -176,6 +176,7 @@ class Mistral3ModelOutputWithPast(BaseModelOutputWithPast): @auto_docstring class Mistral3PreTrainedModel(PreTrainedModel): config: Mistral3Config + base_model_prefix = "model" input_modalities = ["image", "text"] supports_gradient_checkpointing = True _skip_keys_device_placement = "past_key_values" @@ -194,6 +195,10 @@ class Mistral3PreTrainedModel(PreTrainedModel): """ ) class Mistral3Model(Mistral3PreTrainedModel): + _checkpoint_conversion_mapping = { + r"^language_model.model": "language_model", + } + def __init__(self, config: Mistral3Config): super().__init__(config) self.vision_tower = AutoModel.from_config(config.vision_config) diff --git a/src/transformers/models/mllama/modeling_mllama.py b/src/transformers/models/mllama/modeling_mllama.py index f55131274194..9f811fa7f010 100644 --- a/src/transformers/models/mllama/modeling_mllama.py +++ b/src/transformers/models/mllama/modeling_mllama.py @@ -793,6 +793,7 @@ def forward(self, x, position_ids): @auto_docstring class MllamaPreTrainedModel(PreTrainedModel): config: MllamaConfig + base_model_prefix = "model" input_modalities = ["image", "text"] supports_gradient_checkpointing = True _no_split_modules = [ @@ -1436,7 +1437,6 @@ def forward( """ ) class MllamaModel(MllamaPreTrainedModel): - base_model_prefix = "" _checkpoint_conversion_mapping = { "language_model.model": "language_model", "model.vision_model": "vision_model", diff --git a/src/transformers/models/paligemma/modeling_paligemma.py b/src/transformers/models/paligemma/modeling_paligemma.py index 509906fa9e41..f1464bde5fb2 100644 --- a/src/transformers/models/paligemma/modeling_paligemma.py +++ b/src/transformers/models/paligemma/modeling_paligemma.py @@ -214,7 +214,7 @@ def create_causal_mask_mapping( @auto_docstring class PaliGemmaPreTrainedModel(PreTrainedModel): config: PaliGemmaConfig - base_model_prefix = "" + base_model_prefix = "model" input_modalities = ["image", "text"] supports_gradient_checkpointing = True _no_split_modules = ["PaliGemmaMultiModalProjector"] diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py index 5f2323321ec0..0a601deac183 100644 --- a/src/transformers/models/perception_lm/modeling_perception_lm.py +++ b/src/transformers/models/perception_lm/modeling_perception_lm.py @@ 
-89,6 +89,7 @@ def forward(self, features): @auto_docstring class PerceptionLMPreTrainedModel(PreTrainedModel): config: PerceptionLMConfig + base_model_prefix = "model" input_modalities = ["image", "text"] supports_gradient_checkpointing = True _skip_keys_device_placement = "past_key_values" @@ -99,7 +100,6 @@ class PerceptionLMPreTrainedModel(PreTrainedModel): _can_compile_fullgraph = True _supports_flex_attn = True _supports_attention_backend = True - base_model_prefix = "model" @dataclass diff --git a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py index 1a24d18939bb..4c035c3144eb 100644 --- a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py @@ -950,7 +950,7 @@ def forward( @auto_docstring class Qwen2_5_VLModel(Qwen2_5_VLPreTrainedModel): - base_model_prefix = "" + base_model_prefix = "model" _checkpoint_conversion_mapping = {"^model": "language_model"} # Reference: fix gemma3 grad acc #37208 accepts_loss_kwargs = False diff --git a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py index dc88dd02ee73..1331c9e70e12 100644 --- a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py @@ -343,7 +343,7 @@ class Qwen2_5_VLModelOutputWithPast(Qwen2VLModelOutputWithPast): class Qwen2_5_VLModel(Qwen2VLModel): config: Qwen2_5_VLConfig - base_model_prefix = "" + base_model_prefix = "model" _no_split_modules = ["Qwen2_5_VLDecoderLayer", "Qwen2_5_VLVisionBlock"] # Reference: fix gemma3 grad acc #37208 accepts_loss_kwargs = False diff --git a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py index c1b52ff75f9f..bad9709e32c1 100644 --- a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py @@ -923,7 +923,7 @@ def forward( @auto_docstring class Qwen2VLModel(Qwen2VLPreTrainedModel): - base_model_prefix = "" + base_model_prefix = "model" _checkpoint_conversion_mapping = {"^model": "language_model"} # Reference: fix gemma3 grad acc #37208 accepts_loss_kwargs = False diff --git a/src/transformers/models/video_llama_3/modeling_video_llama_3.py b/src/transformers/models/video_llama_3/modeling_video_llama_3.py index 37370bd91266..0b6d0277ad4a 100644 --- a/src/transformers/models/video_llama_3/modeling_video_llama_3.py +++ b/src/transformers/models/video_llama_3/modeling_video_llama_3.py @@ -517,7 +517,7 @@ class VideoLlama3ModelOutputWithPast(ModelOutput): @auto_docstring class VideoLlama3Model(VideoLlama3PreTrainedModel): - base_model_prefix = "" + base_model_prefix = "model" _checkpoint_conversion_mapping = {} # Reference: fix gemma3 grad acc #37208 accepts_loss_kwargs = False diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py index 4aaea9d762d5..559c30ef1f65 100644 --- a/src/transformers/models/video_llava/modeling_video_llava.py +++ b/src/transformers/models/video_llava/modeling_video_llava.py @@ -125,7 +125,7 @@ def forward(self, image_features): @auto_docstring class VideoLlavaPreTrainedModel(PreTrainedModel): config: VideoLlavaConfig - base_model_prefix = "" + base_model_prefix = "model" input_modalities = ["image", "video", "text"] supports_gradient_checkpointing = True _no_split_modules = ["VideoLlavaVisionAttention"] diff --git 
a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py index daca96966d07..02fdf4c4638c 100644 --- a/src/transformers/models/vipllava/modeling_vipllava.py +++ b/src/transformers/models/vipllava/modeling_vipllava.py @@ -113,6 +113,7 @@ def forward(self, hidden_states): @auto_docstring class VipLlavaPreTrainedModel(PreTrainedModel): config: VipLlavaConfig + base_model_prefix = "model" input_modalities = ["image", "text"] supports_gradient_checkpointing = True _skip_keys_device_placement = "past_key_values" @@ -131,6 +132,10 @@ class VipLlavaPreTrainedModel(PreTrainedModel): """ ) class VipLlavaModel(VipLlavaPreTrainedModel): + _checkpoint_conversion_mapping = { + r"^language_model.model": "language_model", + } + def __init__(self, config: VipLlavaConfig): super().__init__(config) self.vision_tower = AutoModel.from_config(config.vision_config) diff --git a/tests/models/audioflamingo3/test_modeling_audioflamingo3.py b/tests/models/audioflamingo3/test_modeling_audioflamingo3.py index b31f745435a8..c15c315c69bf 100644 --- a/tests/models/audioflamingo3/test_modeling_audioflamingo3.py +++ b/tests/models/audioflamingo3/test_modeling_audioflamingo3.py @@ -190,6 +190,10 @@ def test_sdpa_can_dispatch_on_flash(self): def test_flash_attn_2_inference_equivalence_right_padding(self): pass + @unittest.skip(reason="AudioFlamingo3 has no separate base model without a head.") + def test_model_base_model_prefix(self): + pass + def test_sdpa_can_dispatch_composite_models(self): # AF3 is audio+text composite; verify SDPA toggles propagate to submodules. if not self.has_attentions: diff --git a/tests/models/bark/test_modeling_bark.py b/tests/models/bark/test_modeling_bark.py index aebab17f624a..295a0baf0f58 100644 --- a/tests/models/bark/test_modeling_bark.py +++ b/tests/models/bark/test_modeling_bark.py @@ -597,6 +597,10 @@ def test_generate_fp16(self): model.generate(input_ids, attention_mask=attention_mask) model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) + @unittest.skip("Bark has no base model due to special architecture") + def test_model_base_model_prefix(self): + pass + @require_torch class BarkCoarseModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): @@ -683,6 +687,10 @@ def test_generate_fp16(self): model.generate(input_ids, attention_mask=attention_mask) model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) + @unittest.skip("Bark has no base model due to special architecture") + def test_model_base_model_prefix(self): + pass + @require_torch class BarkFineModelTest(ModelTesterMixin, unittest.TestCase): diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py index d852ffb6a943..8115c2f89ec2 100644 --- a/tests/models/blip_2/test_modeling_blip_2.py +++ b/tests/models/blip_2/test_modeling_blip_2.py @@ -500,6 +500,10 @@ def test_retain_grad_hidden_states_attentions(self): def test_model_get_set_embeddings(self): pass + @unittest.skip(reason="BLIP2 has no separate base model without a head.") + def test_model_base_model_prefix(self): + pass + def test_sdpa_can_dispatch_composite_models(self): """ Tests if composite models dispatch correctly on SDPA/eager when requested so when loading the model.
@@ -850,6 +854,10 @@ def test_model_get_set_embeddings(self): def test_cpu_offload(self): pass + @unittest.skip(reason="BLIP2 has no separate base model without a head.") + def test_model_base_model_prefix(self): + pass + def test_sdpa_can_dispatch_composite_models(self): """ Tests if composite models dispatch correctly on SDPA/eager when requested so when loading the model. diff --git a/tests/models/csm/test_modeling_csm.py b/tests/models/csm/test_modeling_csm.py index ccfc4e32c7b5..99594f5bf7d4 100644 --- a/tests/models/csm/test_modeling_csm.py +++ b/tests/models/csm/test_modeling_csm.py @@ -279,6 +279,10 @@ def test_model_parallel_beam_search(self): def test_tied_weights_keys(self): pass + @unittest.skip(reason="CSM has no separate base model without a head.") + def test_model_base_model_prefix(self): + pass + def _get_custom_4d_mask_test_data(self): """ Overrides [ModelTesterMixin._get_custom_4d_mask_test_data] to handle third input_ids dimension. diff --git a/tests/models/fuyu/test_modeling_fuyu.py b/tests/models/fuyu/test_modeling_fuyu.py index d5403e5d42ab..732014fb9f62 100644 --- a/tests/models/fuyu/test_modeling_fuyu.py +++ b/tests/models/fuyu/test_modeling_fuyu.py @@ -260,6 +260,10 @@ def test_eager_padding_matches_padding_free_with_position_ids(self): def test_sdpa_padding_matches_padding_free_with_position_ids(self): pass + @unittest.skip(reason="Fuyu has no separate base model without a head.") + def test_model_base_model_prefix(self): + pass + @slow @require_torch_accelerator diff --git a/tests/models/granite_speech/test_modeling_granite_speech.py b/tests/models/granite_speech/test_modeling_granite_speech.py index a92fa8c448f3..66a0545c0ea7 100644 --- a/tests/models/granite_speech/test_modeling_granite_speech.py +++ b/tests/models/granite_speech/test_modeling_granite_speech.py @@ -292,6 +292,10 @@ def test_sdpa_can_dispatch_composite_models(self): def test_eager_matches_sdpa_generate(self): pass + @unittest.skip(reason="GraniteSpeech has no separate base model without a head.") + def test_model_base_model_prefix(self): + pass + class GraniteSpeechForConditionalGenerationIntegrationTest(unittest.TestCase): def setUp(self): diff --git a/tests/models/instructblip/test_modeling_instructblip.py b/tests/models/instructblip/test_modeling_instructblip.py index 4882c14dba36..6a6050abb146 100644 --- a/tests/models/instructblip/test_modeling_instructblip.py +++ b/tests/models/instructblip/test_modeling_instructblip.py @@ -521,6 +521,10 @@ def test_retain_grad_hidden_states_attentions(self): def test_model_get_set_embeddings(self): pass + @unittest.skip(reason="InstructBLIP has no separate base model without a head.") + def test_model_base_model_prefix(self): + pass + def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py index 747469de3ffa..0046ccb0bf58 100644 --- a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py +++ b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py @@ -534,6 +534,10 @@ def test_retain_grad_hidden_states_attentions(self): def test_model_common_attributes(self): pass + @unittest.skip(reason="InstructBLIP has no separate base model without a head.") + def test_model_base_model_prefix(self): + pass + def test_forward_signature(self): for model_class in self.all_model_classes: config, _ = self.model_tester.prepare_config_and_inputs_for_common() 
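Note on the `_checkpoint_conversion_mapping` entries added above: a pattern such as r"^language_model.model" -> "language_model" lets `from_pretrained` rewrite keys from older checkpoints, where the language model was nested one level deeper, onto the flattened module tree. Below is a minimal sketch of the idea, assuming a hypothetical standalone helper (the real remapping happens inside the loading path, not through this function):

import re

def remap_state_dict_keys(state_dict: dict, conversion_mapping: dict) -> dict:
    # Hypothetical helper: apply every regex pattern to each key and keep
    # the tensor untouched. Mirrors the spirit of the loader, not its API.
    remapped = {}
    for key, tensor in state_dict.items():
        for pattern, replacement in conversion_mapping.items():
            key = re.sub(pattern, replacement, key)
        remapped[key] = tensor
    return remapped

# With {r"^language_model.model": "language_model"}, an old key such as
# "language_model.model.embed_tokens.weight" is rewritten to
# "language_model.embed_tokens.weight", matching e.g. LlavaModel's layout.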
diff --git a/tests/models/jetmoe/test_modeling_jetmoe.py b/tests/models/jetmoe/test_modeling_jetmoe.py index 9a5ed55e277b..508b8c67be19 100644 --- a/tests/models/jetmoe/test_modeling_jetmoe.py +++ b/tests/models/jetmoe/test_modeling_jetmoe.py @@ -113,6 +113,10 @@ class JetMoeModelTest(CausalLMModelTest, unittest.TestCase): def test_flash_attn_2_inference_equivalence_right_padding(self): self.skipTest(reason="JetMoe flash attention does not support right padding") + @unittest.skip(reason="JetMoe has no separate base model without a head.") + def test_model_base_model_prefix(self): + pass + @require_torch class JetMoeIntegrationTest(unittest.TestCase): diff --git a/tests/models/kosmos2/test_modeling_kosmos2.py b/tests/models/kosmos2/test_modeling_kosmos2.py index e8ec40c8c716..0686cd8c6026 100644 --- a/tests/models/kosmos2/test_modeling_kosmos2.py +++ b/tests/models/kosmos2/test_modeling_kosmos2.py @@ -408,6 +408,10 @@ def test_eager_padding_matches_padding_free_with_position_ids(self): def test_sdpa_padding_matches_padding_free_with_position_ids(self): pass + @unittest.skip(reason="Kosmos2 has no separate base model without a head.") + def test_model_base_model_prefix(self): + pass + @pytest.mark.generate def test_left_padding_compatibility(self): # Overwrite -- kosmos2 needs to prepare `image_embeds_position_mask`, and it must be padded accordingly diff --git a/tests/models/kosmos2_5/test_modeling_kosmos2_5.py b/tests/models/kosmos2_5/test_modeling_kosmos2_5.py index 751166f1775a..a2e7afb97799 100644 --- a/tests/models/kosmos2_5/test_modeling_kosmos2_5.py +++ b/tests/models/kosmos2_5/test_modeling_kosmos2_5.py @@ -376,6 +376,10 @@ def test_assisted_decoding_sample(self): def test_prompt_lookup_decoding_matches_greedy_search(self): pass + @unittest.skip(reason="Kosmos2.5 has no separate base model without a head.") + def test_model_base_model_prefix(self): + pass + def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) diff --git a/tests/models/longt5/test_modeling_longt5.py b/tests/models/longt5/test_modeling_longt5.py index e73ef8596a20..ed0ead4c9bce 100644 --- a/tests/models/longt5/test_modeling_longt5.py +++ b/tests/models/longt5/test_modeling_longt5.py @@ -734,6 +734,10 @@ def _check_encoder_attention_for_generate(self, attentions, batch_size, config, def test_load_save_without_tied_weights(self): pass + @unittest.skip(reason="LongT5 has no separate base model without a head.") + def test_model_base_model_prefix(self): + pass + @require_torch class LongT5TGlobalModelTest(LongT5ModelTest): @@ -871,6 +875,10 @@ def _check_encoder_attention_for_generate(self, attentions, batch_size, config, [encoder_expected_shape] * len(attentions), ) + @unittest.skip(reason="LongT5 has no separate base model without a head.") + def test_model_base_model_prefix(self): + pass + class LongT5EncoderOnlyModelTester: def __init__( diff --git a/tests/models/mllama/test_modeling_mllama.py b/tests/models/mllama/test_modeling_mllama.py index 2f488317d0f3..090d23b45fdb 100644 --- a/tests/models/mllama/test_modeling_mllama.py +++ b/tests/models/mllama/test_modeling_mllama.py @@ -130,6 +130,10 @@ def setUp(self): self.model_tester = MllamaText2TextModelTester(self) self.config_tester = ConfigTester(self, config_class=MllamaTextConfig, has_text_modality=True) + @unittest.skip("Mllama needs a different model prefix to load saved checkpoints") + def test_model_base_model_prefix(self): + pass + class
MllamaVisionText2TextModelTester: def __init__( diff --git a/tests/models/moshi/test_modeling_moshi.py b/tests/models/moshi/test_modeling_moshi.py index b274036b6efa..15fdf884e099 100644 --- a/tests/models/moshi/test_modeling_moshi.py +++ b/tests/models/moshi/test_modeling_moshi.py @@ -834,6 +834,10 @@ def test_prepare_inputs_for_generation_kwargs_forwards(self): last_hidden_state=torch.randn(2, 3, 32), kwargs_depth_decoder={} ) + @unittest.skip(reason="Moshi has no separate base model without a head.") + def test_model_base_model_prefix(self): + pass + def place_dict_on_device(dict_to_place, device): for key in dict_to_place: diff --git a/tests/models/mt5/test_modeling_mt5.py b/tests/models/mt5/test_modeling_mt5.py index 45a5ad01ab76..cd0ab3677643 100644 --- a/tests/models/mt5/test_modeling_mt5.py +++ b/tests/models/mt5/test_modeling_mt5.py @@ -735,6 +735,10 @@ def test_model_from_pretrained(self): model = MT5Model.from_pretrained(model_name) self.assertIsNotNone(model) + @unittest.skip(reason="MT5 has no separate base model without a head.") + def test_model_base_model_prefix(self): + pass + # Copied from tests.models.t5.test_modeling_t5.T5EncoderOnlyModelTester with T5->MT5 class MT5EncoderOnlyModelTester: diff --git a/tests/models/pix2struct/test_modeling_pix2struct.py b/tests/models/pix2struct/test_modeling_pix2struct.py index 04bbcd28c5be..42353d54ed13 100644 --- a/tests/models/pix2struct/test_modeling_pix2struct.py +++ b/tests/models/pix2struct/test_modeling_pix2struct.py @@ -648,6 +648,10 @@ def _check_encoder_hidden_states_for_generate(self, hidden_states, batch_size, c [encoder_expected_shape] * len(hidden_states), ) + @unittest.skip("Pix2Struct has no base model, it was implemented before standardization") + def test_model_base_model_prefix(self): + pass + # We will verify our results on an image of a stop sign def prepare_img(): diff --git a/tests/models/qwen2_5_omni/test_modeling_qwen2_5_omni.py b/tests/models/qwen2_5_omni/test_modeling_qwen2_5_omni.py index b54bf3f6d514..e5179f179e30 100644 --- a/tests/models/qwen2_5_omni/test_modeling_qwen2_5_omni.py +++ b/tests/models/qwen2_5_omni/test_modeling_qwen2_5_omni.py @@ -293,6 +293,10 @@ def test_sdpa_can_dispatch_on_flash(self): def test_model_outputs_equivalence(self): pass + @unittest.skip("Qwen2Omni has no base model, model architecture is special") + def test_model_base_model_prefix(self): + pass + def test_sdpa_can_dispatch_composite_models(self): # overwrite because Qwen2 is audio+text model (not vision+text) if not self.has_attentions: diff --git a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py index 0f57b8704192..25e1200332de 100644 --- a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py +++ b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py @@ -156,6 +156,10 @@ def test_sdpa_can_compile_dynamic(self): def test_sdpa_can_dispatch_on_flash(self): pass + @unittest.skip(reason="Qwen2Audio has no separate base model without a head.") + def test_model_base_model_prefix(self): + pass + def test_sdpa_can_dispatch_composite_models(self): # overwrite because Qwen2 is audio+text model (not vision+text) if not self.has_attentions: diff --git a/tests/models/qwen3_omni_moe/test_modeling_qwen3_omni_moe.py b/tests/models/qwen3_omni_moe/test_modeling_qwen3_omni_moe.py index 89456c4c891c..df0d53bf404e 100644 --- a/tests/models/qwen3_omni_moe/test_modeling_qwen3_omni_moe.py +++ b/tests/models/qwen3_omni_moe/test_modeling_qwen3_omni_moe.py @@ -477,6 +477,10 @@ def 
test_custom_4d_attention_mask(self): def test_model_is_small(self): pass + @unittest.skip("Qwen3Omni has no base model, model architecture is special") + def test_model_base_model_prefix(self): + pass + @unittest.skip("FIXME this is important, but in a rush to merge, cannot investigate now") def test_get_rope_index_video_with_audio(self): image_grid_thw = torch.empty((0, 3), dtype=torch.long) diff --git a/tests/models/switch_transformers/test_modeling_switch_transformers.py b/tests/models/switch_transformers/test_modeling_switch_transformers.py index 37202848242d..70dc94927786 100644 --- a/tests/models/switch_transformers/test_modeling_switch_transformers.py +++ b/tests/models/switch_transformers/test_modeling_switch_transformers.py @@ -708,6 +708,10 @@ def test_load_save_without_tied_weights(self): def test_retain_grad_hidden_states_attentions(self): pass + @unittest.skip(reason="SwitchTransformers has no separate base model without a head.") + def test_model_base_model_prefix(self): + pass + class SwitchTransformersEncoderOnlyModelTester: def __init__( diff --git a/tests/models/t5/test_modeling_t5.py b/tests/models/t5/test_modeling_t5.py index 52f85f17d9fb..ae5031e14dc5 100644 --- a/tests/models/t5/test_modeling_t5.py +++ b/tests/models/t5/test_modeling_t5.py @@ -743,6 +743,10 @@ def test_model_from_pretrained(self): model = T5Model.from_pretrained(model_name) self.assertIsNotNone(model) + @unittest.skip(reason="T5 has no separate base model without a head.") + def test_model_base_model_prefix(self): + pass + class T5EncoderOnlyModelTester: def __init__( diff --git a/tests/models/udop/test_modeling_udop.py b/tests/models/udop/test_modeling_udop.py index 2444ed9f5e78..e3973fd52896 100644 --- a/tests/models/udop/test_modeling_udop.py +++ b/tests/models/udop/test_modeling_udop.py @@ -330,6 +330,10 @@ def test_training_gradient_checkpointing_use_reentrant(self): def test_training_gradient_checkpointing_use_reentrant_false(self): pass + @unittest.skip(reason="Udop has no separate base model without a head.") + def test_model_base_model_prefix(self): + pass + def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/umt5/test_modeling_umt5.py b/tests/models/umt5/test_modeling_umt5.py index 58e6e923e8df..a6f90b5edc06 100644 --- a/tests/models/umt5/test_modeling_umt5.py +++ b/tests/models/umt5/test_modeling_umt5.py @@ -361,6 +361,10 @@ def test_training_gradient_checkpointing_use_reentrant(self): def test_training_gradient_checkpointing_use_reentrant_false(self): pass + @unittest.skip(reason="UMT5 has no separate base model without a head.") + def test_model_base_model_prefix(self): + pass + # Copied from tests.models.t5.test_modeling_t5.T5EncoderOnlyModelTester with T5->UMT5 class UMT5EncoderOnlyModelTester: diff --git a/tests/models/voxtral/test_modeling_voxtral.py b/tests/models/voxtral/test_modeling_voxtral.py index feaa58bbd10b..213c28538c11 100644 --- a/tests/models/voxtral/test_modeling_voxtral.py +++ b/tests/models/voxtral/test_modeling_voxtral.py @@ -189,6 +189,10 @@ def test_flash_attention_3_padding_matches_padding_free_with_position_ids(self): def test_flash_attention_3_padding_matches_padding_free_with_position_ids_and_fa_kwargs(self): pass + @unittest.skip(reason="Voxtral has no separate base model without a head.") + def test_model_base_model_prefix(self): + pass + def test_sdpa_can_dispatch_composite_models(self): # overwrite because Voxtral is audio+text model (not vision+text) if not self.has_attentions: 
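For context on the skips above and the common test added below: `base_model` is resolved through `base_model_prefix`, and it falls back to the model itself when no submodule matches the prefix. A simplified sketch of that resolution, assuming the `getattr` fallback used by `PreTrainedModel` (not a verbatim copy of the property):

import torch.nn as nn

class BaseModelResolutionSketch(nn.Module):
    # Simplified stand-in for PreTrainedModel's `base_model` property.
    base_model_prefix = "model"

    @property
    def base_model(self) -> nn.Module:
        # Falls back to `self` when no attribute matches the prefix -- the
        # failure mode the new common test guards against: an empty or wrong
        # prefix makes `base_model` return the head-bearing model itself.
        return getattr(self, self.base_model_prefix, self)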
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 2f83e6f7fe8d..036fea5ee3d1 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -1942,6 +1942,20 @@ def test_model_main_input_name(self): observed_main_input_name = list(model_signature.parameters.keys())[1] self.assertEqual(model_class.main_input_name, observed_main_input_name) + def test_model_base_model_prefix(self): + """ + Normally a generative model is a base model + lm_head on top. If this test + fails for a new model, the model probably has an incorrect `base_model_prefix`, + or you are re-defining base blocks for a generative model. + Some models might not fit this assumption if they have a special + architecture; feel free to skip the test in that case, with a reason + in the description. + """ + for model_class in self.all_generative_model_classes: + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + self.assertTrue(model.base_model is not model) + def test_correct_missing_keys(self): if not self.test_missing_keys: self.skipTest(reason="test_missing_keys is set to `False`")
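As a hedged end-to-end illustration of what the corrected prefixes enable (the model class and default config below are examples chosen for this sketch, not part of the diff):

from transformers import LlavaConfig, LlavaForConditionalGeneration

model = LlavaForConditionalGeneration(LlavaConfig())

# With LlavaPreTrainedModel.base_model_prefix = "model", `base_model` now
# resolves to the headless LlavaModel instead of falling back to the
# head-bearing wrapper itself -- exactly what the new
# test_model_base_model_prefix asserts.
assert model.base_model is model.model
assert model.base_model is not model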