Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion src/transformers/models/aria/modeling_aria.py
Original file line number Diff line number Diff line change
Expand Up @@ -596,7 +596,7 @@ def _init_weights(self, module):
@auto_docstring
class AriaPreTrainedModel(PreTrainedModel):
config: AriaConfig
base_model_prefix = ""
base_model_prefix = "model"
supports_gradient_checkpointing = True
_no_split_modules = ["AriaDecoderLayer"]
_skip_keys_device_placement = ["past_key_values"]
Expand Down Expand Up @@ -893,6 +893,10 @@ class AriaModelOutputWithPast(BaseModelOutputWithPast):
"""
)
class AriaModel(AriaPreTrainedModel):
_checkpoint_conversion_mapping = {
r"^language_model.model": "language_model",
}

def __init__(self, config: AriaConfig):
super().__init__(config)
self.vision_tower = AutoModel.from_config(config.vision_config)
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/aria/modular_aria.py
Original file line number Diff line number Diff line change
Expand Up @@ -1206,7 +1206,7 @@ def _init_weights(self, module):

class AriaPreTrainedModel(LlamaPreTrainedModel):
config: AriaConfig
base_model_prefix = ""
base_model_prefix = "model"
_can_compile_fullgraph = False # MoE models don't work with torch.compile (dynamic slicing)
_supports_attention_backend = True

Expand Down
5 changes: 5 additions & 0 deletions src/transformers/models/aya_vision/modeling_aya_vision.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ def pixel_shuffle(self, image_features): # B, S, D
@auto_docstring
class AyaVisionPreTrainedModel(PreTrainedModel):
config: AyaVisionConfig
base_model_prefix = "model"
input_modalities = ["image", "text"]
supports_gradient_checkpointing = True
_skip_keys_device_placement = "past_key_values"
Expand Down Expand Up @@ -162,6 +163,10 @@ class AyaVisionModelOutputWithPast(BaseModelOutputWithPast):
"""
)
class AyaVisionModel(AyaVisionPreTrainedModel):
_checkpoint_conversion_mapping = {
r"^language_model.model": "language_model",
}

def __init__(self, config: AyaVisionConfig):
super().__init__(config)
self.vision_tower = AutoModel.from_config(config.vision_config)
Expand Down
1 change: 1 addition & 0 deletions src/transformers/models/blt/modeling_blt.py
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,7 @@ def forward(
@auto_docstring
class BltPreTrainedModel(PreTrainedModel):
config: BltConfig
base_model_prefix = "model"
input_modalities = ["image", "text"]
supports_gradient_checkpointing = True
_no_split_modules = ["BltTransformerLayer"]
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/clvp/modeling_clvp.py
Original file line number Diff line number Diff line change
Expand Up @@ -778,7 +778,7 @@ def forward(
@auto_docstring
class ClvpPreTrainedModel(PreTrainedModel):
config: ClvpConfig
base_model_prefix = "clvp"
base_model_prefix = "model"
supports_gradient_checkpointing = True
_skip_keys_device_placement = "past_key_values"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ class Cohere2VisionCausalLMOutputWithPast(ModelOutput):
@auto_docstring
class Cohere2VisionPreTrainedModel(PreTrainedModel):
config: Cohere2VisionConfig
base_model_prefix = "model"
input_modalities = ["image", "text"]
supports_gradient_checkpointing = True
_skip_keys_device_placement = "past_key_values"
Expand All @@ -142,7 +143,6 @@ class Cohere2VisionPreTrainedModel(PreTrainedModel):
"hidden_states": "DecoderLayer",
"attentions": "Attention",
}
base_model_prefix = "model"


@auto_docstring(
Expand Down
1 change: 0 additions & 1 deletion src/transformers/models/emu3/modeling_emu3.py
Original file line number Diff line number Diff line change
Expand Up @@ -1490,7 +1490,6 @@ def forward(


class Emu3ForConditionalGeneration(Emu3PreTrainedModel, GenerationMixin):
base_model_prefix = ""
output_modalities = ["image", "text"]
_tied_weights_keys = {"lm_head.weight": "model.text_model.embed_tokens.weight"}
_checkpoint_conversion_mapping = {
Expand Down
1 change: 0 additions & 1 deletion src/transformers/models/emu3/modular_emu3.py
Original file line number Diff line number Diff line change
Expand Up @@ -1044,7 +1044,6 @@ def forward(


class Emu3ForConditionalGeneration(Emu3PreTrainedModel, GenerationMixin):
base_model_prefix = ""
output_modalities = ["image", "text"]
_tied_weights_keys = {"lm_head.weight": "model.text_model.embed_tokens.weight"}
_checkpoint_conversion_mapping = {
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/flava/modeling_flava.py
Original file line number Diff line number Diff line change
Expand Up @@ -1298,7 +1298,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
"""
)
class FlavaImageCodebook(FlavaPreTrainedModel):
base_model_prefix = ""
base_model_prefix = "model"
config: FlavaImageCodebookConfig
main_input_name = "pixel_values"
input_modalities = "image"
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/florence2/modeling_florence2.py
Original file line number Diff line number Diff line change
Expand Up @@ -615,6 +615,7 @@ class Florence2Seq2SeqLMOutput(Seq2SeqLMOutput):
@auto_docstring
class Florence2PreTrainedModel(PreTrainedModel):
config: Florence2Config
base_model_prefix = "model"
input_modalities = ["image", "text"]
supports_gradient_checkpointing = True
_skip_keys_device_placement = "past_key_values"
Expand All @@ -627,7 +628,6 @@ class Florence2PreTrainedModel(PreTrainedModel):

_supports_attention_backend = False
config_class = Florence2Config
base_model_prefix = "model"


@auto_docstring(
Expand Down
3 changes: 1 addition & 2 deletions src/transformers/models/gemma3/modeling_gemma3.py
Original file line number Diff line number Diff line change
Expand Up @@ -446,7 +446,7 @@ def forward(
@auto_docstring
class Gemma3PreTrainedModel(PreTrainedModel):
config: Gemma3Config
base_model_prefix = ""
base_model_prefix = "model"
supports_gradient_checkpointing = True
_no_split_modules = [
"Gemma3DecoderLayer",
Expand Down Expand Up @@ -632,7 +632,6 @@ class Gemma3ForCausalLM(Gemma3PreTrainedModel, GenerationMixin):
_tp_plan = {"lm_head": "colwise_rep"}
_pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
config: Gemma3TextConfig
base_model_prefix = "model"

def __init__(self, config: Gemma3TextConfig):
super().__init__(config)
Expand Down
3 changes: 1 addition & 2 deletions src/transformers/models/gemma3/modular_gemma3.py
Original file line number Diff line number Diff line change
Expand Up @@ -561,7 +561,7 @@ def forward(


class Gemma3PreTrainedModel(Gemma2PreTrainedModel):
base_model_prefix = ""
base_model_prefix = "model"
input_modalities = ["image", "text"]
_no_split_modules = [
"Gemma3DecoderLayer",
Expand Down Expand Up @@ -717,7 +717,6 @@ def forward(

class Gemma3ForCausalLM(Gemma2ForCausalLM):
config: Gemma3TextConfig
base_model_prefix = "model"

def __init__(self, config: Gemma3TextConfig):
super().__init__(config)
Expand Down
2 changes: 0 additions & 2 deletions src/transformers/models/gemma3n/modeling_gemma3n.py
Original file line number Diff line number Diff line change
Expand Up @@ -1939,7 +1939,6 @@ class Gemma3nForCausalLM(Gemma3nPreTrainedModel, GenerationMixin):
_tp_plan = {"lm_head": "colwise_rep"}
_pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
config: Gemma3nTextConfig
base_model_prefix = "model"
_checkpoint_conversion_mapping = {"model.language_model": "model"}

def __init__(self, config: Gemma3nTextConfig):
Expand Down Expand Up @@ -2349,7 +2348,6 @@ def get_audio_features(
class Gemma3nForConditionalGeneration(Gemma3nPreTrainedModel, GenerationMixin):
_checkpoint_conversion_mapping = {}
_tied_weights_keys = {"lm_head.weight": "model.language_model.embed_tokens.weight"}
base_model_prefix = "model"

def __init__(self, config: Gemma3nConfig):
super().__init__(config)
Expand Down
2 changes: 0 additions & 2 deletions src/transformers/models/gemma3n/modular_gemma3n.py
Original file line number Diff line number Diff line change
Expand Up @@ -2116,7 +2116,6 @@ def forward(
@auto_docstring(custom_intro="The base Gemma 3n language model with a language modeling head.")
class Gemma3nForCausalLM(Gemma3ForCausalLM):
_checkpoint_conversion_mapping = {"model.language_model": "model"}
base_model_prefix = "model"


class Gemma3nMultimodalEmbedder(nn.Module):
Expand Down Expand Up @@ -2421,7 +2420,6 @@ def get_audio_features(
)
class Gemma3nForConditionalGeneration(PaliGemmaForConditionalGeneration):
_checkpoint_conversion_mapping = {}
base_model_prefix = "model"

@property
def audio_tower(self):
Expand Down
4 changes: 3 additions & 1 deletion src/transformers/models/glm46v/modeling_glm46v.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ class Glm46VModelOutputWithPast(ModelOutput):

@auto_docstring
class Glm46VModel(Glm46VPreTrainedModel):
base_model_prefix = ""
base_model_prefix = "model"
_checkpoint_conversion_mapping = {}
# Reference: fix gemma3 grad acc #37208
accepts_loss_kwargs = False
Expand Down Expand Up @@ -583,6 +583,8 @@ def forward(
The temporal, height and width of feature shape of each image in LLM.
video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
The temporal, height and width of feature shape of each video in LLM.
rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
The rope index difference between sequence length and multimodal rope.

Example:

Expand Down
4 changes: 3 additions & 1 deletion src/transformers/models/glm4v/modeling_glm4v.py
Original file line number Diff line number Diff line change
Expand Up @@ -926,7 +926,7 @@ def forward(

@auto_docstring
class Glm4vModel(Glm4vPreTrainedModel):
base_model_prefix = ""
base_model_prefix = "model"
_checkpoint_conversion_mapping = {}
# Reference: fix gemma3 grad acc #37208
accepts_loss_kwargs = False
Expand Down Expand Up @@ -1431,6 +1431,8 @@ def forward(
The temporal, height and width of feature shape of each image in LLM.
video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
The temporal, height and width of feature shape of each video in LLM.
rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
The rope index difference between sequence length and multimodal rope.

Example:

Expand Down
2 changes: 2 additions & 0 deletions src/transformers/models/glm4v/modular_glm4v.py
Original file line number Diff line number Diff line change
Expand Up @@ -1350,6 +1350,8 @@ def forward(
The temporal, height and width of feature shape of each image in LLM.
video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
The temporal, height and width of feature shape of each video in LLM.
rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
The rope index difference between sequence length and multimodal rope.

Example:

Expand Down
6 changes: 4 additions & 2 deletions src/transformers/models/glm4v_moe/modeling_glm4v_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -537,7 +537,7 @@ def forward(
@auto_docstring
class Glm4vMoePreTrainedModel(PreTrainedModel):
config: Glm4vMoeConfig
base_model_prefix = ""
base_model_prefix = "model"
supports_gradient_checkpointing = True
_no_split_modules = ["Glm4vMoeTextDecoderLayer", "Glm4vMoeVisionBlock"]
_skip_keys_device_placement = "past_key_values"
Expand Down Expand Up @@ -1090,7 +1090,7 @@ def forward(

@auto_docstring
class Glm4vMoeModel(Glm4vMoePreTrainedModel):
base_model_prefix = ""
base_model_prefix = "model"
_checkpoint_conversion_mapping = {}
# Reference: fix gemma3 grad acc #37208
accepts_loss_kwargs = False
Expand Down Expand Up @@ -1648,6 +1648,8 @@ def forward(
The temporal, height and width of feature shape of each image in LLM.
video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
The temporal, height and width of feature shape of each video in LLM.
rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
The rope index difference between sequence length and multimodal rope.

Example:

Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/glm4v_moe/modular_glm4v_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -475,7 +475,7 @@ def __init__(self, config: Glm4vMoeTextConfig, layer_idx: int):

class Glm4vMoePreTrainedModel(Glm4MoePreTrainedModel):
config: Glm4vMoeConfig
base_model_prefix = ""
base_model_prefix = "model"
input_modalities = ["text", "image", "video"]
_no_split_modules = ["Glm4vMoeTextDecoderLayer", "Glm4vMoeVisionBlock"]
_skip_keys_device_placement = "past_key_values"
Expand Down
5 changes: 5 additions & 0 deletions src/transformers/models/got_ocr2/modeling_got_ocr2.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,7 @@ def forward(self, hidden_states: torch.Tensor) -> tuple[torch.FloatTensor]:
@auto_docstring
class GotOcr2PreTrainedModel(PreTrainedModel):
config: GotOcr2Config
base_model_prefix = "model"
input_modalities = ["image", "text"]
supports_gradient_checkpointing = True
_skip_keys_device_placement = "past_key_values"
Expand Down Expand Up @@ -532,6 +533,10 @@ class GotOcr2ModelOutputWithPast(BaseModelOutputWithPast):
"""
)
class GotOcr2Model(GotOcr2PreTrainedModel):
_checkpoint_conversion_mapping = {
r"^language_model.model": "language_model",
}

def __init__(self, config: GotOcr2Config):
super().__init__(config)
self.vision_tower = GotOcr2VisionEncoder(config.vision_config)
Expand Down
5 changes: 5 additions & 0 deletions src/transformers/models/internvl/modeling_internvl.py
Original file line number Diff line number Diff line change
Expand Up @@ -473,6 +473,7 @@ def forward(
@auto_docstring
class InternVLPreTrainedModel(PreTrainedModel):
config: InternVLConfig
base_model_prefix = "model"
input_modalities = ["image", "text", "video"]
supports_gradient_checkpointing = True
_skip_keys_device_placement = "past_key_values"
Expand Down Expand Up @@ -530,6 +531,10 @@ class InternVLModelOutputWithPast(BaseModelOutputWithPast):
"""
)
class InternVLModel(InternVLPreTrainedModel):
_checkpoint_conversion_mapping = {
r"^language_model.model": "language_model",
}

def __init__(self, config: InternVLConfig):
super().__init__(config)
self.vision_tower = AutoModel.from_config(config.vision_config)
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/jetmoe/modeling_jetmoe.py
Original file line number Diff line number Diff line change
Expand Up @@ -568,7 +568,7 @@ def forward(
@auto_docstring
class JetMoePreTrainedModel(PreTrainedModel):
config: JetMoeConfig
base_model_prefix = "transformer"
base_model_prefix = "model"
supports_gradient_checkpointing = False
_no_split_modules = ["JetMoeDecoderLayer"]
_skip_keys_device_placement = ["past_key_values"]
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/jetmoe/modular_jetmoe.py
Original file line number Diff line number Diff line change
Expand Up @@ -429,7 +429,7 @@ class JetMoePreTrainedModel(MixtralPreTrainedModel):
"attentions": OutputRecorder(JetMoeAttention, index=1),
}
config: JetMoeConfig
base_model_prefix = "transformer"
base_model_prefix = "model"
supports_gradient_checkpointing = False
_no_split_modules = ["JetMoeDecoderLayer"]
_skip_keys_device_placement = ["past_key_values"]
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/lfm2_vl/modeling_lfm2_vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ def pixel_unshuffle(self, hidden_states: torch.Tensor):
@auto_docstring
class Lfm2VlPreTrainedModel(PreTrainedModel):
config: Lfm2VlConfig
base_model_prefix = "model"
input_modalities = ["image", "text"]
supports_gradient_checkpointing = True
_skip_keys_device_placement = "past_key_values"
Expand All @@ -85,7 +86,6 @@ class Lfm2VlPreTrainedModel(PreTrainedModel):
_can_compile_fullgraph = False
_supports_flex_attn = True
_supports_attention_backend = True
base_model_prefix = "model"


@dataclass
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/llama4/modeling_llama4.py
Original file line number Diff line number Diff line change
Expand Up @@ -1166,7 +1166,7 @@ def forward(
class Llama4ForConditionalGeneration(Llama4PreTrainedModel, GenerationMixin):
_no_split_modules = ["Llama4TextDecoderLayer", "Llama4VisionEncoderLayer"]
_tp_plan = {}
base_model_prefix = ""
base_model_prefix = "model"
config: Llama4Config

def __init__(self, config: Llama4Config):
Expand Down
5 changes: 5 additions & 0 deletions src/transformers/models/llava/modeling_llava.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ def forward(self, image_features):
@auto_docstring
class LlavaPreTrainedModel(PreTrainedModel):
config: LlavaConfig
base_model_prefix = "model"
input_modalities = ["image", "text"]
supports_gradient_checkpointing = True
_skip_keys_device_placement = "past_key_values"
Expand All @@ -128,6 +129,10 @@ class LlavaPreTrainedModel(PreTrainedModel):
"""
)
class LlavaModel(LlavaPreTrainedModel):
_checkpoint_conversion_mapping = {
r"^language_model.model": "language_model",
}

def __init__(self, config: LlavaConfig):
super().__init__(config)
self.vision_tower = AutoModel.from_config(config.vision_config)
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/llava_next/modeling_llava_next.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ def forward(self, image_features):
@auto_docstring
class LlavaNextPreTrainedModel(PreTrainedModel):
config: LlavaNextConfig
base_model_prefix = ""
base_model_prefix = "model"
input_modalities = ["image", "text"]
supports_gradient_checkpointing = True
_no_split_modules = ["LlamaDecoderLayer"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ def forward(self, image_features):
@auto_docstring
class LlavaNextVideoPreTrainedModel(PreTrainedModel):
config: LlavaNextVideoConfig
base_model_prefix = ""
base_model_prefix = "model"
input_modalities = ["image", "video", "text"]
supports_gradient_checkpointing = True
_no_split_modules = ["LlamaDecoderLayer"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ class LlavaOnevisionCausalLMOutputWithPast(ModelOutput):
@auto_docstring
class LlavaOnevisionPreTrainedModel(PreTrainedModel):
config: LlavaOnevisionConfig
base_model_prefix = ""
base_model_prefix = "model"
input_modalities = ["image", "video", "text"]
supports_gradient_checkpointing = True
_no_split_modules = ["LlamaDecoderLayer"]
Expand Down
Loading