Merged
Changes from all commits (50 commits)
3a87a41
i guess reverted all CdGen classes
zucchini-nlp Mar 26, 2025
8d7088a
style
zucchini-nlp Mar 26, 2025
95ac049
llava onevision
zucchini-nlp Mar 26, 2025
f0e917e
fix copies
zucchini-nlp Mar 27, 2025
85b1e7a
Merge branch 'main' into vlm-base-models
zucchini-nlp Mar 27, 2025
5e4d0e8
fix some tests
zucchini-nlp Mar 27, 2025
02e7b6e
some more tests
zucchini-nlp Mar 27, 2025
c0e41e6
dump
zucchini-nlp Mar 27, 2025
ef70523
Merge branch 'main' into vlm-base-models
zucchini-nlp Mar 28, 2025
06b8227
skip these
zucchini-nlp Mar 28, 2025
5655657
nevermind, i am dumb
zucchini-nlp Mar 28, 2025
083b9bc
revert fix not needed
zucchini-nlp Mar 28, 2025
4fe8a82
Merge branch 'main' into vlm-base-models
zucchini-nlp Mar 31, 2025
2e6caa4
fixup
zucchini-nlp Mar 31, 2025
d397075
Merge branch 'main' into vlm-base-models
zucchini-nlp Mar 31, 2025
0d1409f
Merge branch 'main' into vlm-base-models
zucchini-nlp Mar 31, 2025
a32e47e
Merge branch 'main' into vlm-base-models
zucchini-nlp Apr 1, 2025
5c019fe
fixup
zucchini-nlp Apr 4, 2025
32a67b1
Merge remote-tracking branch 'upstream/main' into vlm-base-models
zucchini-nlp Apr 4, 2025
a9b3816
another fixup
zucchini-nlp Apr 4, 2025
1f7172c
more fixup to make ci finally happy
zucchini-nlp Apr 4, 2025
1e5ee3b
merge main
zucchini-nlp Apr 22, 2025
c6bfa8d
fixup after rebasing
zucchini-nlp Apr 22, 2025
7631fdb
fix qwen tests
zucchini-nlp Apr 22, 2025
da33a04
add internVL + typos here and there
zucchini-nlp Apr 22, 2025
141c102
image token index -> id
zucchini-nlp Apr 22, 2025
ba58575
style
zucchini-nlp Apr 22, 2025
4a73546
fix init weights
zucchini-nlp Apr 22, 2025
4d4ae05
Merge remote-tracking branch 'upstream/main' into vlm-base-models
zucchini-nlp Apr 22, 2025
6298cc4
Merge branch 'main' into vlm-base-models
zucchini-nlp Apr 24, 2025
a25e02d
revert blip-2 not supported
zucchini-nlp May 1, 2025
3bbf3fd
address comments
zucchini-nlp May 1, 2025
8087394
Merge remote-tracking branch 'upstream/main' into vlm-base-models
zucchini-nlp May 1, 2025
32cbc87
Merge remote-tracking branch 'upstream/main' into vlm-base-models
zucchini-nlp May 1, 2025
43999e8
fix copies
zucchini-nlp May 1, 2025
43639f4
revert blip2 test file as well
zucchini-nlp May 1, 2025
d31a4c9
as discussed internally, revert back CdGen models
zucchini-nlp May 2, 2025
e7ff08c
fix some tests
zucchini-nlp May 2, 2025
c265726
fix more tests for compile
zucchini-nlp May 2, 2025
db069f1
CI red
zucchini-nlp May 2, 2025
d309ead
fix copies
zucchini-nlp May 2, 2025
f5b18eb
enumerate explicitly allowed models
zucchini-nlp May 2, 2025
c58c4f2
address comments
zucchini-nlp May 6, 2025
9971e7f
fix tests
zucchini-nlp May 7, 2025
f601c52
fixup
zucchini-nlp May 7, 2025
4e617b4
merge main
zucchini-nlp May 7, 2025
df62bdf
style again
zucchini-nlp May 7, 2025
2509f77
add tests for new model class
zucchini-nlp May 7, 2025
ce4374b
another fixup ( x _ x )
zucchini-nlp May 7, 2025
24d127f
[fixup] unused attributes can be removed post-deprecation
zucchini-nlp May 7, 2025
4 changes: 4 additions & 0 deletions docs/source/en/model_doc/aria.md
@@ -102,6 +102,10 @@ response = processor.decode(output_ids, skip_special_tokens=True)

[[autodoc]] AriaTextModel

## AriaModel

[[autodoc]] AriaModel

## AriaTextForCausalLM

[[autodoc]] AriaTextForCausalLM
4 changes: 4 additions & 0 deletions docs/source/en/model_doc/aya_vision.md
@@ -237,6 +237,10 @@ for i, output in enumerate(batch_outputs):

[[autodoc]] AyaVisionConfig

## AyaVisionModel

[[autodoc]] AyaVisionModel

## AyaVisionForConditionalGeneration

[[autodoc]] AyaVisionForConditionalGeneration
4 changes: 4 additions & 0 deletions docs/source/en/model_doc/emu3.md
@@ -174,6 +174,10 @@ for i, image in enumerate(images['pixel_values']):
[[autodoc]] Emu3TextModel
- forward

## Emu3Model

[[autodoc]] Emu3Model

## Emu3ForCausalLM

[[autodoc]] Emu3ForCausalLM
4 changes: 4 additions & 0 deletions docs/source/en/model_doc/fuyu.md
@@ -103,6 +103,10 @@ The `LlamaTokenizer` is used as it is a standard wrapper around sentencepiece.

[[autodoc]] FuyuConfig

## FuyuModel

[[autodoc]] FuyuModel

## FuyuForCausalLM

[[autodoc]] FuyuForCausalLM
4 changes: 4 additions & 0 deletions docs/source/en/model_doc/gemma3.md
@@ -254,6 +254,10 @@ visualizer("<img>What is shown in this image?")
[[autodoc]] Gemma3TextModel
- forward

## Gemma3Model

[[autodoc]] Gemma3Model

## Gemma3ForCausalLM

[[autodoc]] Gemma3ForCausalLM
4 changes: 4 additions & 0 deletions docs/source/en/model_doc/got_ocr2.md
@@ -277,6 +277,10 @@ alt="drawing" width="600"/>

[[autodoc]] GotOcr2Processor

## GotOcr2Model

[[autodoc]] GotOcr2Model

## GotOcr2ForConditionalGeneration

[[autodoc]] GotOcr2ForConditionalGeneration
4 changes: 4 additions & 0 deletions docs/source/en/model_doc/instructblip.md
@@ -69,6 +69,10 @@ The attributes can be obtained from model config, as `model.config.num_query_tok
[[autodoc]] InstructBlipQFormerModel
- forward

## InstructBlipModel

[[autodoc]] InstructBlipModel

## InstructBlipForConditionalGeneration

[[autodoc]] InstructBlipForConditionalGeneration
4 changes: 4 additions & 0 deletions docs/source/en/model_doc/instructblipvideo.md
@@ -73,6 +73,10 @@ The attributes can be obtained from model config, as `model.config.num_query_tok
[[autodoc]] InstructBlipVideoQFormerModel
- forward

## InstructBlipVideoModel

[[autodoc]] InstructBlipVideoModel
- forward

## InstructBlipVideoForConditionalGeneration

[[autodoc]] InstructBlipVideoForConditionalGeneration
5 changes: 5 additions & 0 deletions docs/source/en/model_doc/internvl.md
@@ -340,6 +340,11 @@ This example showcases how to handle a batch of chat conversations with interlea
[[autodoc]] InternVLVisionModel
- forward

## InternVLModel

[[autodoc]] InternVLModel
- forward

## InternVLForConditionalGeneration

[[autodoc]] InternVLForConditionalGeneration
4 changes: 4 additions & 0 deletions docs/source/en/model_doc/llava.md
@@ -256,6 +256,10 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h

[[autodoc]] LlavaProcessor

## LlavaModel

[[autodoc]] LlavaModel

## LlavaForConditionalGeneration

[[autodoc]] LlavaForConditionalGeneration
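Every documentation diff in this PR has the same shape: a new bare `XxxModel` heading with an autodoc stub, inserted ahead of the existing head-bearing class. As a rough illustration of what these new base classes are for, here is a minimal sketch, assuming the new `LlavaModel` follows the usual base-model convention of returning hidden states rather than logits (the checkpoint is a real Hub repo; the placeholder image and prompt are illustrative, not code from this PR):

```python
# A minimal sketch, not code from this PR: load the new base class and pull
# multimodal hidden states without a language-modeling head.
import torch
from PIL import Image
from transformers import AutoProcessor, LlavaModel

processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
model = LlavaModel.from_pretrained("llava-hf/llava-1.5-7b-hf")

image = Image.new("RGB", (336, 336))  # stand-in for a real image
inputs = processor(images=image, text="USER: <image>\nDescribe the image.", return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

# Base models stop at the hidden states; heads live on *ForConditionalGeneration.
print(outputs.last_hidden_state.shape)
```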
4 changes: 4 additions & 0 deletions docs/source/en/model_doc/llava_next.md
@@ -315,6 +315,10 @@ model = AutoModelForImageTextToText.from_pretrained(

[[autodoc]] LlavaNextProcessor

## LlavaNextModel

[[autodoc]] LlavaNextModel

## LlavaNextForConditionalGeneration

[[autodoc]] LlavaNextForConditionalGeneration
4 changes: 4 additions & 0 deletions docs/source/en/model_doc/llava_next_video.md
@@ -262,6 +262,10 @@ model = LlavaNextVideoForConditionalGeneration.from_pretrained(

[[autodoc]] LlavaNextVideoImageProcessor

## LlavaNextVideoModel

[[autodoc]] LlavaNextVideoModel

## LlavaNextVideoForConditionalGeneration

[[autodoc]] LlavaNextVideoForConditionalGeneration
4 changes: 4 additions & 0 deletions docs/source/en/model_doc/llava_onevision.md
@@ -313,6 +313,10 @@ model = LlavaOnevisionForConditionalGeneration.from_pretrained(

[[autodoc]] LlavaOnevisionVideoProcessor

## LlavaOnevisionModel

[[autodoc]] LlavaOnevisionModel

## LlavaOnevisionForConditionalGeneration

[[autodoc]] LlavaOnevisionForConditionalGeneration
3 changes: 3 additions & 0 deletions docs/source/en/model_doc/mistral3.md
@@ -227,6 +227,9 @@ This example also how to use `BitsAndBytes` to load the model in 4bit quantizati

[[autodoc]] Mistral3Config

## Mistral3Model

[[autodoc]] Mistral3Model

## Mistral3ForConditionalGeneration

4 changes: 4 additions & 0 deletions docs/source/en/model_doc/mllama.md
@@ -130,6 +130,10 @@ print(processor.decode(output[0], skip_special_tokens=True))
[[autodoc]] MllamaTextModel
- forward

## MllamaModel

[[autodoc]] MllamaModel

## MllamaForCausalLM

[[autodoc]] MllamaForCausalLM
4 changes: 4 additions & 0 deletions docs/source/en/model_doc/paligemma.md
@@ -174,6 +174,10 @@ visualizer("<img> What is in this image?")

[[autodoc]] PaliGemmaProcessor

## PaliGemmaModel

[[autodoc]] PaliGemmaModel

## PaliGemmaForConditionalGeneration

[[autodoc]] PaliGemmaForConditionalGeneration
4 changes: 4 additions & 0 deletions docs/source/en/model_doc/qwen2_5_vl.md
@@ -240,6 +240,10 @@ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(

[[autodoc]] Qwen2_5_VLProcessor

## Qwen2_5_VLTextModel

[[autodoc]] Qwen2_5_VLTextModel
- forward

## Qwen2_5_VLModel

5 changes: 5 additions & 0 deletions docs/source/en/model_doc/qwen2_vl.md
@@ -296,6 +296,11 @@ model = Qwen2VLForConditionalGeneration.from_pretrained(

[[autodoc]] Qwen2VLProcessor

## Qwen2VLTextModel

[[autodoc]] Qwen2VLTextModel
- forward

## Qwen2VLModel

[[autodoc]] Qwen2VLModel
4 changes: 4 additions & 0 deletions docs/source/en/model_doc/video_llava.md
@@ -215,6 +215,10 @@ model = VideoLlavaForConditionalGeneration.from_pretrained(

[[autodoc]] VideoLlavaProcessor

## VideoLlavaModel

[[autodoc]] VideoLlavaModel

## VideoLlavaForConditionalGeneration

[[autodoc]] VideoLlavaForConditionalGeneration
4 changes: 4 additions & 0 deletions docs/source/en/model_doc/vipllava.md
@@ -101,6 +101,10 @@ A chat between a curious human and an artificial intelligence assistant. The ass

[[autodoc]] VipLlavaConfig

## VipLlavaModel

[[autodoc]] VipLlavaModel

## VipLlavaForConditionalGeneration

[[autodoc]] VipLlavaForConditionalGeneration
47 changes: 46 additions & 1 deletion src/transformers/modeling_utils.py
@@ -216,6 +216,28 @@ def is_local_dist_rank_0():
"kaiming_normal": nn.init.kaiming_normal,
}

# DO NOT MODIFY, KEPT FOR BC ONLY
VLMS = [
"aria",
"aya_vision",
"emu3",
"fuyu",
"got_ocr2",
"gemma3",
"internvl",
"llava",
"llava_next",
"llava_next_video",
"llava_onevision",
"mistral3",
"mllama",
"paligemma",
"qwen2_vl",
"qwem2_5_vl",
"video_llava",
"vipllava",
]


@contextmanager
def no_init_weights():
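A note on how the `VLMS` list above is consumed: the guards added further down in `save_pretrained` and `from_pretrained` test each entry as a substring of the lowered class name, so a single entry can cover a whole model family. A small sketch of that behavior (class names are real; the list is abbreviated from the one above):

```python
# Sketch of the membership test used by the guards below: entries match as
# substrings of the lowered class name, so one entry covers a model family.
VLMS = ["llava", "paligemma"]

for cls_name in ["LlavaForConditionalGeneration", "LlavaNextVideoModel", "PaliGemmaModel"]:
    print(cls_name, any(allowed_name in cls_name.lower() for allowed_name in VLMS))
# LlavaForConditionalGeneration True
# LlavaNextVideoModel True
# PaliGemmaModel True
```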
@@ -1778,6 +1800,8 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
main_input_name = "input_ids"
model_tags = None

_checkpoint_conversion_mapping = {} # used for BC support in VLMs, not meant to be used by new models

_auto_class = None
_no_split_modules = None
_skip_keys_device_placement = None
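The mapping itself is a dict from regex patterns over legacy checkpoint keys to their new, standardized prefixes; `from_pretrained` applies it forward when loading old checkpoints, and `save_pretrained` inverts it so files on the Hub keep the legacy layout. The contents below are an assumption modeled on the Llava-style refactor, not a verbatim copy from the PR:

```python
# A hedged sketch of what the class-level mapping might look like for a
# Llava-style model after this refactor; patterns are illustrative.
_checkpoint_conversion_mapping = {
    "^language_model.model": "model.language_model",
    "^vision_tower": "model.vision_tower",
    "^multi_modal_projector": "model.multi_modal_projector",
    "^language_model.lm_head": "lm_head",
}
```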
@@ -3484,6 +3508,21 @@ def save_pretrained(
module_map[name + f".{key}"] = module
state_dict = model_to_save.state_dict()

if any(allowed_name in self.__class__.__name__.lower() for allowed_name in VLMS):
reverse_key_mapping = {v: k for k, v in self._checkpoint_conversion_mapping.items()}

original_state_dict = {}
for key, value in state_dict.items():
for pattern, replacement in reverse_key_mapping.items():
replacement = replacement.lstrip("^") # strip off un-needed chars and patterns
replacement = re.sub(r"\(.*?\)", "", pattern)
key, n_replace = re.subn(pattern, replacement, key)
# Early exit of the loop
if n_replace > 0:
break
original_state_dict[key] = value
state_dict = original_state_dict

# Translate state_dict from smp to hf if saving with smp >= 1.10
if IS_SAGEMAKER_MP_POST_1_10:
for smp_to_hf, _ in smp.state.module_manager.translate_functions:
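The reverse pass above can be traced on a toy key. This sketch reuses the illustrative mapping from the previous example and exercises only the key rewriting, not the full save path:

```python
import re

# Toy trace of the reverse mapping used when saving; the mapping is the
# illustrative one from the sketch above, not taken from the PR.
mapping = {"^language_model.model": "model.language_model"}
reverse_key_mapping = {v: k for k, v in mapping.items()}

state_dict = {"model.language_model.layers.0.mlp.up_proj.weight": "tensor..."}

original_state_dict = {}
for key, value in state_dict.items():
    for pattern, replacement in reverse_key_mapping.items():
        replacement = replacement.lstrip("^")              # "^language_model.model" -> "language_model.model"
        replacement = re.sub(r"\(.*?\)", "", replacement)  # drop any capture groups
        key, n_replace = re.subn(pattern, replacement, key)
        if n_replace > 0:
            break
    original_state_dict[key] = value

print(original_state_dict)
# {'language_model.model.layers.0.mlp.up_proj.weight': 'tensor...'}
```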
@@ -4071,7 +4110,13 @@ def from_pretrained(
gguf_file = kwargs.pop("gguf_file", None)
tp_plan = kwargs.pop("tp_plan", None)
tp_size = kwargs.pop("tp_size", None)
key_mapping = kwargs.pop("key_mapping", None)

# Load models with hardcoded key mapping on class for VLMs only, to keep BC and standardize model
if any(allowed_name in cls.__name__.lower() for allowed_name in VLMS):
[Review thread on this change]

@ManuelFay (Contributor, Jun 5, 2025): This is quite brittle and breaks adapters (in PEFT). How would you go about this? I'm thinking we can propagate the key_mapping to the PEFT integration in the from_pretrained function?

Contributor: Since a lot of people (including me) use adapters with VLMs, that's quite a big breaking change.

Collaborator: For anyone looking, this was fixed!

key_mapping = kwargs.pop("key_mapping", cls._checkpoint_conversion_mapping)
else:
key_mapping = kwargs.pop("key_mapping", None)

# Not used anymore -- remove them from the kwargs
_ = kwargs.pop("resume_download", None)
_ = kwargs.pop("trust_remote_code", None)