huggingface · yonigozlan · Nov 26, 2025 · Oct 15, 2025 · Oct 15, 2025 · Oct 15, 2025
diff --git a/docs/source/en/model_doc/layoutxlm.md b/docs/source/en/model_doc/layoutxlm.md
@@ -70,6 +70,12 @@ data for the model.
 As LayoutXLM's architecture is equivalent to that of LayoutLMv2, one can refer to [LayoutLMv2's documentation page](layoutlmv2) for all tips, code examples and notebooks.
 </Tip>
 
+
+## LayoutXLMConfig
+
+[[autodoc]] LayoutXLMConfig
+
+
 ## LayoutXLMTokenizer
 
 [[autodoc]] LayoutXLMTokenizer

diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
@@ -222,7 +222,7 @@
         ("layoutlm", "LayoutLMConfig"),
         ("layoutlmv2", "LayoutLMv2Config"),
         ("layoutlmv3", "LayoutLMv3Config"),
-        ("layoutxlm", "LayoutLMv2Config"),
+        ("layoutxlm", "LayoutXLMConfig"),
         ("led", "LEDConfig"),
         ("levit", "LevitConfig"),
         ("lfm2", "Lfm2Config"),
@@ -915,12 +915,14 @@
     [
         ("audioflamingo3_encoder", "audioflamingo3"),
         ("openai-gpt", "openai"),
+        ("blip-2", "blip_2"),
         ("data2vec-audio", "data2vec"),
         ("data2vec-text", "data2vec"),
         ("data2vec-vision", "data2vec"),
         ("donut-swin", "donut"),
         ("kosmos-2", "kosmos2"),
         ("kosmos-2.5", "kosmos2_5"),
+        ("omdet-turbo", "omdet_turbo"),
         ("maskformer-swin", "maskformer"),
         ("xclip", "x_clip"),
         ("clip_vision_model", "clip"),
@@ -936,7 +938,10 @@
         ("glm4v_moe_vision", "glm4v_moe"),
         ("glm4v_text", "glm4v"),
         ("glm4v_moe_text", "glm4v_moe"),
+        ("grounding-dino", "grounding_dino"),
+        ("mm-grounding-dino", "mm_grounding_dino"),
         ("idefics3_vision", "idefics3"),
+        ("mgp-str", "mgp_str"),
         ("siglip_vision_model", "siglip"),
         ("siglip2_vision_model", "siglip2"),
         ("aimv2_vision_model", "aimv2"),
@@ -962,6 +967,7 @@
         ("video_llama_3_vision", "video_llama_3"),
         ("parakeet_encoder", "parakeet"),
         ("parakeet_ctc", "parakeet"),
+        ("wav2vec2-bert", "wav2vec2_bert"),
     ]
 )
 

diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py
@@ -130,7 +130,7 @@
             ("levit", ("LevitImageProcessor", "LevitImageProcessorFast")),
             ("lfm2_vl", (None, "Lfm2VlImageProcessorFast")),
             ("lightglue", ("LightGlueImageProcessor", "LightGlueImageProcessorFast")),
-            ("llama4", ("Llama4ImageProcessor", "Llama4ImageProcessorFast")),
+            ("llama4", (None, "Llama4ImageProcessorFast")),
             ("llava", ("LlavaImageProcessor", "LlavaImageProcessorFast")),
             ("llava_next", ("LlavaNextImageProcessor", "LlavaNextImageProcessorFast")),
             ("llava_next_video", ("LlavaNextImageProcessor", "LlavaNextImageProcessorFast")),

diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py
@@ -95,6 +95,7 @@
         ("kyutai_speech_to_text", "KyutaiSpeechToTextProcessor"),
         ("layoutlmv2", "LayoutLMv2Processor"),
         ("layoutlmv3", "LayoutLMv3Processor"),
+        ("layoutxlm", "LayoutXLMProcessor"),
         ("lfm2_vl", "Lfm2VlProcessor"),
         ("llama4", "Llama4Processor"),
         ("llava", "LlavaProcessor"),

diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
@@ -393,7 +393,7 @@
         ("llava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
         ("llava_next", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
         ("llava_next_video", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
-        ("llava_onevision", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
+        ("llava_onevision", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)),
         ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)),
         (
             "longt5",

diff --git a/src/transformers/models/edgetam/modeling_edgetam.py b/src/transformers/models/edgetam/modeling_edgetam.py
@@ -1103,7 +1103,7 @@ def forward(
 
         >>> # Postprocess masks
         >>> masks = processor.post_process_masks(
-        ...     outputs.pred_masks, inputs["original_sizes"], inputs["reshaped_input_sizes"]
+        ...     outputs.pred_masks, inputs["original_sizes"]
         ... )
         ```
         """

diff --git a/src/transformers/models/gemma3n/processing_gemma3n.py b/src/transformers/models/gemma3n/processing_gemma3n.py
@@ -147,5 +147,13 @@ def __call__(
         text_inputs["token_type_ids"] = token_type_ids.tolist()
         return BatchFeature(data={**text_inputs, **image_inputs, **audio_inputs}, tensor_type=return_tensors)
 
+    @property
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names + ["token_type_ids"]
+        image_processor_input_names = self.image_processor.model_input_names
+        audio_processor_input_names = self.feature_extractor.model_input_names
+        image_processor_input_names = [name for name in image_processor_input_names if name != "num_crops"]
+        return list(tokenizer_input_names + image_processor_input_names + audio_processor_input_names)
+
 
 __all__ = ["Gemma3nProcessor"]
diff --git a/src/transformers/models/glm46v/modeling_glm46v.py b/src/transformers/models/glm46v/modeling_glm46v.py
@@ -562,8 +562,6 @@ def forward(
             The temporal, height and width of feature shape of each image in LLM.
         video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
             The temporal, height and width of feature shape of each video in LLM.
-        rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
-            The rope index difference between sequence length and multimodal rope.
 
         Example:
 

diff --git a/src/transformers/models/glm4v/modeling_glm4v.py b/src/transformers/models/glm4v/modeling_glm4v.py
@@ -1410,8 +1410,6 @@ def forward(
             The temporal, height and width of feature shape of each image in LLM.
         video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
             The temporal, height and width of feature shape of each video in LLM.
-        rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
-            The rope index difference between sequence length and multimodal rope.
 
         Example:
 

diff --git a/src/transformers/models/glm4v/modular_glm4v.py b/src/transformers/models/glm4v/modular_glm4v.py
@@ -1350,8 +1350,6 @@ def forward(
             The temporal, height and width of feature shape of each image in LLM.
         video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
             The temporal, height and width of feature shape of each video in LLM.
-        rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
-            The rope index difference between sequence length and multimodal rope.
 
         Example:
 

diff --git a/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py b/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py
@@ -1630,8 +1630,6 @@ def forward(
             The temporal, height and width of feature shape of each image in LLM.
         video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
             The temporal, height and width of feature shape of each video in LLM.
-        rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
-            The rope index difference between sequence length and multimodal rope.
 
         Example:
 

diff --git a/src/transformers/models/layoutlmv2/configuration_layoutlmv2.py b/src/transformers/models/layoutlmv2/configuration_layoutlmv2.py
@@ -39,7 +39,7 @@ class LayoutLMv2Config(PreTrainedConfig):
     Args:
         vocab_size (`int`, *optional*, defaults to 30522):
             Vocabulary size of the LayoutLMv2 model. Defines the number of different tokens that can be represented by
-            the `inputs_ids` passed when calling [`LayoutLMv2Model`] or [`TFLayoutLMv2Model`].
+            the `inputs_ids` passed when calling [`LayoutLMv2Model`].
         hidden_size (`int`, *optional*, defaults to 768):
             Dimension of the encoder layers and the pooler layer.
         num_hidden_layers (`int`, *optional*, defaults to 12):
@@ -59,12 +59,13 @@ class LayoutLMv2Config(PreTrainedConfig):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
         type_vocab_size (`int`, *optional*, defaults to 2):
-            The vocabulary size of the `token_type_ids` passed when calling [`LayoutLMv2Model`] or
-            [`TFLayoutLMv2Model`].
+            The vocabulary size of the `token_type_ids` passed when calling [`LayoutLMv2Model`].
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id.
         max_2d_position_embeddings (`int`, *optional*, defaults to 1024):
             The maximum value that the 2D position embedding might ever be used with. Typically set this to something
             large just in case (e.g., 1024).
@@ -78,7 +79,9 @@ class LayoutLMv2Config(PreTrainedConfig):
             The maximum number of relative 2D positions in the self-attention mechanism.
         rel_2d_pos_bins (`int`, *optional*, defaults to 64):
             The number of 2D relative position bins in the self-attention mechanism.
-        image_feature_pool_shape (`list[int]`, *optional*, defaults to [7, 7, 256]):
+        convert_sync_batchnorm (`bool`, *optional*, defaults to `True`):
+            Whether or not to convert batch normalization layers to synchronized batch normalization layers.
+        image_feature_pool_shape (`list[int]`, *optional*, defaults to `[7, 7, 256]`):
             The shape of the average-pooled feature map.
         coordinate_size (`int`, *optional*, defaults to 128):
             Dimension of the coordinate embeddings.
@@ -95,6 +98,7 @@ class LayoutLMv2Config(PreTrainedConfig):
             file](https://github.com/microsoft/unilm/blob/master/layoutlmft/layoutlmft/models/layoutlmv2/detectron2_config.py)
             for details regarding default values.
 
+
     Example:
 
     ```python

diff --git a/src/transformers/models/layoutxlm/__init__.py b/src/transformers/models/layoutxlm/__init__.py
@@ -18,6 +18,7 @@
 
 
 if TYPE_CHECKING:
+    from .configuration_layoutxlm import *
     from .processing_layoutxlm import *
     from .tokenization_layoutxlm import *
     from .tokenization_layoutxlm_fast import *