From 762b6517fdd36b2fbf372c36e3bdd6e9701cbedd Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 10 Sep 2025 14:36:27 +0200 Subject: [PATCH 01/28] initial design draft --- smolvlm.py | 119 ++++++++++++ src/transformers/processing_utils.py | 108 ++++++----- src/transformers/utils/type_validators.py | 174 ++++++++++++++++++ .../models/colpali/test_processing_colpali.py | 16 +- .../colqwen2/test_processing_colqwen2.py | 16 +- tests/models/janus/test_processing_janus.py | 2 +- tests/models/mllama/test_processing_mllama.py | 2 +- .../models/smolvlm/test_processing_smolvlm.py | 2 +- tests/test_processing_common.py | 30 +-- 9 files changed, 389 insertions(+), 80 deletions(-) create mode 100644 smolvlm.py create mode 100644 src/transformers/utils/type_validators.py diff --git a/smolvlm.py b/smolvlm.py new file mode 100644 index 000000000000..ef01fd9f087c --- /dev/null +++ b/smolvlm.py @@ -0,0 +1,119 @@ +from transformers import Qwen2VLProcessor + +if __name__ == "__main__": + + for i in range(1): + processor = Qwen2VLProcessor.from_pretrained(pretrained_model_name_or_path="Qwen/Qwen2.5-VL-3B-Instruct", use_fast=True) + processor + + +from typing_extensions import Unpack +from transformers.tokenization_utils_base import PaddingStrategy +from typing import Union, TypeVar, Generic, get_type_hints, TypedDict, Literal, Annotated, Optional, get_origin, get_args +from dataclasses import make_dataclass, field + +my_int = TypeVar('my_int', bound=int) + + +class Mixin: + def mixin_method(self): + return 0 + +class Stack(Mixin, Generic[my_int]): + def __init__(self) -> None: + # Create an empty list with items of type T + self.items: list[my_int] = [] + + def push(self, item: my_int) -> None: + self.items.append(item) + + +class ModelStack(Stack[str]): + pass + +s = ModelStack() +s.push(0) + + + +from dataclasses import dataclass, MISSING, fields +from huggingface_hub.dataclasses import as_validated_field, strict, validated_field + +def positive_int(value: int): + if not value >= 0: + 
raise ValueError(f"Value must be positive, got {value}") + + +def multiple_of_64(value: int): + if not value % 64 == 0: + raise ValueError(f"Value must be a multiple of 64, got {value}") + + +@as_validated_field +def strictly_positive(value: int = None): + if value is not None and not value > 0: + raise ValueError(f"Value must be strictly positive, got {value}") + +@as_validated_field +def padding_validator(value: Union[bool, str, PaddingStrategy] = None): + if value is None: + return + + if not isinstance(value, (bool, str, PaddingStrategy)): + raise ValueError(f"Value must be padding") + if isinstance(value, str) and value not in ["longest", "max_length", "do_not_pad"]: + raise ValueError(f'Value for padding must be one of ["longest", "max_length", "do_not_pad"]') + +@strict +@dataclass +class Config: + model_type: str + hidden_size: int = validated_field(validator=[positive_int, multiple_of_64]) + vocab_size: int = strictly_positive(default=16) + + +class AnotherKwargs(TypedDict, total=False): + name: Union[str, list[str]] + age: Annotated[Optional[int], strictly_positive()] + padding: Annotated[Optional[Union[bool, str, PaddingStrategy]], padding_validator()] + padding_side: Optional[Literal["right", "left"]] + + +def unpack_annotated_type(type): + if get_origin(type) is Annotated: + base, *meta = get_args(type) + return base, meta[0] + return type, field(default=MISSING) + + +def dataclass_from_typed_dict(td: type[TypedDict]): + hints = get_type_hints(td, include_extras=True) + dc_fields = [ + (k, *unpack_annotated_type(v)) + for k, v in hints.items() + ] + return make_dataclass(td.__name__ + "Config", dc_fields) + + +class HubTypeAdapter(): + def __init__(self, type: type[TypedDict]) -> None: + self.type = type + dataclass = dataclass_from_typed_dict(type) + self.dataclass = strict(dataclass) + + def validate_fields(self, **kwargs): + for f in fields(self.dataclass): + if f.name not in kwargs: + kwargs[f.name] = None + self.dataclass(**kwargs) + + +config = 
Config(model_type="bert", vocab_size=30000, hidden_size=768) +print(config.__dataclass_fields__) +assert config.model_type == "bert" +assert config.vocab_size == 30000 +assert config.hidden_size == 768 + +HubTypeAdapter(AnotherKwargs).validate_fields(name=["BOB", "MARY"], age=100, padding=None) +print(AnotherKwargs.__annotations__['age']) + diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 3130d0ded34f..038c9f5ecaa6 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -23,9 +23,10 @@ import sys import typing import warnings +from collections.abc import Iterable from dataclasses import dataclass from pathlib import Path -from typing import Any, Optional, TypedDict, TypeVar, Union +from typing import Annotated, Any, Literal, Optional, TypedDict, TypeVar, Union import numpy as np import typing_extensions @@ -36,6 +37,18 @@ from .feature_extraction_utils import BatchFeature from .image_utils import ChannelDimension, ImageInput, is_vision_available from .utils.chat_template_utils import render_jinja_template +from .utils.type_validators import ( + TypedDictAdapter, + device_validator, + image_size_validator, + padding_validator, + resampling_validator, + strictly_positive_int, + strictly_positive_number, + tensor_type_validator, + truncation_validator, + video_metadata_validator, +) from .video_utils import VideoInput, VideoMetadata @@ -138,15 +151,15 @@ class TextKwargs(TypedDict, total=False): """ text_pair: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] - text_target: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] + text_target: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] text_pair_target: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] add_special_tokens: Optional[bool] - padding: Union[bool, str, PaddingStrategy] - 
truncation: Union[bool, str, TruncationStrategy] - max_length: Optional[int] - stride: Optional[int] + padding: Annotated[Optional[Union[bool, str, PaddingStrategy]], padding_validator()] + truncation: Annotated[Optional[Union[bool, str, TruncationStrategy]], truncation_validator()] + max_length: Annotated[Optional[int], strictly_positive_int()] + stride: Annotated[Optional[int], strictly_positive_int()] is_split_into_words: Optional[bool] - pad_to_multiple_of: Optional[int] + pad_to_multiple_of: Annotated[Optional[int], strictly_positive_int()] return_token_type_ids: Optional[bool] return_attention_mask: Optional[bool] return_overflowing_tokens: Optional[bool] @@ -154,8 +167,9 @@ class TextKwargs(TypedDict, total=False): return_offsets_mapping: Optional[bool] return_length: Optional[bool] verbose: Optional[bool] - padding_side: Optional[str] + padding_side: Optional[Literal["left", "right"]] return_mm_token_type_ids: Optional[bool] + return_tensors: Annotated[Optional[Union[str, TensorType]], tensor_type_validator()] class ImagesKwargs(TypedDict, total=False): @@ -199,21 +213,22 @@ class methods and docstrings. 
""" do_resize: Optional[bool] - size: Optional[dict[str, int]] - size_divisor: Optional[int] - crop_size: Optional[dict[str, int]] - resample: Optional[Union["PILImageResampling", int]] + size: Annotated[Optional[dict[str, int]], image_size_validator()] + size_divisor: Annotated[Optional[int], strictly_positive_int()] + crop_size: Annotated[Optional[dict[str, int]], image_size_validator()] + resample: Annotated[Optional[Union["PILImageResampling", int]], resampling_validator()] do_rescale: Optional[bool] rescale_factor: Optional[float] do_normalize: Optional[bool] image_mean: Optional[Union[float, list[float]]] image_std: Optional[Union[float, list[float]]] do_pad: Optional[bool] - pad_size: Optional[dict[str, int]] + pad_size: Annotated[Optional[dict[str, int]], image_size_validator()] do_center_crop: Optional[bool] - data_format: Optional[ChannelDimension] + data_format: Optional[Union[str, ChannelDimension]] input_data_format: Optional[Union[str, ChannelDimension]] - device: Optional[str] + device: Annotated[Optional[str], device_validator()] + return_tensors: Annotated[Optional[Union[str, TensorType]], tensor_type_validator()] class VideosKwargs(TypedDict, total=False): @@ -267,10 +282,10 @@ class VideosKwargs(TypedDict, total=False): do_convert_rgb: Optional[bool] do_resize: Optional[bool] - size: Optional[dict[str, int]] - size_divisor: Optional[int] + size: Annotated[Optional[dict[str, int]], image_size_validator()] + size_divisor: Annotated[Optional[int], strictly_positive_int()] default_to_square: Optional[bool] - resample: Optional["PILImageResampling"] + resample: Annotated[Optional[Union["PILImageResampling", int]], resampling_validator()] do_rescale: Optional[bool] rescale_factor: Optional[float] do_normalize: Optional[bool] @@ -278,15 +293,18 @@ class VideosKwargs(TypedDict, total=False): image_std: Optional[Union[float, list[float]]] do_pad: Optional[bool] do_center_crop: Optional[bool] - crop_size: Optional[dict[str, int]] + crop_size: 
Annotated[Optional[dict[str, int]], image_size_validator()] data_format: Optional[ChannelDimension] input_data_format: Optional[Union[str, ChannelDimension]] - device: Optional[str] + device: Annotated[Optional[str], device_validator()] do_sample_frames: Optional[bool] - video_metadata: Optional[Union[VideoMetadata, dict]] - fps: Optional[Union[int, float]] - num_frames: Optional[int] + video_metadata: Annotated[ + Optional[Union[VideoMetadata, dict, Iterable[VideoMetadata, dict]]], video_metadata_validator() + ] + fps: Annotated[Optional[Union[int, float]], strictly_positive_number()] + num_frames: Annotated[Optional[int], strictly_positive_int()] return_metadata: Optional[bool] + return_tensors: Annotated[Optional[Union[str, TensorType]], tensor_type_validator()] class AudioKwargs(TypedDict, total=False): @@ -319,17 +337,14 @@ class AudioKwargs(TypedDict, total=False): Whether or not [`~ASTFeatureExtractor.__call__`] should return `attention_mask`. """ - sampling_rate: Optional[int] + sampling_rate: Annotated[Optional[int], strictly_positive_int()] raw_speech: Optional[Union["np.ndarray", list[float], list["np.ndarray"], list[list[float]]]] - padding: Optional[Union[bool, str, PaddingStrategy]] - max_length: Optional[int] - truncation: Optional[bool] - pad_to_multiple_of: Optional[int] + padding: Annotated[Optional[Union[bool, str, PaddingStrategy]], padding_validator()] + max_length: Annotated[Optional[int], strictly_positive_int()] + truncation: Annotated[Optional[Union[bool, str, TruncationStrategy]], truncation_validator()] + pad_to_multiple_of: Annotated[Optional[int], strictly_positive_int()] return_attention_mask: Optional[bool] - - -class CommonKwargs(TypedDict, total=False): - return_tensors: Optional[Union[str, TensorType]] + return_tensors: Annotated[Optional[Union[str, TensorType]], tensor_type_validator()] class ProcessingKwargs(TypedDict, total=False): @@ -373,9 +388,6 @@ class CustomProcessorKwargs(ProcessingKwargs, total=False): _defaults = {} - 
common_kwargs: CommonKwargs = { - **CommonKwargs.__annotations__, - } text_kwargs: TextKwargs = { **TextKwargs.__annotations__, } @@ -1248,7 +1260,6 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg "images_kwargs": {}, "audio_kwargs": {}, "videos_kwargs": {}, - "common_kwargs": {}, } default_kwargs = { @@ -1256,7 +1267,6 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg "images_kwargs": {}, "audio_kwargs": {}, "videos_kwargs": {}, - "common_kwargs": {}, } possible_modality_keywords = {"text", "audio", "videos", "images"} @@ -1314,17 +1324,21 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg else: # kwargs is a flat dictionary for key, kwarg in kwargs.items(): - if key not in used_keys: - if key in ModelProcessorKwargs.__annotations__["common_kwargs"].__annotations__: - output_kwargs["common_kwargs"][key] = kwarg - elif key not in possible_modality_keywords: - logger.warning_once( - f"Keyword argument `{key}` is not a valid argument for this processor and will be ignored." - ) + if key not in used_keys and key not in possible_modality_keywords: + logger.warning_once( + f"Keyword argument `{key}` is not a valid argument for this processor and will be ignored." 
+ ) + + # BC for `common_kwargs` to update all modality-specific kwargs + common_kwargs = kwargs.get("common_kwargs", {}) + if common_kwargs: + for kwarg in output_kwargs.values(): + kwarg.update(common_kwargs) - # all modality-specific kwargs are updated with common kwargs - for kwarg in output_kwargs.values(): - kwarg.update(output_kwargs["common_kwargs"]) + # Perform type validation on collected kwargs + for key, typed_dict_obj in ModelProcessorKwargs.__annotations__.items(): + type_validator = TypedDictAdapter(typed_dict_obj) + type_validator.validate_fields(**output_kwargs[key]) return output_kwargs @classmethod diff --git a/src/transformers/utils/type_validators.py b/src/transformers/utils/type_validators.py new file mode 100644 index 000000000000..cb846b8a6255 --- /dev/null +++ b/src/transformers/utils/type_validators.py @@ -0,0 +1,174 @@ +from collections.abc import Iterable +from dataclasses import MISSING, field, make_dataclass +from typing import Annotated, Optional, TypedDict, Union, get_args, get_origin, get_type_hints + +from huggingface_hub.dataclasses import as_validated_field, strict + +from ..tokenization_utils_base import PaddingStrategy, TruncationStrategy +from ..video_utils import VideoMetadata +from .generic import TensorType +from .import_utils import is_vision_available + + +if is_vision_available(): + from ..image_utils import PILImageResampling + + +def unpack_annotated_type(type): + if get_origin(type) is Annotated: + base, *meta = get_args(type) + return base, meta[0] + return type, field(default=MISSING) + + +# Minimalistic version on pydantic.TypeAdapter tailored for `TypedDict` +class TypedDictAdapter: + """ + A utility class used to convert a TypedDict object to dataclass and attach + a hub validator on top based on TypedDict annotations. + + Args: + type: The TypedDict object that needs to be validated. 
+ """ + + def __init__(self, type: type[TypedDict]) -> None: + self.type = type + self.dataclass = self.create_dataclass() + self.dataclass = strict(self.dataclass) + + def validate_fields(self, **kwargs): + # If not all kwargs are set, dataclass raises an error in python <= 3.9 + # In newer python we can bypass by creating a dataclass with `kw_only=True` + for field in self.fields: + if field[0] not in kwargs: + kwargs[field[0]] = None + self.dataclass(**kwargs) + + def create_dataclass(self): + """ + Creates a dataclass object dynamically from `TypedDict`, so that + we can use strict type validation from typing hints with `TypedDict`. + + Example: + + @as_validated_field + def padding_validator(value: Union[bool, str, PaddingStrategy] = None): + if value is None: + return + if not isinstance(value, (bool, str, PaddingStrategy)): + raise ValueError(f"Value must be one of '[bool, string, PaddingStrategy]'") + if isinstance(value, str) and value not in ["longest", "max_length", "do_not_pad"]: + raise ValueError(f'Value for padding must be one of `["longest", "max_length", "do_not_pad"]`') + + class TokenizerKwargs(TypedDict, total=False): + text: str + padding: Annotated[Union[bool, str, PaddingStrategy], padding_validator()] + + # Now we can create a dataclass and warp it with hub validators for type constraints + # The dataclass can also be used as a simple config class for easier kwarg management + dataclass = dataclass_from_typed_dict(TokenizerKwargs) + """ + hints = get_type_hints(self.type, include_extras=True) + fields = [(k, *unpack_annotated_type(v)) for k, v in hints.items()] + self.fields = fields + return make_dataclass(self.type.__name__ + "Config", fields) + + +@as_validated_field +def strictly_positive_number(value: Optional[Union[int, float]] = None): + if value is not None and (not isinstance(value, (int, float)) or not value > 0): + raise ValueError(f"Value must be strictly positive, got {value}") + + +@as_validated_field +def 
strictly_positive_int(value: Optional[int] = None): + if value is not None and (not isinstance(value, int) or not value > 0): + raise ValueError(f"Value must be strictly positive integer, got {value}") + + +@as_validated_field +def padding_validator(value: Optional[Union[bool, str, PaddingStrategy]] = None): + possible_names = ["longest", "max_length", "do_not_pad"] + if value is None: + pass + elif not isinstance(value, (bool, str, PaddingStrategy)): + raise ValueError("Value for padding must be either a boolean, a string or a `PaddingStrategy`") + elif isinstance(value, str) and value not in possible_names: + raise ValueError(f"If padding is a string, the value must be one of {possible_names}") + + +@as_validated_field +def truncation_validator(value: Optional[Union[bool, str, TruncationStrategy]] = None): + possible_names = ["only_first", "only_second", "longest_first", "do_not_truncate"] + if value is None: + pass + elif not isinstance(value, (bool, str, TruncationStrategy)): + raise ValueError("Value for truncation must be either a boolean, a string or a `TruncationStrategy`") + elif isinstance(value, str) and value not in possible_names: + raise ValueError(f"If truncation is a string, value must be one of {possible_names}") + + +@as_validated_field +def image_size_validator(value: Optional[dict[str, int]] = None): + possible_keys = ["height", "width", "longest_edge", "shortest_edge", "max_height", "max_width"] + if value is None: + pass + elif not isinstance(value, dict) or any(k not in possible_keys for k in value.keys()): + raise ValueError(f"Value for size must be a dict with keys {possible_keys} but got size={value}") + + +@as_validated_field +def device_validator(value: Optional[Union[str, int]] = None): + possible_names = ["cpu", "cuda", "xla", "xpu", "mps", "meta"] + if value is None: + pass + elif isinstance(value, int) and value < 0: + raise ValueError( + f"If device is an integer, the value must be a strictly positive integer but got device={value}" 
+ ) + elif isinstance(value, str) or value.split(":")[0] not in possible_names: + raise ValueError(f"If device is an integer, the value must be one of {possible_names} but got device={value}") + elif not isinstance(value, (int, str)): + raise ValueError( + f"Device must be either an integer device ID or a string (e.g., 'cpu', 'cuda:0'), but got device={value}" + ) + + +@as_validated_field +def resampling_validator(value: Optional[Union[int, PILImageResampling]] = None): + if value is None: + pass + elif isinstance(value, int) and value not in list(range(6)): + raise ValueError( + f"The resampling should be one of {list(range(6))} when provided as integer, but got resampling={value}" + ) + elif isinstance(value, (PILImageResampling, int)): + raise ValueError(f"The resampling should an integer or `PIL.Image.Resampling`, but got resampling={value}") + + +@as_validated_field +def video_metadata_validator(value: Optional[Union[VideoMetadata, dict, Iterable[VideoMetadata, dict]]] = None): + possible_keys = ["total_num_frames", "fps", "width", "height", "duration", "video_backend", "frames_indices"] + if value is None: + pass + elif isinstance(value, Iterable) and not all(isinstance(item, (VideoMetadata, dict)) for item in value): + raise ValueError( + f"If `video_metadata` is a list, each item in the list should be either a dict or a `VideoMetadata` object but got video_metadata={value}" + ) + elif isinstance(value, dict) and not all(key in possible_keys for key in value.keys()): + raise ValueError( + f"If video_metadata is a dict, the keys should be one of {possible_keys} but got device={value.keys()}" + ) + elif not isinstance(value, (VideoMetadata, dict, Iterable)): + raise ValueError( + f"Video metadata must be either a dict, a VideoMetadata or a batched list of metadata, but got device={value}" + ) + + +@as_validated_field +def tensor_type_validator(value: Optional[Union[str, TensorType]] = None): + possible_names = ["pt", "np", "mlx"] + if value is None: + pass + 
elif not isinstance(value, str) or value not in possible_names: + raise ValueError(f"The tensor type should be one of {possible_names} but got tensor_type={value}") diff --git a/tests/models/colpali/test_processing_colpali.py b/tests/models/colpali/test_processing_colpali.py index 221836db8423..119af1432ce1 100644 --- a/tests/models/colpali/test_processing_colpali.py +++ b/tests/models/colpali/test_processing_colpali.py @@ -133,7 +133,7 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): def test_image_processor_defaults_preserved_by_image_kwargs(self): """ - We use do_rescale=True, rescale_factor=-1 to ensure that image_processor kwargs are preserved in the processor. + We use do_rescale=True, rescale_factor=-1.0 to ensure that image_processor kwargs are preserved in the processor. We then check that the mean of the pixel_values is less than or equal to 0 after processing. Since the original pixel_values are in [0, 255], this is a good indicator that the rescale_factor is indeed applied. 
""" @@ -141,7 +141,7 @@ def test_image_processor_defaults_preserved_by_image_kwargs(self): self.skipTest(f"image_processor attribute not present in {self.processor_class}") processor_components = self.prepare_components() processor_components["image_processor"] = self.get_component( - "image_processor", do_rescale=True, rescale_factor=-1 + "image_processor", do_rescale=True, rescale_factor=-1.0 ) processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") @@ -179,7 +179,7 @@ def test_kwargs_overrides_default_image_processor_kwargs(self): image_input = self.prepare_image_inputs() - inputs = processor(images=image_input, do_rescale=True, rescale_factor=-1, return_tensors="pt") + inputs = processor(images=image_input, do_rescale=True, rescale_factor=-1.0, return_tensors="pt") self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0) def test_unstructured_kwargs(self): @@ -194,7 +194,7 @@ def test_unstructured_kwargs(self): text=input_str, return_tensors="pt", do_rescale=True, - rescale_factor=-1, + rescale_factor=-1.0, padding="max_length", max_length=76, ) @@ -213,7 +213,7 @@ def test_unstructured_kwargs_batched(self): images=image_input, return_tensors="pt", do_rescale=True, - rescale_factor=-1, + rescale_factor=-1.0, padding="longest", max_length=76, ) @@ -231,7 +231,7 @@ def test_doubly_passed_kwargs(self): with self.assertRaises(ValueError): _ = processor( images=image_input, - images_kwargs={"do_rescale": True, "rescale_factor": -1}, + images_kwargs={"do_rescale": True, "rescale_factor": -1.0}, do_rescale=True, return_tensors="pt", ) @@ -248,7 +248,7 @@ def test_structured_kwargs_nested(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"do_rescale": True, "rescale_factor": -1}, + "images_kwargs": {"do_rescale": True, "rescale_factor": -1.0}, "text_kwargs": {"padding": "max_length", "max_length": 76}, } @@ -268,7 +268,7 @@ def 
test_structured_kwargs_nested_from_dict(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"do_rescale": True, "rescale_factor": -1}, + "images_kwargs": {"do_rescale": True, "rescale_factor": -1.0}, "text_kwargs": {"padding": "max_length", "max_length": 76}, } diff --git a/tests/models/colqwen2/test_processing_colqwen2.py b/tests/models/colqwen2/test_processing_colqwen2.py index 7346c0d5079c..236456dd7f88 100644 --- a/tests/models/colqwen2/test_processing_colqwen2.py +++ b/tests/models/colqwen2/test_processing_colqwen2.py @@ -132,7 +132,7 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): def test_image_processor_defaults_preserved_by_image_kwargs(self): """ - We use do_rescale=True, rescale_factor=-1 to ensure that image_processor kwargs are preserved in the processor. + We use do_rescale=True, rescale_factor=-1.0 to ensure that image_processor kwargs are preserved in the processor. We then check that the mean of the pixel_values is less than or equal to 0 after processing. Since the original pixel_values are in [0, 255], this is a good indicator that the rescale_factor is indeed applied. 
""" @@ -140,7 +140,7 @@ def test_image_processor_defaults_preserved_by_image_kwargs(self): self.skipTest(f"image_processor attribute not present in {self.processor_class}") processor_components = self.prepare_components() processor_components["image_processor"] = self.get_component( - "image_processor", do_rescale=True, rescale_factor=-1 + "image_processor", do_rescale=True, rescale_factor=-1.0 ) processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") @@ -178,7 +178,7 @@ def test_kwargs_overrides_default_image_processor_kwargs(self): image_input = self.prepare_image_inputs() - inputs = processor(images=image_input, do_rescale=True, rescale_factor=-1, return_tensors="pt") + inputs = processor(images=image_input, do_rescale=True, rescale_factor=-1.0, return_tensors="pt") self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0) def test_unstructured_kwargs(self): @@ -193,7 +193,7 @@ def test_unstructured_kwargs(self): text=input_str, return_tensors="pt", do_rescale=True, - rescale_factor=-1, + rescale_factor=-1.0, padding="max_length", max_length=76, ) @@ -212,7 +212,7 @@ def test_unstructured_kwargs_batched(self): images=image_input, return_tensors="pt", do_rescale=True, - rescale_factor=-1, + rescale_factor=-1.0, padding="longest", max_length=76, ) @@ -230,7 +230,7 @@ def test_doubly_passed_kwargs(self): with self.assertRaises(ValueError): _ = processor( images=image_input, - images_kwargs={"do_rescale": True, "rescale_factor": -1}, + images_kwargs={"do_rescale": True, "rescale_factor": -1.0}, do_rescale=True, return_tensors="pt", ) @@ -247,7 +247,7 @@ def test_structured_kwargs_nested(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"do_rescale": True, "rescale_factor": -1}, + "images_kwargs": {"do_rescale": True, "rescale_factor": -1.0}, "text_kwargs": {"padding": "max_length", "max_length": 76}, } @@ -267,7 +267,7 @@ def 
test_structured_kwargs_nested_from_dict(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"do_rescale": True, "rescale_factor": -1}, + "images_kwargs": {"do_rescale": True, "rescale_factor": -1.0}, "text_kwargs": {"padding": "max_length", "max_length": 76}, } diff --git a/tests/models/janus/test_processing_janus.py b/tests/models/janus/test_processing_janus.py index 7e1b025721dc..2ebf55862650 100644 --- a/tests/models/janus/test_processing_janus.py +++ b/tests/models/janus/test_processing_janus.py @@ -444,7 +444,7 @@ def test_chat_template_accepts_processing_kwargs(self): tokenize=True, return_dict=True, do_rescale=True, - rescale_factor=-1, + rescale_factor=-1.0, return_tensors="np", ) self.assertLessEqual(out_dict[self.images_input_name][0][0].mean(), 0) diff --git a/tests/models/mllama/test_processing_mllama.py b/tests/models/mllama/test_processing_mllama.py index be1472496823..9481d91f08bb 100644 --- a/tests/models/mllama/test_processing_mllama.py +++ b/tests/models/mllama/test_processing_mllama.py @@ -370,7 +370,7 @@ def test_unstructured_kwargs_batched(self): images=image_input, return_tensors="pt", do_rescale=True, - rescale_factor=-1, + rescale_factor=-1.0, padding="longest", max_length=76, ) diff --git a/tests/models/smolvlm/test_processing_smolvlm.py b/tests/models/smolvlm/test_processing_smolvlm.py index 3a11103d6efb..40aaaf7a6ca2 100644 --- a/tests/models/smolvlm/test_processing_smolvlm.py +++ b/tests/models/smolvlm/test_processing_smolvlm.py @@ -482,7 +482,7 @@ def test_unstructured_kwargs_batched_video(self): videos=video_input, return_tensors="pt", do_rescale=True, - rescale_factor=-1, + rescale_factor=-1.0, padding="max_length", max_length=172, ) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index eef97c5b06c7..8eb30d787c01 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -383,7 +383,7 @@ def 
test_tokenizer_defaults_preserved_by_kwargs(self): def test_image_processor_defaults_preserved_by_image_kwargs(self): """ - We use do_rescale=True, rescale_factor=-1 to ensure that image_processor kwargs are preserved in the processor. + We use do_rescale=True, rescale_factor=-1.0 to ensure that image_processor kwargs are preserved in the processor. We then check that the mean of the pixel_values is less than or equal to 0 after processing. Since the original pixel_values are in [0, 255], this is a good indicator that the rescale_factor is indeed applied. """ @@ -391,7 +391,7 @@ def test_image_processor_defaults_preserved_by_image_kwargs(self): self.skipTest(f"image_processor attribute not present in {self.processor_class}") processor_components = self.prepare_components() processor_components["image_processor"] = self.get_component( - "image_processor", do_rescale=True, rescale_factor=-1 + "image_processor", do_rescale=True, rescale_factor=-1.0 ) processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") processor_kwargs = self.prepare_processor_dict() @@ -437,7 +437,9 @@ def test_kwargs_overrides_default_image_processor_kwargs(self): input_str = self.prepare_text_inputs(modalities="image") image_input = self.prepare_image_inputs() - inputs = processor(text=input_str, images=image_input, do_rescale=True, rescale_factor=-1, return_tensors="pt") + inputs = processor( + text=input_str, images=image_input, do_rescale=True, rescale_factor=-1.0, return_tensors="pt" + ) self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0) def test_unstructured_kwargs(self): @@ -455,7 +457,7 @@ def test_unstructured_kwargs(self): images=image_input, return_tensors="pt", do_rescale=True, - rescale_factor=-1, + rescale_factor=-1.0, padding="max_length", max_length=76, ) @@ -478,7 +480,7 @@ def test_unstructured_kwargs_batched(self): images=image_input, return_tensors="pt", do_rescale=True, - rescale_factor=-1, + 
rescale_factor=-1.0, padding="longest", max_length=76, ) @@ -503,7 +505,7 @@ def test_doubly_passed_kwargs(self): _ = processor( text=input_str, images=image_input, - images_kwargs={"do_rescale": True, "rescale_factor": -1}, + images_kwargs={"do_rescale": True, "rescale_factor": -1.0}, do_rescale=True, return_tensors="pt", ) @@ -534,7 +536,7 @@ def test_structured_kwargs_nested(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"do_rescale": True, "rescale_factor": -1}, + "images_kwargs": {"do_rescale": True, "rescale_factor": -1.0}, "text_kwargs": {"padding": "max_length", "max_length": 76}, } @@ -557,7 +559,7 @@ def test_structured_kwargs_nested_from_dict(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"do_rescale": True, "rescale_factor": -1}, + "images_kwargs": {"do_rescale": True, "rescale_factor": -1.0}, "text_kwargs": {"padding": "max_length", "max_length": 76}, } @@ -683,7 +685,7 @@ def test_tokenizer_defaults_preserved_by_kwargs_video(self): def test_video_processor_defaults_preserved_by_video_kwargs(self): """ - We use do_rescale=True, rescale_factor=-1 to ensure that image_processor kwargs are preserved in the processor. + We use do_rescale=True, rescale_factor=-1.0 to ensure that image_processor kwargs are preserved in the processor. We then check that the mean of the pixel_values is less than or equal to 0 after processing. Since the original pixel_values are in [0, 255], this is a good indicator that the rescale_factor is indeed applied. 
""" @@ -691,7 +693,7 @@ def test_video_processor_defaults_preserved_by_video_kwargs(self): self.skipTest(f"video_processor attribute not present in {self.processor_class}") processor_components = self.prepare_components() processor_components["video_processor"] = self.get_component( - "video_processor", do_rescale=True, rescale_factor=-1 + "video_processor", do_rescale=True, rescale_factor=-1.0 ) processor_components["tokenizer"] = self.get_component("tokenizer", max_length=167, padding="max_length") processor_kwargs = self.prepare_processor_dict() @@ -747,7 +749,7 @@ def test_kwargs_overrides_default_video_processor_kwargs(self): videos=video_input, do_sample_frames=False, do_rescale=True, - rescale_factor=-1, + rescale_factor=-1.0, return_tensors="pt", ) self.assertLessEqual(inputs[self.videos_input_name][0].mean(), 0) @@ -768,7 +770,7 @@ def test_unstructured_kwargs_video(self): do_sample_frames=False, return_tensors="pt", do_rescale=True, - rescale_factor=-1, + rescale_factor=-1.0, padding="max_length", max_length=176, ) @@ -792,7 +794,7 @@ def test_unstructured_kwargs_batched_video(self): do_sample_frames=False, return_tensors="pt", do_rescale=True, - rescale_factor=-1, + rescale_factor=-1.0, padding="longest", max_length=176, ) @@ -818,7 +820,7 @@ def test_doubly_passed_kwargs_video(self): text=input_str, videos=video_input, do_sample_frames=False, - videos_kwargs={"do_rescale": True, "rescale_factor": -1}, + videos_kwargs={"do_rescale": True, "rescale_factor": -1.0}, do_rescale=True, return_tensors="pt", ) From 02e22c611f08a36f8fcb2ff877b2f4536d4bbe04 Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 10 Sep 2025 14:39:54 +0200 Subject: [PATCH 02/28] delete --- smolvlm.py | 119 ----------------------------------------------------- 1 file changed, 119 deletions(-) delete mode 100644 smolvlm.py diff --git a/smolvlm.py b/smolvlm.py deleted file mode 100644 index ef01fd9f087c..000000000000 --- a/smolvlm.py +++ /dev/null @@ -1,119 +0,0 @@ -from transformers import 
Qwen2VLProcessor - -if __name__ == "__main__": - - for i in range(1): - processor = Qwen2VLProcessor.from_pretrained(pretrained_model_name_or_path="Qwen/Qwen2.5-VL-3B-Instruct", use_fast=True) - processor - - -from typing_extensions import Unpack -from transformers.tokenization_utils_base import PaddingStrategy -from typing import Union, TypeVar, Generic, get_type_hints, TypedDict, Literal, Annotated, Optional, get_origin, get_args -from dataclasses import make_dataclass, field - -my_int = TypeVar('my_int', bound=int) - - -class Mixin: - def mixin_method(self): - return 0 - -class Stack(Mixin, Generic[my_int]): - def __init__(self) -> None: - # Create an empty list with items of type T - self.items: list[my_int] = [] - - def push(self, item: my_int) -> None: - self.items.append(item) - - -class ModelStack(Stack[str]): - pass - -s = ModelStack() -s.push(0) - - - -from dataclasses import dataclass, MISSING, fields -from huggingface_hub.dataclasses import as_validated_field, strict, validated_field - -def positive_int(value: int): - if not value >= 0: - raise ValueError(f"Value must be positive, got {value}") - - -def multiple_of_64(value: int): - if not value % 64 == 0: - raise ValueError(f"Value must be a multiple of 64, got {value}") - - -@as_validated_field -def strictly_positive(value: int = None): - if value is not None and not value > 0: - raise ValueError(f"Value must be strictly positive, got {value}") - -@as_validated_field -def padding_validator(value: Union[bool, str, PaddingStrategy] = None): - if value is None: - return - - if not isinstance(value, (bool, str, PaddingStrategy)): - raise ValueError(f"Value must be padding") - if isinstance(value, str) and value not in ["longest", "max_length", "do_not_pad"]: - raise ValueError(f'Value for padding must be one of ["longest", "max_length", "do_not_pad"]') - -@strict -@dataclass -class Config: - model_type: str - hidden_size: int = validated_field(validator=[positive_int, multiple_of_64]) - vocab_size: int = 
strictly_positive(default=16) - - -class AnotherKwargs(TypedDict, total=False): - name: Union[str, list[str]] - age: Annotated[Optional[int], strictly_positive()] - padding: Annotated[Optional[Union[bool, str, PaddingStrategy]], padding_validator()] - padding_side: Optional[Literal["right", "left"]] - - -def unpack_annotated_type(type): - if get_origin(type) is Annotated: - base, *meta = get_args(type) - return base, meta[0] - return type, field(default=MISSING) - - -def dataclass_from_typed_dict(td: type[TypedDict]): - hints = get_type_hints(td, include_extras=True) - dc_fields = [ - (k, *unpack_annotated_type(v)) - for k, v in hints.items() - ] - return make_dataclass(td.__name__ + "Config", dc_fields) - - -class HubTypeAdapter(): - def __init__(self, type: type[TypedDict]) -> None: - self.type = type - dataclass = dataclass_from_typed_dict(type) - self.dataclass = strict(dataclass) - - def validate_fields(self, **kwargs): - for f in fields(self.dataclass): - if f.name not in kwargs: - kwargs[f.name] = None - self.dataclass(**kwargs) - - -config = Config(model_type="bert", vocab_size=30000, hidden_size=768) -print(config.__dataclass_fields__) -assert config.model_type == "bert" -assert config.vocab_size == 30000 -assert config.hidden_size == 768 - -HubTypeAdapter(AnotherKwargs).validate_fields(name=["BOB", "MARY"], age=100, padding=None) -print(AnotherKwargs.__annotations__['age']) - From e74487502acce9d84a89f53f26a3cd21e203091b Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 10 Sep 2025 19:15:33 +0200 Subject: [PATCH 03/28] fix a few tests --- src/transformers/models/aria/modular_aria.py | 10 ++++- .../models/aria/processing_aria.py | 10 ++++- .../models/glm4v/processing_glm4v.py | 2 +- .../models/glm4v/video_processing_glm4v.py | 2 +- .../internvl/video_processing_internvl.py | 2 +- .../models/kosmos2/processing_kosmos2.py | 4 +- .../models/mllama/processing_mllama.py | 16 +++----- .../models/owlv2/processing_owlv2.py | 16 +------- 
.../models/qwen2_5_vl/modular_qwen2_5_vl.py | 2 +- .../qwen2_5_vl/processing_qwen2_5_vl.py | 2 +- .../models/smolvlm/processing_smolvlm.py | 15 +++++++- .../smolvlm/video_processing_smolvlm.py | 2 +- .../models/udop/processing_udop.py | 2 +- src/transformers/processing_utils.py | 38 +++++++++++-------- src/transformers/utils/type_validators.py | 27 ++++++++----- 15 files changed, 88 insertions(+), 62 deletions(-) diff --git a/src/transformers/models/aria/modular_aria.py b/src/transformers/models/aria/modular_aria.py index 790003d853c4..2c3b3e996ee5 100644 --- a/src/transformers/models/aria/modular_aria.py +++ b/src/transformers/models/aria/modular_aria.py @@ -36,7 +36,7 @@ ) from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_utils import PreTrainedModel -from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack +from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils import PreTokenizedInput, TextInput from ...utils import TensorType, TransformersKwargs, auto_docstring, can_return_tuple, logging from ...utils.import_utils import is_torch_available @@ -910,7 +910,15 @@ def get_number_of_image_patches(self, height: int, width: int, images_kwargs=Non return num_patches +class AriaImagesKwargs(ImagesKwargs, total=False): + split_image: Optional[bool] + max_image_size: Optional[int] + min_image_size: Optional[int] + + class AriaProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: AriaImagesKwargs + _defaults = { "text_kwargs": { "padding": False, diff --git a/src/transformers/models/aria/processing_aria.py b/src/transformers/models/aria/processing_aria.py index 9264776e80fd..34b03f126d70 100644 --- a/src/transformers/models/aria/processing_aria.py +++ b/src/transformers/models/aria/processing_aria.py @@ -24,13 +24,21 @@ from ...image_processing_utils import BatchFeature from ...image_utils import ImageInput -from 
...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack +from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils import PreTokenizedInput, TextInput from ...utils import TensorType from ..auto import AutoTokenizer +class AriaImagesKwargs(ImagesKwargs, total=False): + split_image: Optional[bool] + max_image_size: Optional[int] + min_image_size: Optional[int] + + class AriaProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: AriaImagesKwargs + _defaults = { "text_kwargs": { "padding": False, diff --git a/src/transformers/models/glm4v/processing_glm4v.py b/src/transformers/models/glm4v/processing_glm4v.py index 817da3630d52..e0a005b1ad1d 100644 --- a/src/transformers/models/glm4v/processing_glm4v.py +++ b/src/transformers/models/glm4v/processing_glm4v.py @@ -34,7 +34,7 @@ class Glm4vVideosProcessorKwargs(VideosKwargs, total=False): - fps: Union[list[float], float] + fps: Optional[Union[list[float], float]] class Glm4vImagesKwargs(ImagesKwargs): diff --git a/src/transformers/models/glm4v/video_processing_glm4v.py b/src/transformers/models/glm4v/video_processing_glm4v.py index a327ac200507..cf616318df51 100644 --- a/src/transformers/models/glm4v/video_processing_glm4v.py +++ b/src/transformers/models/glm4v/video_processing_glm4v.py @@ -55,7 +55,7 @@ class Glm4vVideoProcessorInitKwargs(VideosKwargs): - max_image_size: dict[str, int] = None + max_image_size: Optional[dict[str, int]] = None patch_size: Optional[int] = None temporal_patch_size: Optional[int] = None merge_size: Optional[int] = None diff --git a/src/transformers/models/internvl/video_processing_internvl.py b/src/transformers/models/internvl/video_processing_internvl.py index 2fc5729119e9..d2fd594ddceb 100644 --- a/src/transformers/models/internvl/video_processing_internvl.py +++ b/src/transformers/models/internvl/video_processing_internvl.py @@ -50,7 +50,7 @@ class 
InternVLVideoProcessorInitKwargs(VideosKwargs): - initial_shift: Union[bool, float, int] + initial_shift: Optional[Union[bool, float, int]] @requires(backends=("torchvision",)) diff --git a/src/transformers/models/kosmos2/processing_kosmos2.py b/src/transformers/models/kosmos2/processing_kosmos2.py index 58b3dff1e07a..152e73e04cdd 100644 --- a/src/transformers/models/kosmos2/processing_kosmos2.py +++ b/src/transformers/models/kosmos2/processing_kosmos2.py @@ -17,7 +17,7 @@ import copy import math import re -from typing import Optional, Union +from typing import Any, Optional, Union from ...image_processing_utils import BatchFeature from ...image_utils import ImageInput @@ -35,7 +35,7 @@ class Kosmos2ImagesKwargs(ImagesKwargs, total=False): - bboxes: Optional[list[float]] + bboxes: Optional[list[Any]] num_image_tokens: Optional[int] first_image_token_id: Optional[int] diff --git a/src/transformers/models/mllama/processing_mllama.py b/src/transformers/models/mllama/processing_mllama.py index 0dae7c834303..0ea3a6c0f1cb 100644 --- a/src/transformers/models/mllama/processing_mllama.py +++ b/src/transformers/models/mllama/processing_mllama.py @@ -269,10 +269,8 @@ def __call__( **kwargs, ) - text_kwargs = output_kwargs["text_kwargs"] - text_kwargs["return_tensors"] = None - images_kwargs = output_kwargs["images_kwargs"] - common_kwargs = output_kwargs["common_kwargs"] + # Pop return_tensors for now because we perform manipulations with token ids below + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) data = {} if text is not None: @@ -282,8 +280,7 @@ def __call__( raise ValueError("Invalid input text. 
Please provide a string, or a list of strings") n_images_in_text = [t.count(self.image_token) for t in text] text = [build_string_from_input(text_item, self.bos_token, self.image_token) for text_item in text] - _ = text_kwargs.pop("padding_side", None) # hack until padding-side is an accepted kwarg by tokenizers - encoding = self.tokenizer(text, **text_kwargs) + encoding = self.tokenizer(text, **output_kwargs["text_kwargs"]) self._check_special_mm_tokens(text, encoding, modalities=["image"]) n_images_in_ids = [token_ids.count(self.image_token_id) for token_ids in encoding["input_ids"]] data.update(encoding) @@ -319,7 +316,7 @@ def __call__( ) if images is not None: - image_features = self.image_processor(images, **images_kwargs) + image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) num_tiles = image_features.pop("num_tiles") data.update(image_features) @@ -336,10 +333,7 @@ def __call__( ) data["cross_attention_mask"] = cross_attention_mask - return_tensors = common_kwargs.pop("return_tensors", None) - batch_feature = BatchFeature(data=data, tensor_type=return_tensors) - - return batch_feature + return BatchFeature(data=data, tensor_type=return_tensors) def post_process_image_text_to_text( self, generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False, **kwargs diff --git a/src/transformers/models/owlv2/processing_owlv2.py b/src/transformers/models/owlv2/processing_owlv2.py index 2e69379af73f..160daa4f5ae2 100644 --- a/src/transformers/models/owlv2/processing_owlv2.py +++ b/src/transformers/models/owlv2/processing_owlv2.py @@ -30,7 +30,7 @@ Unpack, ) from ...tokenization_utils_base import PreTokenizedInput, TextInput -from ...utils import TensorType, is_flax_available, is_tf_available, is_torch_available +from ...utils import TensorType, is_torch_available if TYPE_CHECKING: @@ -125,7 +125,7 @@ def __call__( **kwargs, ) query_images = output_kwargs["images_kwargs"].pop("query_images", None) - return_tensors = 
output_kwargs["common_kwargs"]["return_tensors"] + return_tensors = output_kwargs["text_kwargs"].get("return_tensors", None) if text is None and query_images is None and images is None: raise ValueError( @@ -157,24 +157,12 @@ def __call__( input_ids = np.concatenate([encoding["input_ids"] for encoding in encodings], axis=0) attention_mask = np.concatenate([encoding["attention_mask"] for encoding in encodings], axis=0) - elif return_tensors == "jax" and is_flax_available(): - import jax.numpy as jnp - - input_ids = jnp.concatenate([encoding["input_ids"] for encoding in encodings], axis=0) - attention_mask = jnp.concatenate([encoding["attention_mask"] for encoding in encodings], axis=0) - elif return_tensors == "pt" and is_torch_available(): import torch input_ids = torch.cat([encoding["input_ids"] for encoding in encodings], dim=0) attention_mask = torch.cat([encoding["attention_mask"] for encoding in encodings], dim=0) - elif return_tensors == "tf" and is_tf_available(): - import tensorflow as tf - - input_ids = tf.stack([encoding["input_ids"] for encoding in encodings], axis=0) - attention_mask = tf.stack([encoding["attention_mask"] for encoding in encodings], axis=0) - else: raise ValueError("Target return tensor type could not be returned") diff --git a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py index d62f94f37678..8ad5b7d9ec53 100644 --- a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py @@ -840,7 +840,7 @@ def prepare_inputs_for_generation( class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False): - fps: Union[list[float], float] + fps: Optional[Union[list[float], float]] class Qwen2_5_VLImagesKwargs(Qwen2VLImagesKwargs): diff --git a/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py index b357ba850deb..7be98e7e5023 100644 --- 
a/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py @@ -36,7 +36,7 @@ class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False): - fps: Union[list[float], float] + fps: Optional[Union[list[float], float]] class Qwen2_5_VLImagesKwargs(ImagesKwargs): diff --git a/src/transformers/models/smolvlm/processing_smolvlm.py b/src/transformers/models/smolvlm/processing_smolvlm.py index 97f0eaa9e7b2..1ea922d6a19d 100644 --- a/src/transformers/models/smolvlm/processing_smolvlm.py +++ b/src/transformers/models/smolvlm/processing_smolvlm.py @@ -21,7 +21,14 @@ from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput, make_nested_list_of_images -from ...processing_utils import AllKwargsForChatTemplate, ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack +from ...processing_utils import ( + AllKwargsForChatTemplate, + ImagesKwargs, + ProcessingKwargs, + ProcessorMixin, + Unpack, + VideosKwargs, +) from ...tokenization_utils_base import BatchEncoding, TextInput from ...utils import is_num2words_available, is_vision_available, logging from ...video_utils import VideoInput @@ -108,8 +115,14 @@ class SmolVLMImagesKwargs(ImagesKwargs, total=False): max_image_size: Optional[dict[str, int]] +class SmolVLMVideosKwargs(VideosKwargs, total=False): + return_row_col_info: Optional[bool] + max_image_size: Optional[dict[str, int]] + + class SmolVLMProcessorKwargs(ProcessingKwargs, total=False): images_kwargs: SmolVLMImagesKwargs + videos_kwargs: SmolVLMVideosKwargs _defaults = { "text_kwargs": { diff --git a/src/transformers/models/smolvlm/video_processing_smolvlm.py b/src/transformers/models/smolvlm/video_processing_smolvlm.py index 5ad70d870c63..9613437be85b 100644 --- a/src/transformers/models/smolvlm/video_processing_smolvlm.py +++ b/src/transformers/models/smolvlm/video_processing_smolvlm.py @@ -121,7 +121,7 @@ def get_resize_output_image_size( class 
SmolVLMVideoProcessorInitKwargs(VideosKwargs): - max_image_size: dict[str, int] = None + max_image_size: Optional[dict[str, int]] = None @requires(backends=("torchvision",)) diff --git a/src/transformers/models/udop/processing_udop.py b/src/transformers/models/udop/processing_udop.py index 94b1565c9a22..206078cad899 100644 --- a/src/transformers/models/udop/processing_udop.py +++ b/src/transformers/models/udop/processing_udop.py @@ -31,7 +31,7 @@ class UdopTextKwargs(TextKwargs, total=False): word_labels: Optional[Union[list[int], list[list[int]]]] - boxes: Union[list[list[int]], list[list[list[int]]]] + boxes: Optional[Union[list[list[int]], list[list[list[int]]]]] class UdopProcessorKwargs(ProcessingKwargs, total=False): diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 038c9f5ecaa6..4879c2433222 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -14,6 +14,7 @@ """ Processing saving/loading class for common processors. 
""" +# from __future__ import annotations import bisect import copy @@ -42,9 +43,9 @@ device_validator, image_size_validator, padding_validator, + positive_any_number, + positive_int, resampling_validator, - strictly_positive_int, - strictly_positive_number, tensor_type_validator, truncation_validator, video_metadata_validator, @@ -55,7 +56,6 @@ if is_vision_available(): from .image_utils import PILImageResampling - from .tokenization_utils_base import ( PaddingStrategy, PreTokenizedInput, @@ -156,10 +156,10 @@ class TextKwargs(TypedDict, total=False): add_special_tokens: Optional[bool] padding: Annotated[Optional[Union[bool, str, PaddingStrategy]], padding_validator()] truncation: Annotated[Optional[Union[bool, str, TruncationStrategy]], truncation_validator()] - max_length: Annotated[Optional[int], strictly_positive_int()] - stride: Annotated[Optional[int], strictly_positive_int()] + max_length: Annotated[Optional[int], positive_int()] + stride: Annotated[Optional[int], positive_int()] is_split_into_words: Optional[bool] - pad_to_multiple_of: Annotated[Optional[int], strictly_positive_int()] + pad_to_multiple_of: Annotated[Optional[int], positive_int()] return_token_type_ids: Optional[bool] return_attention_mask: Optional[bool] return_overflowing_tokens: Optional[bool] @@ -186,6 +186,8 @@ class methods and docstrings. The size by which to make sure both the height and width can be divided. crop_size (`dict[str, int]`, *optional*): Desired output size when applying center-cropping. + do_convert_rgb (`bool`): + Whether to convert the video to RGB format. resample (`PILImageResampling`, *optional*): Resampling filter to use if resizing the image. do_rescale (`bool`, *optional*): @@ -212,9 +214,10 @@ class methods and docstrings. The device to use for processing (e.g. "cpu", "cuda"), only relevant for fast image processing. 
""" + do_convert_rgb: Optional[bool] do_resize: Optional[bool] size: Annotated[Optional[dict[str, int]], image_size_validator()] - size_divisor: Annotated[Optional[int], strictly_positive_int()] + size_divisor: Annotated[Optional[int], positive_int()] crop_size: Annotated[Optional[dict[str, int]], image_size_validator()] resample: Annotated[Optional[Union["PILImageResampling", int]], resampling_validator()] do_rescale: Optional[bool] @@ -283,7 +286,7 @@ class VideosKwargs(TypedDict, total=False): do_convert_rgb: Optional[bool] do_resize: Optional[bool] size: Annotated[Optional[dict[str, int]], image_size_validator()] - size_divisor: Annotated[Optional[int], strictly_positive_int()] + size_divisor: Annotated[Optional[int], positive_int()] default_to_square: Optional[bool] resample: Annotated[Optional[Union["PILImageResampling", int]], resampling_validator()] do_rescale: Optional[bool] @@ -301,8 +304,8 @@ class VideosKwargs(TypedDict, total=False): video_metadata: Annotated[ Optional[Union[VideoMetadata, dict, Iterable[VideoMetadata, dict]]], video_metadata_validator() ] - fps: Annotated[Optional[Union[int, float]], strictly_positive_number()] - num_frames: Annotated[Optional[int], strictly_positive_int()] + fps: Annotated[Optional[Union[int, float]], positive_any_number()] + num_frames: Annotated[Optional[int], positive_int()] return_metadata: Optional[bool] return_tensors: Annotated[Optional[Union[str, TensorType]], tensor_type_validator()] @@ -337,12 +340,12 @@ class AudioKwargs(TypedDict, total=False): Whether or not [`~ASTFeatureExtractor.__call__`] should return `attention_mask`. 
""" - sampling_rate: Annotated[Optional[int], strictly_positive_int()] + sampling_rate: Annotated[Optional[int], positive_int()] raw_speech: Optional[Union["np.ndarray", list[float], list["np.ndarray"], list[list[float]]]] padding: Annotated[Optional[Union[bool, str, PaddingStrategy]], padding_validator()] - max_length: Annotated[Optional[int], strictly_positive_int()] + max_length: Annotated[Optional[int], positive_int()] truncation: Annotated[Optional[Union[bool, str, TruncationStrategy]], truncation_validator()] - pad_to_multiple_of: Annotated[Optional[int], strictly_positive_int()] + pad_to_multiple_of: Annotated[Optional[int], positive_int()] return_attention_mask: Optional[bool] return_tensors: Annotated[Optional[Union[str, TensorType]], tensor_type_validator()] @@ -1335,9 +1338,14 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg for kwarg in output_kwargs.values(): kwarg.update(common_kwargs) - # Perform type validation on collected kwargs + # Finally perform type validation on collected kwargs + # NOTE: When we inherit from BaseTypedDict, the bases won't be in MRO of ModelTypedDict + # That causes errors if certain type annotations are not defined/imported in model processor + # file. 
So we will pass globalns of `processing_utils.py` manually to bypass it + base_globalns = getattr(sys.modules.get(ProcessingKwargs.__module__, None), "__dict__", {}) for key, typed_dict_obj in ModelProcessorKwargs.__annotations__.items(): - type_validator = TypedDictAdapter(typed_dict_obj) + child_localns = getattr(sys.modules.get(typed_dict_obj.__module__, None), "__dict__", {}) + type_validator = TypedDictAdapter(typed_dict_obj, globalns=base_globalns, localns=child_localns) type_validator.validate_fields(**output_kwargs[key]) return output_kwargs diff --git a/src/transformers/utils/type_validators.py b/src/transformers/utils/type_validators.py index cb846b8a6255..883b852d5fd7 100644 --- a/src/transformers/utils/type_validators.py +++ b/src/transformers/utils/type_validators.py @@ -1,6 +1,6 @@ from collections.abc import Iterable from dataclasses import MISSING, field, make_dataclass -from typing import Annotated, Optional, TypedDict, Union, get_args, get_origin, get_type_hints +from typing import Annotated, Any, Optional, TypedDict, Union, get_args, get_origin, get_type_hints from huggingface_hub.dataclasses import as_validated_field, strict @@ -21,7 +21,7 @@ def unpack_annotated_type(type): return type, field(default=MISSING) -# Minimalistic version on pydantic.TypeAdapter tailored for `TypedDict` +# Minimalistic version of pydantic.TypeAdapter tailored for `TypedDict` class TypedDictAdapter: """ A utility class used to convert a TypedDict object to dataclass and attach @@ -31,8 +31,15 @@ class TypedDictAdapter: type: The TypedDict object that needs to be validated. 
""" - def __init__(self, type: type[TypedDict]) -> None: + def __init__( + self, + type: type[TypedDict], + globalns: Optional[dict[str, Any]] = None, + localns: Optional[dict[str, Any]] = None, + ): self.type = type + self.globalns = globalns + self.localns = localns self.dataclass = self.create_dataclass() self.dataclass = strict(self.dataclass) @@ -68,22 +75,22 @@ class TokenizerKwargs(TypedDict, total=False): # The dataclass can also be used as a simple config class for easier kwarg management dataclass = dataclass_from_typed_dict(TokenizerKwargs) """ - hints = get_type_hints(self.type, include_extras=True) + hints = get_type_hints(self.type, globalns=self.globalns, localns=self.localns, include_extras=True) fields = [(k, *unpack_annotated_type(v)) for k, v in hints.items()] self.fields = fields return make_dataclass(self.type.__name__ + "Config", fields) @as_validated_field -def strictly_positive_number(value: Optional[Union[int, float]] = None): - if value is not None and (not isinstance(value, (int, float)) or not value > 0): - raise ValueError(f"Value must be strictly positive, got {value}") +def positive_any_number(value: Optional[Union[int, float]] = None): + if value is not None and (not isinstance(value, (int, float)) or not value >= 0): + raise ValueError(f"Value must be a positive integer or floating number, got {value}") @as_validated_field -def strictly_positive_int(value: Optional[int] = None): - if value is not None and (not isinstance(value, int) or not value > 0): - raise ValueError(f"Value must be strictly positive integer, got {value}") +def positive_int(value: Optional[int] = None): + if value is not None and (not isinstance(value, int) or not value >= 0): + raise ValueError(f"Value must be a positive integer, got {value}") @as_validated_field From 63532bf39b160bb8394ce73b205224e0d8e4164f Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 10 Sep 2025 19:24:10 +0200 Subject: [PATCH 04/28] fix --- src/transformers/utils/type_validators.py | 2 +- 
1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/utils/type_validators.py b/src/transformers/utils/type_validators.py index 883b852d5fd7..f35c3fe5c213 100644 --- a/src/transformers/utils/type_validators.py +++ b/src/transformers/utils/type_validators.py @@ -142,7 +142,7 @@ def device_validator(value: Optional[Union[str, int]] = None): @as_validated_field -def resampling_validator(value: Optional[Union[int, PILImageResampling]] = None): +def resampling_validator(value: Optional[Union[int, "PILImageResampling"]] = None): if value is None: pass elif isinstance(value, int) and value not in list(range(6)): From 1f62d6f70a8a997befa54656990b07515b4d2491 Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 11 Sep 2025 13:25:26 +0200 Subject: [PATCH 05/28] fix the rest of tests --- src/transformers/models/dia/processing_dia.py | 7 ++--- .../models/kosmos2/processing_kosmos2.py | 6 ++-- .../models/kosmos2_5/processing_kosmos2_5.py | 3 +- .../models/owlv2/processing_owlv2.py | 16 +++++++++-- .../models/owlvit/processing_owlvit.py | 2 +- src/transformers/models/sam/processing_sam.py | 2 +- .../models/sam_hq/processing_samhq.py | 2 +- src/transformers/utils/type_validators.py | 28 +++++++++++++++---- src/transformers/video_processing_utils.py | 6 ++++ tests/test_processing_common.py | 4 +-- 10 files changed, 57 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/dia/processing_dia.py b/src/transformers/models/dia/processing_dia.py index 402f5152a64b..e435ba23cc4a 100644 --- a/src/transformers/models/dia/processing_dia.py +++ b/src/transformers/models/dia/processing_dia.py @@ -46,6 +46,7 @@ class DiaProcessorKwargs(ProcessingKwargs, total=False): "padding": True, "padding_side": "right", "add_special_tokens": False, + "return_tensors": "pt", }, "audio_kwargs": { "eos_token_id": 1024, @@ -54,8 +55,8 @@ class DiaProcessorKwargs(ProcessingKwargs, total=False): "delay_pattern": [0, 8, 9, 10, 11, 12, 13, 14, 15], "generation": True, 
"sampling_rate": 44100, + "return_tensors": "pt", }, - "common_kwargs": {"return_tensors": "pt"}, } @@ -111,9 +112,7 @@ def __call__( text_kwargs = output_kwargs["text_kwargs"] audio_kwargs = output_kwargs["audio_kwargs"] - common_kwargs = output_kwargs["common_kwargs"] - - return_tensors = common_kwargs.pop("return_tensors", None) + return_tensors = output_kwargs["text_kwargs"].get("return_tensors", None) if return_tensors != "pt": raise ValueError(f"{self.__class__.__name__} only supports `return_tensors='pt'`.") diff --git a/src/transformers/models/kosmos2/processing_kosmos2.py b/src/transformers/models/kosmos2/processing_kosmos2.py index 152e73e04cdd..423a395d74a0 100644 --- a/src/transformers/models/kosmos2/processing_kosmos2.py +++ b/src/transformers/models/kosmos2/processing_kosmos2.py @@ -17,7 +17,7 @@ import copy import math import re -from typing import Any, Optional, Union +from typing import Optional, Union from ...image_processing_utils import BatchFeature from ...image_utils import ImageInput @@ -33,9 +33,11 @@ list[list[tuple[float, float, float]]], ] +NestedList = list[Union[Optional[int], "NestedList"]] + class Kosmos2ImagesKwargs(ImagesKwargs, total=False): - bboxes: Optional[list[Any]] + bboxes: Optional[NestedList] num_image_tokens: Optional[int] first_image_token_id: Optional[int] diff --git a/src/transformers/models/kosmos2_5/processing_kosmos2_5.py b/src/transformers/models/kosmos2_5/processing_kosmos2_5.py index 0e3c70c80234..1bc516038b9e 100644 --- a/src/transformers/models/kosmos2_5/processing_kosmos2_5.py +++ b/src/transformers/models/kosmos2_5/processing_kosmos2_5.py @@ -43,12 +43,13 @@ class Kosmos2_5ProcessorKwargs(ProcessingKwargs, total=False): "return_token_type_ids": False, "stride": 0, "truncation": True, + "return_tensors": "pt", }, "images_kwargs": { "max_patches": 4096, "num_image_tokens": 2048, + "return_tensors": "pt", }, - "common_kwargs": {"return_tensors": "pt"}, } diff --git 
a/src/transformers/models/owlv2/processing_owlv2.py b/src/transformers/models/owlv2/processing_owlv2.py index 160daa4f5ae2..57da5ce03370 100644 --- a/src/transformers/models/owlv2/processing_owlv2.py +++ b/src/transformers/models/owlv2/processing_owlv2.py @@ -30,7 +30,7 @@ Unpack, ) from ...tokenization_utils_base import PreTokenizedInput, TextInput -from ...utils import TensorType, is_torch_available +from ...utils import TensorType, is_flax_available, is_tf_available, is_torch_available if TYPE_CHECKING: @@ -125,7 +125,7 @@ def __call__( **kwargs, ) query_images = output_kwargs["images_kwargs"].pop("query_images", None) - return_tensors = output_kwargs["text_kwargs"].get("return_tensors", None) + return_tensors = output_kwargs["text_kwargs"]["return_tensors"] if text is None and query_images is None and images is None: raise ValueError( @@ -157,12 +157,24 @@ def __call__( input_ids = np.concatenate([encoding["input_ids"] for encoding in encodings], axis=0) attention_mask = np.concatenate([encoding["attention_mask"] for encoding in encodings], axis=0) + elif return_tensors == "jax" and is_flax_available(): + import jax.numpy as jnp + + input_ids = jnp.concatenate([encoding["input_ids"] for encoding in encodings], axis=0) + attention_mask = jnp.concatenate([encoding["attention_mask"] for encoding in encodings], axis=0) + elif return_tensors == "pt" and is_torch_available(): import torch input_ids = torch.cat([encoding["input_ids"] for encoding in encodings], dim=0) attention_mask = torch.cat([encoding["attention_mask"] for encoding in encodings], dim=0) + elif return_tensors == "tf" and is_tf_available(): + import tensorflow as tf + + input_ids = tf.stack([encoding["input_ids"] for encoding in encodings], axis=0) + attention_mask = tf.stack([encoding["attention_mask"] for encoding in encodings], axis=0) + else: raise ValueError("Target return tensor type could not be returned") diff --git a/src/transformers/models/owlvit/processing_owlvit.py 
b/src/transformers/models/owlvit/processing_owlvit.py index 0e0c59d555f2..c62d53c77f9e 100644 --- a/src/transformers/models/owlvit/processing_owlvit.py +++ b/src/transformers/models/owlvit/processing_owlvit.py @@ -135,7 +135,7 @@ def __call__( **kwargs, ) query_images = output_kwargs["images_kwargs"].pop("query_images", None) - return_tensors = output_kwargs["common_kwargs"]["return_tensors"] + return_tensors = output_kwargs["text_kwargs"]["return_tensors"] if text is None and query_images is None and images is None: raise ValueError( diff --git a/src/transformers/models/sam/processing_sam.py b/src/transformers/models/sam/processing_sam.py index 603adde95040..2552a4c66460 100644 --- a/src/transformers/models/sam/processing_sam.py +++ b/src/transformers/models/sam/processing_sam.py @@ -117,7 +117,7 @@ def __call__( input_points=input_points, input_labels=input_labels, input_boxes=input_boxes, - return_tensors=output_kwargs["common_kwargs"].get("return_tensors"), + return_tensors=output_kwargs["images_kwargs"].get("return_tensors"), point_pad_value=point_pad_value, ) diff --git a/src/transformers/models/sam_hq/processing_samhq.py b/src/transformers/models/sam_hq/processing_samhq.py index 49681c7c6a26..1799a07201e7 100644 --- a/src/transformers/models/sam_hq/processing_samhq.py +++ b/src/transformers/models/sam_hq/processing_samhq.py @@ -118,7 +118,7 @@ def __call__( input_points=input_points, input_labels=input_labels, input_boxes=input_boxes, - return_tensors=output_kwargs["common_kwargs"].get("return_tensors"), + return_tensors=output_kwargs["images_kwargs"].get("return_tensors"), point_pad_value=output_kwargs["images_kwargs"].get("point_pad_value"), ) diff --git a/src/transformers/utils/type_validators.py b/src/transformers/utils/type_validators.py index f35c3fe5c213..5fa0c719f82b 100644 --- a/src/transformers/utils/type_validators.py +++ b/src/transformers/utils/type_validators.py @@ -1,6 +1,6 @@ from collections.abc import Iterable from dataclasses import 
MISSING, field, make_dataclass -from typing import Annotated, Any, Optional, TypedDict, Union, get_args, get_origin, get_type_hints +from typing import Annotated, Any, ForwardRef, Optional, TypedDict, Union, get_args, get_origin from huggingface_hub.dataclasses import as_validated_field, strict @@ -21,6 +21,24 @@ def unpack_annotated_type(type): return type, field(default=MISSING) +def get_type_hints_from_typed_dict(obj: type[TypedDict]): + """ + Same as `typing.get_type_hints` but does not perform evaluation + on the ForwardRefs. Evaluating might fail if the package is not imported + or installed, therefore we will have our own "guarded" type validations. + All `ForwardRef` will be ignored by the hub validator + """ + raw_annots = obj.__dict__.get("__annotations__", {}) + type_hints = {} + for name, value in raw_annots.items(): + if value is None: + value = type(None) + if isinstance(value, str): + value = ForwardRef(value, is_argument=False) + type_hints[name] = value + return type_hints + + # Minimalistic version of pydantic.TypeAdapter tailored for `TypedDict` class TypedDictAdapter: """ @@ -75,7 +93,7 @@ class TokenizerKwargs(TypedDict, total=False): # The dataclass can also be used as a simple config class for easier kwarg management dataclass = dataclass_from_typed_dict(TokenizerKwargs) """ - hints = get_type_hints(self.type, globalns=self.globalns, localns=self.localns, include_extras=True) + hints = get_type_hints_from_typed_dict(self.type) fields = [(k, *unpack_annotated_type(v)) for k, v in hints.items()] self.fields = fields return make_dataclass(self.type.__name__ + "Config", fields) @@ -133,8 +151,8 @@ def device_validator(value: Optional[Union[str, int]] = None): raise ValueError( f"If device is an integer, the value must be a strictly positive integer but got device={value}" ) - elif isinstance(value, str) or value.split(":")[0] not in possible_names: - raise ValueError(f"If device is an integer, the value must be one of {possible_names} but got 
device={value}") + elif isinstance(value, str) and value.split(":")[0] not in possible_names: + raise ValueError(f"If device is a string, the value must be one of {possible_names} but got device={value}") elif not isinstance(value, (int, str)): raise ValueError( f"Device must be either an integer device ID or a string (e.g., 'cpu', 'cuda:0'), but got device={value}" ) @@ -149,7 +167,7 @@ def resampling_validator(value: Optional[Union[int, "PILImageResampling"]] = Non raise ValueError( f"The resampling should be one of {list(range(6))} when provided as integer, but got resampling={value}" ) - elif isinstance(value, (PILImageResampling, int)): + elif is_vision_available() and not isinstance(value, (PILImageResampling, int)): raise ValueError(f"The resampling should an integer or `PIL.Image.Resampling`, but got resampling={value}") diff --git a/src/transformers/video_processing_utils.py b/src/transformers/video_processing_utils.py index 562a5de65718..e403d3d90774 100644 --- a/src/transformers/video_processing_utils.py +++ b/src/transformers/video_processing_utils.py @@ -52,6 +52,7 @@ ) from .utils.hub import cached_files from .utils.import_utils import requires +from .utils.type_validators import TypedDictAdapter from .video_utils import ( VideoInput, VideoMetadata, @@ -364,6 +365,11 @@ def preprocess( captured_kwargs=kwargs.keys(), valid_processor_keys=list(self.valid_kwargs.__annotations__.keys()) + ["return_tensors"], ) + + # Perform type validation on received kwargs + type_validator = TypedDictAdapter(self.valid_kwargs) + type_validator.validate_fields(**kwargs) + + # Set default kwargs from self. This ensures that if a kwarg is not provided # by the user, it gets its default value from the instance, or is set to None. 
for kwarg_name in self.valid_kwargs.__annotations__: diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index 8eb30d787c01..862961f7b299 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -839,7 +839,7 @@ def test_structured_kwargs_nested_video(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "videos_kwargs": {"do_rescale": True, "rescale_factor": -1, "do_sample_frames": False}, + "videos_kwargs": {"do_rescale": True, "rescale_factor": -1.0, "do_sample_frames": False}, "text_kwargs": {"padding": "max_length", "max_length": 176}, } @@ -862,7 +862,7 @@ def test_structured_kwargs_nested_from_dict_video(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "videos_kwargs": {"do_rescale": True, "rescale_factor": -1, "do_sample_frames": False}, + "videos_kwargs": {"do_rescale": True, "rescale_factor": -1.0, "do_sample_frames": False}, "text_kwargs": {"padding": "max_length", "max_length": 176}, } From c203ffd81c1a67dd1015d9f86105a97519ea0380 Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 11 Sep 2025 13:54:37 +0200 Subject: [PATCH 06/28] common-kwargs --- src/transformers/models/dia/processing_dia.py | 5 +++-- src/transformers/models/glm4v/processing_glm4v.py | 2 +- .../models/qwen2_5_vl/modular_qwen2_5_vl.py | 2 +- .../models/qwen2_5_vl/processing_qwen2_5_vl.py | 2 +- src/transformers/processing_utils.py | 11 ++++------- 5 files changed, 10 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/dia/processing_dia.py b/src/transformers/models/dia/processing_dia.py index e435ba23cc4a..391914e198a4 100644 --- a/src/transformers/models/dia/processing_dia.py +++ b/src/transformers/models/dia/processing_dia.py @@ -46,7 +46,6 @@ class DiaProcessorKwargs(ProcessingKwargs, total=False): "padding": True, "padding_side": "right", "add_special_tokens": False, - "return_tensors": "pt", }, 
"audio_kwargs": { "eos_token_id": 1024, @@ -55,6 +54,8 @@ class DiaProcessorKwargs(ProcessingKwargs, total=False): "delay_pattern": [0, 8, 9, 10, 11, 12, 13, 14, 15], "generation": True, "sampling_rate": 44100, + }, + "common_kwargs": { "return_tensors": "pt", }, } @@ -112,7 +113,7 @@ def __call__( text_kwargs = output_kwargs["text_kwargs"] audio_kwargs = output_kwargs["audio_kwargs"] - return_tensors = output_kwargs["text_kwargs"].get("return_tensors", None) + return_tensors = output_kwargs["text_kwargs"]["return_tensors"] if return_tensors != "pt": raise ValueError(f"{self.__class__.__name__} only supports `return_tensors='pt'`.") diff --git a/src/transformers/models/glm4v/processing_glm4v.py b/src/transformers/models/glm4v/processing_glm4v.py index e0a005b1ad1d..8a6d0c25f2fb 100644 --- a/src/transformers/models/glm4v/processing_glm4v.py +++ b/src/transformers/models/glm4v/processing_glm4v.py @@ -34,7 +34,7 @@ class Glm4vVideosProcessorKwargs(VideosKwargs, total=False): - fps: Optional[Union[list[float], float]] + fps: Optional[Union[list[float | int] | float | int]] class Glm4vImagesKwargs(ImagesKwargs): diff --git a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py index 8ad5b7d9ec53..9ee6b57b1213 100644 --- a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py @@ -840,7 +840,7 @@ def prepare_inputs_for_generation( class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False): - fps: Optional[Union[list[float], float]] + fps: Optional[Union[list[float | int] | float | int]] class Qwen2_5_VLImagesKwargs(Qwen2VLImagesKwargs): diff --git a/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py index 7be98e7e5023..f7f58fef38f6 100644 --- a/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py 
@@ -36,7 +36,7 @@ class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False): - fps: Optional[Union[list[float], float]] + fps: Optional[Union[list[float | int] | float | int]] class Qwen2_5_VLImagesKwargs(ImagesKwargs): diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 4879c2433222..b6053337fcf4 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -1332,20 +1332,17 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg f"Keyword argument `{key}` is not a valid argument for this processor and will be ignored." ) - # BC for `common_kwargs` to update all modality-specific kwargs + # For `common_kwargs` just update all modality-specific kwargs with same key/values common_kwargs = kwargs.get("common_kwargs", {}) + ModelProcessorKwargs._defaults["common_kwargs"] + common_kwargs.update(ModelProcessorKwargs._defaults.get("common_kwargs", {})) if common_kwargs: for kwarg in output_kwargs.values(): kwarg.update(common_kwargs) # Finally perform type validation on collected kwargs - # NOTE: When we inherit from BaseTypedDict, the bases won't be in MRO of ModelTypedDict - # That causes errors if certain type annotations are not defined/imported in model processor - # file. 
So we will pass globalns of `processing_utils.py` manually to bypass it - base_globalns = getattr(sys.modules.get(ProcessingKwargs.__module__, None), "__dict__", {}) for key, typed_dict_obj in ModelProcessorKwargs.__annotations__.items(): - child_localns = getattr(sys.modules.get(typed_dict_obj.__module__, None), "__dict__", {}) - type_validator = TypedDictAdapter(typed_dict_obj, globalns=base_globalns, localns=child_localns) + type_validator = TypedDictAdapter(typed_dict_obj) type_validator.validate_fields(**output_kwargs[key]) return output_kwargs From 725a479e3267251720e55db37482c9d3236adb63 Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 11 Sep 2025 13:59:07 +0200 Subject: [PATCH 07/28] why the runner complains about typing with "|"? --- src/transformers/models/glm4v/processing_glm4v.py | 2 +- src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py | 2 +- src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/glm4v/processing_glm4v.py b/src/transformers/models/glm4v/processing_glm4v.py index 8a6d0c25f2fb..6a44e04f0d0e 100644 --- a/src/transformers/models/glm4v/processing_glm4v.py +++ b/src/transformers/models/glm4v/processing_glm4v.py @@ -34,7 +34,7 @@ class Glm4vVideosProcessorKwargs(VideosKwargs, total=False): - fps: Optional[Union[list[float | int] | float | int]] + fps: Optional[Union[list[Union[float, int]], float, int]] class Glm4vImagesKwargs(ImagesKwargs): diff --git a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py index 9ee6b57b1213..04cfe85a2026 100644 --- a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py @@ -840,7 +840,7 @@ def prepare_inputs_for_generation( class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False): - fps: Optional[Union[list[float | int] | float | int]] + fps: 
Optional[Union[list[Union[float, int]], float, int]] class Qwen2_5_VLImagesKwargs(Qwen2VLImagesKwargs): diff --git a/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py index f7f58fef38f6..de859970976e 100644 --- a/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py @@ -36,7 +36,7 @@ class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False): - fps: Optional[Union[list[float | int] | float | int]] + fps: Optional[Union[list[Union[float, int]], float, int]] class Qwen2_5_VLImagesKwargs(ImagesKwargs): From d8ca68348040c8c03cf0f15716e9b701f96949e3 Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 11 Sep 2025 14:00:26 +0200 Subject: [PATCH 08/28] revert --- src/transformers/models/kosmos2_5/processing_kosmos2_5.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/models/kosmos2_5/processing_kosmos2_5.py b/src/transformers/models/kosmos2_5/processing_kosmos2_5.py index 1bc516038b9e..0e3c70c80234 100644 --- a/src/transformers/models/kosmos2_5/processing_kosmos2_5.py +++ b/src/transformers/models/kosmos2_5/processing_kosmos2_5.py @@ -43,13 +43,12 @@ class Kosmos2_5ProcessorKwargs(ProcessingKwargs, total=False): "return_token_type_ids": False, "stride": 0, "truncation": True, - "return_tensors": "pt", }, "images_kwargs": { "max_patches": 4096, "num_image_tokens": 2048, - "return_tensors": "pt", }, + "common_kwargs": {"return_tensors": "pt"}, } From 8ff15f771e3c60f78f2dd4a0bb3daec15c4dad16 Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 11 Sep 2025 14:36:42 +0200 Subject: [PATCH 09/28] forgot to delete --- src/transformers/processing_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index b6053337fcf4..51c1a51212ca 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py 
@@ -1334,7 +1334,6 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg # For `common_kwargs` just update all modality-specific kwargs with same key/values common_kwargs = kwargs.get("common_kwargs", {}) - ModelProcessorKwargs._defaults["common_kwargs"] common_kwargs.update(ModelProcessorKwargs._defaults.get("common_kwargs", {})) if common_kwargs: for kwarg in output_kwargs.values(): From b0e8120ee30c8b314bbe8a645928527bd5b461ae Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 11 Sep 2025 16:04:13 +0200 Subject: [PATCH 10/28] update --- src/transformers/models/csm/processing_csm.py | 4 +- src/transformers/processing_utils.py | 7 +-- src/transformers/utils/type_validators.py | 47 ++++++++++++------- src/transformers/video_utils.py | 7 ++- 4 files changed, 39 insertions(+), 26 deletions(-) diff --git a/src/transformers/models/csm/processing_csm.py b/src/transformers/models/csm/processing_csm.py index 0f929f6a2a0c..d0cad3fdfb8e 100644 --- a/src/transformers/models/csm/processing_csm.py +++ b/src/transformers/models/csm/processing_csm.py @@ -249,9 +249,7 @@ def __call__( text_kwargs = output_kwargs["text_kwargs"] audio_kwargs = output_kwargs["audio_kwargs"] - common_kwargs = output_kwargs["common_kwargs"] - - return_tensors = common_kwargs.pop("return_tensors", None) + return_tensors = text_kwargs.get("return_tensors", None) if return_tensors != "pt": raise ValueError(f"{self.__class__.__name__} only supports `return_tensors='pt'`.") diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 51c1a51212ca..f80025ca2d73 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -24,7 +24,6 @@ import sys import typing import warnings -from collections.abc import Iterable from dataclasses import dataclass from pathlib import Path from typing import Annotated, Any, Literal, Optional, TypedDict, TypeVar, Union @@ -50,7 +49,7 @@ truncation_validator, 
video_metadata_validator, ) -from .video_utils import VideoInput, VideoMetadata +from .video_utils import VideoInput, VideoMetadataType if is_vision_available(): @@ -301,9 +300,7 @@ class VideosKwargs(TypedDict, total=False): input_data_format: Optional[Union[str, ChannelDimension]] device: Annotated[Optional[str], device_validator()] do_sample_frames: Optional[bool] - video_metadata: Annotated[ - Optional[Union[VideoMetadata, dict, Iterable[VideoMetadata, dict]]], video_metadata_validator() - ] + video_metadata: Annotated[Optional[VideoMetadataType], video_metadata_validator()] fps: Annotated[Optional[Union[int, float]], positive_any_number()] num_frames: Annotated[Optional[int], positive_int()] return_metadata: Optional[bool] diff --git a/src/transformers/utils/type_validators.py b/src/transformers/utils/type_validators.py index 5fa0c719f82b..54a014a99b20 100644 --- a/src/transformers/utils/type_validators.py +++ b/src/transformers/utils/type_validators.py @@ -1,11 +1,11 @@ -from collections.abc import Iterable +from collections.abc import Sequence from dataclasses import MISSING, field, make_dataclass from typing import Annotated, Any, ForwardRef, Optional, TypedDict, Union, get_args, get_origin from huggingface_hub.dataclasses import as_validated_field, strict from ..tokenization_utils_base import PaddingStrategy, TruncationStrategy -from ..video_utils import VideoMetadata +from ..video_utils import VideoMetadataType from .generic import TensorType from .import_utils import is_vision_available @@ -172,22 +172,35 @@ def resampling_validator(value: Optional[Union[int, "PILImageResampling"]] = Non @as_validated_field -def video_metadata_validator(value: Optional[Union[VideoMetadata, dict, Iterable[VideoMetadata, dict]]] = None): - possible_keys = ["total_num_frames", "fps", "width", "height", "duration", "video_backend", "frames_indices"] +def video_metadata_validator(value: Optional[VideoMetadataType] = None): if value is None: - pass - elif isinstance(value, 
Iterable) and not all(isinstance(item, (VideoMetadata, dict)) for item in value): - raise ValueError( - f"If `video_metadata` is a list, each item in the list should be either a dict or a `VideoMetadata` object but got video_metadata={value}" - ) - elif isinstance(value, dict) and not all(key in possible_keys for key in value.keys()): - raise ValueError( - f"If video_metadata is a dict, the keys should be one of {possible_keys} but got device={value.keys()}" - ) - elif not isinstance(value, (VideoMetadata, dict, Iterable)): - raise ValueError( - f"Video metadata must be either a dict, a VideoMetadata or a batched list of metadata, but got device={value}" - ) + return + + valid_keys = ["total_num_frames", "fps", "width", "height", "duration", "video_backend", "frames_indices"] + + def check_dict_keys(d: dict) -> bool: + return all(key in valid_keys for key in d.keys()) + + if isinstance(value, Sequence) and isinstance(value[0], Sequence) and isinstance(value[0][0], dict): + for sublist in value: + for item in sublist: + if not check_dict_keys(item): + raise ValueError( + f"Invalid keys found in video metadata. Valid keys: {valid_keys} got: {list(item.keys())}" + ) + + elif isinstance(value, Sequence) and isinstance(value[0], dict): + for item in value: + if not check_dict_keys(item): + raise ValueError( + f"Invalid keys found in video metadata. Valid keys: {valid_keys} got: {list(item.keys())}" + ) + + elif isinstance(value, dict): + if not check_dict_keys(value): + raise ValueError( + f"Invalid keys found in video metadata. 
Valid keys: {valid_keys}, got: {list(value.keys())}" + ) @as_validated_field diff --git a/src/transformers/video_utils.py b/src/transformers/video_utils.py index 1749b0b3b1c5..cfe89c94e65e 100644 --- a/src/transformers/video_utils.py +++ b/src/transformers/video_utils.py @@ -112,6 +112,11 @@ def update(self, dictionary): setattr(self, key, value) +VideoMetadataType = Union[ + VideoMetadata, dict, list[Union[dict, VideoMetadata]], list[list[Union[dict, VideoMetadata]]] +] + + def is_valid_video_frame(frame): return isinstance(frame, PIL.Image.Image) or ( (is_numpy_array(frame) or is_torch_tensor(frame)) and frame.ndim == 3 @@ -215,7 +220,7 @@ def make_batched_videos(videos) -> list[Union[np.ndarray, "torch.Tensor", "URL", return flat_videos_list -def make_batched_metadata(videos: VideoInput, video_metadata: Union[VideoMetadata, dict]): +def make_batched_metadata(videos: VideoInput, video_metadata: VideoMetadataType) -> list[VideoMetadata]: if video_metadata is None: # Create default metadata and fill attributes we can infer from given video video_metadata = [ From 9f761c6421f92986d5188807b0f4003602303d71 Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 11 Sep 2025 18:17:05 +0200 Subject: [PATCH 11/28] fix last issues --- .../models/qwen2_5_omni/processing_qwen2_5_omni.py | 8 ++++---- tests/test_video_processing_common.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py index 45d8cacddeb2..49bf64dcad2d 100644 --- a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py @@ -31,10 +31,10 @@ class Qwen2_5_OmniVideosKwargs(VideosKwargs): - fps: Optional[list[Union[int, float]]] = None - use_audio_in_video: Optional[bool] = None - seconds_per_chunk: Optional[float] = None - position_id_per_seconds: Optional[int] = None + fps: 
Optional[Union[list[Union[int, float]], int, float]] + use_audio_in_video: Optional[bool] + seconds_per_chunk: Optional[float] + position_id_per_seconds: Optional[int] min_pixels: Optional[int] max_pixels: Optional[int] patch_size: Optional[int] diff --git a/tests/test_video_processing_common.py b/tests/test_video_processing_common.py index 3d0477ee05d5..67a31cf8d20e 100644 --- a/tests/test_video_processing_common.py +++ b/tests/test_video_processing_common.py @@ -398,8 +398,8 @@ def test_call_numpy_4_channels(self): video_inputs[0], return_tensors="pt", input_data_format="channels_last", - image_mean=0, - image_std=1, + image_mean=0.0, + image_std=1.0, )[self.input_name] expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]]) if video_processor.do_convert_rgb: @@ -412,8 +412,8 @@ def test_call_numpy_4_channels(self): video_inputs, return_tensors="pt", input_data_format="channels_last", - image_mean=0, - image_std=1, + image_mean=0.0, + image_std=1.0, )[self.input_name] expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs) if video_processor.do_convert_rgb: From f935cffdc74557504abfe3d3b94b86c84f934bc5 Mon Sep 17 00:00:00 2001 From: raushan Date: Tue, 16 Sep 2025 17:36:24 +0200 Subject: [PATCH 12/28] add more detalis in docs --- src/transformers/utils/type_validators.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/transformers/utils/type_validators.py b/src/transformers/utils/type_validators.py index 54a014a99b20..ee6ff2be956f 100644 --- a/src/transformers/utils/type_validators.py +++ b/src/transformers/utils/type_validators.py @@ -1,6 +1,6 @@ from collections.abc import Sequence from dataclasses import MISSING, field, make_dataclass -from typing import Annotated, Any, ForwardRef, Optional, TypedDict, Union, get_args, get_origin +from typing import Annotated, ForwardRef, Optional, TypedDict, Union, get_args, get_origin from 
huggingface_hub.dataclasses import as_validated_field, strict @@ -45,6 +45,11 @@ class TypedDictAdapter: A utility class used to convert a TypedDict object to dataclass and attach a hub validator on top based on TypedDict annotations. + We don't want to replace `TypedDict` by dataclasses in the codebase because + with dataclasses we will lose typing hints that `Unpack[TypedDict]` gives. + So this utility is a sweet spot to keep the balance between DevX and strong + typing validation. + Args: type: The TypedDict object that needs to be validated. @@ -52,12 +57,8 @@ class TypedDictAdapter: def __init__( self, type: type[TypedDict], - globalns: Optional[dict[str, Any]] = None, - localns: Optional[dict[str, Any]] = None, ): self.type = type - self.globalns = globalns - self.localns = localns self.dataclass = self.create_dataclass() self.dataclass = strict(self.dataclass) From e6a77d8629458b30f957fe803b14f257014ada93 Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 24 Sep 2025 16:58:22 +0200 Subject: [PATCH 13/28] pin the latest hub release --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b4feedbc77a1..b293479cba28 100644 --- a/setup.py +++ b/setup.py @@ -115,7 +115,7 @@ "GitPython<3.1.19", "hf-doc-builder>=0.3.0", "hf_xet", - "huggingface-hub>=0.34.0,<1.0", + "huggingface-hub>=0.35.1,<1.0", "importlib_metadata", "ipadic>=1.0.0,<2.0", "jax>=0.4.1,<=0.4.13", From 5a4263030b8970ac5b0a3c260658cac0b3f1cc39 Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 24 Sep 2025 18:19:46 +0200 Subject: [PATCH 14/28] fix tests for new models --- src/transformers/dependency_versions_table.py | 2 +- src/transformers/models/lfm2_vl/processing_lfm2_vl.py | 8 ++++++-- .../models/qwen2_5_omni/processing_qwen2_5_omni.py | 2 +- .../models/qwen3_omni_moe/processing_qwen3_omni_moe.py | 4 ++-- tests/models/lfm2_vl/test_processing_lfm2_vl.py | 2 +- 5 files changed, 11 insertions(+), 7 deletions(-) diff --git 
a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index bd1a34ee747f..1f977cbe267e 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -23,7 +23,7 @@ "GitPython": "GitPython<3.1.19", "hf-doc-builder": "hf-doc-builder>=0.3.0", "hf_xet": "hf_xet", - "huggingface-hub": "huggingface-hub>=0.34.0,<1.0", + "huggingface-hub": "huggingface-hub>=0.35.1,<1.0", "importlib_metadata": "importlib_metadata", "ipadic": "ipadic>=1.0.0,<2.0", "jinja2": "jinja2>=3.1.0", diff --git a/src/transformers/models/lfm2_vl/processing_lfm2_vl.py b/src/transformers/models/lfm2_vl/processing_lfm2_vl.py index 12f289c266a1..32965e6fc7ac 100755 --- a/src/transformers/models/lfm2_vl/processing_lfm2_vl.py +++ b/src/transformers/models/lfm2_vl/processing_lfm2_vl.py @@ -21,6 +21,7 @@ ImagesKwargs, ProcessingKwargs, ProcessorMixin, + TextKwargs, Unpack, ) from ...tokenization_utils_base import BatchEncoding, TextInput @@ -46,8 +47,13 @@ class Lfm2VlImagesKwargs(ImagesKwargs, total=False): return_row_col_info: Optional[bool] +class Lfm2VlTextKwargs(TextKwargs, total=False): + use_image_special_tokens: Optional[bool] + + class Lfm2VlProcessorKwargs(ProcessingKwargs, total=False): images_kwargs: Lfm2VlImagesKwargs + text_kwargs: Lfm2VlTextKwargs _defaults = { "images_kwargs": { @@ -88,12 +94,10 @@ def __init__( image_processor, tokenizer, chat_template: Optional[str] = None, - use_image_special_tokens: Optional[bool] = True, **kwargs, ): self.image_token = tokenizer.image_token self.image_token_id = tokenizer.image_token_id - self.use_image_special_tokens = use_image_special_tokens self.image_start_token = tokenizer.image_start_token self.image_end_token = tokenizer.image_end_token self.image_thumbnail_token = tokenizer.image_thumbnail diff --git a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py index 
4e5cbf484c34..2ee7b7b43c2b 100644 --- a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py @@ -34,7 +34,7 @@ class Qwen2_5_OmniVideosKwargs(VideosKwargs): fps: Optional[Union[list[Union[int, float]], int, float]] use_audio_in_video: Optional[bool] seconds_per_chunk: Optional[float] - position_id_per_seconds: Optional[int] + position_id_per_seconds: Optional[Union[int, float]] min_pixels: Optional[int] max_pixels: Optional[int] patch_size: Optional[int] diff --git a/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py index 86041fc3de16..c77d9c75ae5f 100644 --- a/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py @@ -33,10 +33,10 @@ class Qwen3OmniMoeVideosKwargs(VideosKwargs): - fps: Optional[list[Union[int, float]]] + fps: Optional[Union[list[Union[int, float]], int, float]] use_audio_in_video: Optional[bool] seconds_per_chunk: Optional[float] - position_id_per_seconds: Optional[int] + position_id_per_seconds: Optional[Union[int, float]] min_pixels: Optional[int] max_pixels: Optional[int] patch_size: Optional[int] diff --git a/tests/models/lfm2_vl/test_processing_lfm2_vl.py b/tests/models/lfm2_vl/test_processing_lfm2_vl.py index f2c33e40e3f6..d1f7669bdddd 100755 --- a/tests/models/lfm2_vl/test_processing_lfm2_vl.py +++ b/tests/models/lfm2_vl/test_processing_lfm2_vl.py @@ -100,7 +100,7 @@ def prepare_processor_dict(): "{{'<|im_start|>assistant\n' }}" "{% endif %}" ) - return {"chat_template": chat_template, "use_image_special_tokens": True} + return {"chat_template": chat_template} # Override as Lfm2VL needs images/video to be an explicitly nested batch def prepare_image_inputs(self, batch_size=None): From fe4ba565c4e38dafd36d289204571a25cee382a2 Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 24 Sep 2025 
18:37:53 +0200 Subject: [PATCH 15/28] also fast image processor --- src/transformers/image_processing_utils_fast.py | 14 ++++++++++---- .../image_processing_deepseek_vl_fast.py | 2 +- .../image_processing_deepseek_vl_hybrid_fast.py | 10 +++++----- .../modular_deepseek_vl_hybrid.py | 10 +++++----- .../image_processing_efficientnet_fast.py | 4 ++-- .../models/eomt/image_processing_eomt_fast.py | 16 ++++++---------- .../models/janus/image_processing_janus_fast.py | 2 +- .../image_processing_llava_onevision_fast.py | 5 ++--- .../llava_onevision/modular_llava_onevision.py | 5 ++--- src/transformers/processing_utils.py | 16 ++++++++-------- 10 files changed, 42 insertions(+), 42 deletions(-) diff --git a/src/transformers/image_processing_utils_fast.py b/src/transformers/image_processing_utils_fast.py index 75210680b57a..0607d4fac550 100644 --- a/src/transformers/image_processing_utils_fast.py +++ b/src/transformers/image_processing_utils_fast.py @@ -51,6 +51,7 @@ logging, ) from .utils.import_utils import is_rocm_platform +from .utils.type_validators import TypedDictAdapter if is_vision_available(): @@ -169,21 +170,21 @@ class DefaultFastImageProcessorKwargs(TypedDict, total=False): do_resize: Optional[bool] size: Optional[dict[str, int]] default_to_square: Optional[bool] - resample: Optional[Union["PILImageResampling", "F.InterpolationMode"]] + resample: Optional[Union["PILImageResampling", "F.InterpolationMode", int]] do_center_crop: Optional[bool] crop_size: Optional[dict[str, int]] do_rescale: Optional[bool] rescale_factor: Optional[Union[int, float]] do_normalize: Optional[bool] - image_mean: Optional[Union[float, list[float]]] - image_std: Optional[Union[float, list[float]]] + image_mean: Optional[Union[float, list[float], tuple[float, float, float]]] + image_std: Optional[Union[float, list[float], tuple[float, float, float]]] do_pad: Optional[bool] pad_size: Optional[dict[str, int]] do_convert_rgb: Optional[bool] return_tensors: Optional[Union[str, TensorType]] 
data_format: Optional[ChannelDimension] input_data_format: Optional[Union[str, ChannelDimension]] - device: Optional["torch.device"] + device: Optional[Union[str, "torch.device"]] disable_grouping: Optional[bool] @@ -737,6 +738,11 @@ def __call__(self, images: ImageInput, *args, **kwargs: Unpack[DefaultFastImageP def preprocess(self, images: ImageInput, *args, **kwargs: Unpack[DefaultFastImageProcessorKwargs]) -> BatchFeature: # args are not validated, but their order in the `preprocess` and `_preprocess` signatures must be the same validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_kwargs_names) + + # Perform type validation on received kwargs + type_validator = TypedDictAdapter(self.valid_kwargs) + type_validator.validate_fields(**kwargs) + # Set default kwargs from self. This ensures that if a kwarg is not provided # by the user, it gets its default value from the instance, or is set to None. for kwarg_name in self._valid_kwargs_names: diff --git a/src/transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py b/src/transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py index 896e91f0692c..87ccf3b4a6fe 100644 --- a/src/transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +++ b/src/transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py @@ -42,7 +42,7 @@ class DeepseekVLFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): falls below this value after resizing. 
""" - min_size: int + min_size: Optional[int] @auto_docstring diff --git a/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py b/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py index db9c9ad987c1..82b6357f24f4 100644 --- a/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +++ b/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py @@ -67,11 +67,11 @@ class DeepseekVLHybridFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method. """ - min_size: int - high_res_size: dict - high_res_resample: "PILImageResampling" - high_res_image_mean: list[float] - high_res_image_std: list[float] + min_size: Optional[int] + high_res_size: Optional[dict[str, int]] + high_res_resample: Optional[Union["PILImageResampling", "F.InterpolationMode", int]] + high_res_image_mean: Optional[Union[float, list[float], tuple[float, float, float]]] + high_res_image_std: Optional[Union[float, list[float], tuple[float, float, float]]] @auto_docstring diff --git a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py index e9808b02ce34..c0a45431c5e3 100644 --- a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py @@ -752,11 +752,11 @@ class DeepseekVLHybridFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method. 
""" - min_size: int - high_res_size: dict - high_res_resample: "PILImageResampling" - high_res_image_mean: list[float] - high_res_image_std: list[float] + min_size: Optional[int] + high_res_size: Optional[dict[str, int]] + high_res_resample: Optional[Union["PILImageResampling", "F.InterpolationMode", int]] + high_res_image_mean: Optional[Union[float, list[float], tuple[float, float, float]]] + high_res_image_std: Optional[Union[float, list[float], tuple[float, float, float]]] class DeepseekVLHybridImageProcessorFast(DeepseekVLImageProcessorFast): diff --git a/src/transformers/models/efficientnet/image_processing_efficientnet_fast.py b/src/transformers/models/efficientnet/image_processing_efficientnet_fast.py index 3544d927c146..e6d056481870 100644 --- a/src/transformers/models/efficientnet/image_processing_efficientnet_fast.py +++ b/src/transformers/models/efficientnet/image_processing_efficientnet_fast.py @@ -45,8 +45,8 @@ class EfficientNetFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): Normalize the image again with the standard deviation only for image classification if set to True. """ - rescale_offset: bool - include_top: bool + rescale_offset: Optional[bool] + include_top: Optional[bool] @auto_docstring diff --git a/src/transformers/models/eomt/image_processing_eomt_fast.py b/src/transformers/models/eomt/image_processing_eomt_fast.py index 97a13a0745eb..3e44a87bd1bf 100644 --- a/src/transformers/models/eomt/image_processing_eomt_fast.py +++ b/src/transformers/models/eomt/image_processing_eomt_fast.py @@ -59,19 +59,15 @@ class EomtImageProcessorFastKwargs(DefaultFastImageProcessorKwargs): """ do_split_image (`bool`, *optional*, defaults to `False`): - Whether to split the input images into overlapping patches for semantic segmentation. If set to `True`, the - input images will be split into patches of size `size["shortest_edge"]` with an overlap between patches. - Otherwise, the input images will be padded to the target size. 
- do_pad (`bool`, *optional*, defaults to `False`): - Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest - number of patches in the batch. Padding will be applied to the bottom and right with zeros. + Whether to split the input images into overlapping patches for semantic segmentation. If set to `True`, the + input images will be split into patches of size `size["shortest_edge"]` with an overlap between patches. + Otherwise, the input images will be padded to the target size. ignore_index (`int`, *optional*): - Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels - denoted with 0 (background) will be replaced with `ignore_index`. + Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels + denoted with 0 (background) will be replaced with `ignore_index`. """ - do_split_image: bool - do_pad: bool + do_split_image: Optional[bool] ignore_index: Optional[int] = None diff --git a/src/transformers/models/janus/image_processing_janus_fast.py b/src/transformers/models/janus/image_processing_janus_fast.py index 9ed2732fb3d0..881f38d6681a 100644 --- a/src/transformers/models/janus/image_processing_janus_fast.py +++ b/src/transformers/models/janus/image_processing_janus_fast.py @@ -53,7 +53,7 @@ class JanusFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): falls below this value after resizing. 
""" - min_size: int + min_size: Optional[int] @auto_docstring diff --git a/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py b/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py index 4392d64e9ebf..6dc0752d9f94 100644 --- a/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +++ b/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py @@ -92,8 +92,7 @@ def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaOnevisionFastImag batch_num_images = [1] * len(images) else: batch_num_images = [1] - kwargs["batch_num_images"] = batch_num_images - return super().preprocess(images, **kwargs) + return super().preprocess(images, batch_num_images, **kwargs) def _resize_for_patching( self, @@ -218,6 +217,7 @@ def _pad_for_batching( def _preprocess( self, images: list["torch.Tensor"], + batch_num_images: list[int], do_resize: bool, size: SizeDict, image_grid_pinpoints: list[list[int]], @@ -230,7 +230,6 @@ def _preprocess( image_mean: Optional[Union[float, list[float]]], image_std: Optional[Union[float, list[float]]], do_pad: bool, - batch_num_images: list[int], disable_grouping: Optional[bool], return_tensors: Optional[Union[str, TensorType]], **kwargs, diff --git a/src/transformers/models/llava_onevision/modular_llava_onevision.py b/src/transformers/models/llava_onevision/modular_llava_onevision.py index ec2304e09dd1..c664764bbaf9 100644 --- a/src/transformers/models/llava_onevision/modular_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modular_llava_onevision.py @@ -143,12 +143,12 @@ def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaOnevisionFastImag batch_num_images = [1] * len(images) else: batch_num_images = [1] - kwargs["batch_num_images"] = batch_num_images - return super().preprocess(images, **kwargs) + return super().preprocess(images, batch_num_images, **kwargs) def _preprocess( self, images: list["torch.Tensor"], + 
batch_num_images: list[int], do_resize: bool, size: SizeDict, image_grid_pinpoints: list[list[int]], @@ -161,7 +161,6 @@ def _preprocess( image_mean: Optional[Union[float, list[float]]], image_std: Optional[Union[float, list[float]]], do_pad: bool, - batch_num_images: list[int], disable_grouping: Optional[bool], return_tensors: Optional[Union[str, TensorType]], **kwargs, diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 949069f14c5c..89dea69b000d 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -193,9 +193,9 @@ class methods and docstrings. Scale factor to use if rescaling the image. do_normalize (`bool`, *optional*): Whether to normalize the image. - image_mean (`float` or `list[float]`, *optional*): + image_mean (`float` or `list[float] or tuple[float, float, float]`, *optional*): Mean to use if normalizing the image. - image_std (`float` or `list[float]`, *optional*): + image_std (`float` or `list[float] or tuple[float, float, float]`, *optional*): Standard deviation to use if normalizing the image. do_pad (`bool`, *optional*): Whether to pad the image to the `(max_height, max_width)` of the images in the batch. @@ -219,8 +219,8 @@ class methods and docstrings. do_rescale: Optional[bool] rescale_factor: Optional[float] do_normalize: Optional[bool] - image_mean: Optional[Union[float, list[float]]] - image_std: Optional[Union[float, list[float]]] + image_mean: Optional[Union[float, list[float], tuple[float, float, float]]] + image_std: Optional[Union[float, list[float], tuple[float, float, float]]] do_pad: Optional[bool] pad_size: Annotated[Optional[dict[str, int]], image_size_validator()] do_center_crop: Optional[bool] @@ -251,9 +251,9 @@ class VideosKwargs(TypedDict, total=False): Scale factor to use if rescaling the video. do_normalize (`bool`, *optional*): Whether to normalize the video. 
- image_mean (`float` or `list[float]`, *optional*): + image_mean (`float` or `list[float] or tuple[float, float, float]`, *optional*): Mean to use if normalizing the video. - image_std (`float` or `list[float]`, *optional*): + image_std (`float` or `list[float] or tuple[float, float, float]`, *optional*): Standard deviation to use if normalizing the video. do_center_crop (`bool`, *optional*): Whether to center crop the video. @@ -283,8 +283,8 @@ class VideosKwargs(TypedDict, total=False): do_rescale: Optional[bool] rescale_factor: Optional[float] do_normalize: Optional[bool] - image_mean: Optional[Union[float, list[float]]] - image_std: Optional[Union[float, list[float]]] + image_mean: Optional[Union[float, list[float], tuple[float, float, float]]] + image_std: Optional[Union[float, list[float], tuple[float, float, float]]] do_center_crop: Optional[bool] crop_size: Annotated[Optional[dict[str, int]], image_size_validator()] data_format: Optional[ChannelDimension] From 6e8d77e49354095a85a66d3914629c0f20892415 Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 24 Sep 2025 18:49:32 +0200 Subject: [PATCH 16/28] fix copies --- .../models/llava_onevision/modular_llava_onevision.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/llava_onevision/modular_llava_onevision.py b/src/transformers/models/llava_onevision/modular_llava_onevision.py index c664764bbaf9..afa633d8f61a 100644 --- a/src/transformers/models/llava_onevision/modular_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modular_llava_onevision.py @@ -34,7 +34,12 @@ from ...cache_utils import Cache from ...image_processing_utils import BatchFeature -from ...image_processing_utils_fast import DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images +from ...image_processing_utils_fast import ( + BaseImageProcessorFast, + DefaultFastImageProcessorKwargs, + group_images_by_shape, + reorder_images, +) from ...image_utils import ( 
OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, @@ -74,7 +79,7 @@ class LlavaOnevisionFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): image_grid_pinpoints: Optional[list[list[int]]] -class LlavaOnevisionImageProcessorFast(LlavaNextImageProcessorFast): +class LlavaOnevisionImageProcessorFast(LlavaNextImageProcessorFast, BaseImageProcessorFast): resample = PILImageResampling.BICUBIC image_mean = OPENAI_CLIP_MEAN image_std = OPENAI_CLIP_STD @@ -143,7 +148,7 @@ def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaOnevisionFastImag batch_num_images = [1] * len(images) else: batch_num_images = [1] - return super().preprocess(images, batch_num_images, **kwargs) + return BaseImageProcessorFast.preprocess(images, batch_num_images, **kwargs) def _preprocess( self, From ba419921889e0a3221c3ac3f9754597602051125 Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 25 Sep 2025 10:53:16 +0200 Subject: [PATCH 17/28] image processing ast validated --- .../image_processing_perception_lm_fast.py | 9 +++++--- .../pixtral/image_processing_pixtral_fast.py | 4 ++-- .../test_image_processing_oneformer.py | 1 - .../test_image_processing_vitmatte.py | 22 ++++++++++++------- tests/test_image_processing_common.py | 8 +++---- 5 files changed, 26 insertions(+), 18 deletions(-) diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py index c26132a48439..8642fe2e5f94 100644 --- a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py +++ b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py @@ -51,9 +51,9 @@ class PerceptionLMFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): Maximum number of tiles an image can be split into based on its aspect ratio. 
""" - vision_input_type: str = "thumb+tile" - tile_size: int = 448 - max_num_tiles: int = 36 + vision_input_type: Optional[str] + tile_size: Optional[int] + max_num_tiles: Optional[int] @auto_docstring @@ -66,6 +66,9 @@ class PerceptionLMImageProcessorFast(BaseImageProcessorFast): do_rescale = True do_normalize = True do_convert_rgb = True + vision_input_type = "thumb+tile" + tile_size = 448 + max_num_tiles = 36 size = {"width": 448, "height": 448} # for backward compatibility in tests valid_kwargs = PerceptionLMFastImageProcessorKwargs diff --git a/src/transformers/models/pixtral/image_processing_pixtral_fast.py b/src/transformers/models/pixtral/image_processing_pixtral_fast.py index db3e75760318..473ebefd02f4 100644 --- a/src/transformers/models/pixtral/image_processing_pixtral_fast.py +++ b/src/transformers/models/pixtral/image_processing_pixtral_fast.py @@ -46,11 +46,11 @@ class PixtralFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): """ - patch_size (`dict[str, int]` *optional*, defaults to `{"height": 16, "width": 16}`): + patch_size (`Union[int, dict[str, int]]` *optional*, defaults to `{"height": 16, "width": 16}`): Size of the patches in the model, used to calculate the output image size. Can be overridden by `patch_size` in the `preprocess` method. 
""" - patch_size: Optional[dict[str, int]] + patch_size: Optional[Union[int, dict[str, int]]] @auto_docstring diff --git a/tests/models/oneformer/test_image_processing_oneformer.py b/tests/models/oneformer/test_image_processing_oneformer.py index 4fe89959bf0b..1343d069d819 100644 --- a/tests/models/oneformer/test_image_processing_oneformer.py +++ b/tests/models/oneformer/test_image_processing_oneformer.py @@ -224,7 +224,6 @@ def comm_get_image_processor_inputs( annotations, return_tensors="pt", instance_id_to_semantic_id=instance_id_to_semantic_id, - pad_and_return_pixel_mask=True, ) return inputs diff --git a/tests/models/vitmatte/test_image_processing_vitmatte.py b/tests/models/vitmatte/test_image_processing_vitmatte.py index a103c33a9cca..e57fce61747d 100644 --- a/tests/models/vitmatte/test_image_processing_vitmatte.py +++ b/tests/models/vitmatte/test_image_processing_vitmatte.py @@ -255,18 +255,24 @@ def test_image_processor_preprocess_arguments(self): # vitmatte require additional trimap input for image_processor # that is why we override original common test - for image_processing_class in self.image_processor_list: + for i, image_processing_class in enumerate(self.image_processor_list): image_processor = image_processing_class(**self.image_processor_dict) image = self.image_processor_tester.prepare_image_inputs()[0] trimap = np.random.randint(0, 3, size=image.size[::-1]) - with warnings.catch_warnings(record=True) as raised_warnings: - warnings.simplefilter("always") - image_processor(image, trimaps=trimap, extra_argument=True) - - messages = " ".join([str(w.message) for w in raised_warnings]) - self.assertGreaterEqual(len(raised_warnings), 1) - self.assertIn("extra_argument", messages) + # Type validation will fail for fast processors only (for now) + if i == 1: + with self.assertRaises(TypeError): + image_processor(image, trimaps=trimap, extra_argument=True) + else: + # Else we just consume extra kwargs and raise a warning + with 
warnings.catch_warnings(record=True) as raised_warnings: + warnings.simplefilter("always") + image_processor(image, trimaps=trimap, extra_argument=True) + + messages = " ".join([str(w.message) for w in raised_warnings]) + self.assertGreaterEqual(len(raised_warnings), 1) + self.assertIn("extra_argument", messages) @unittest.skip(reason="Many failing cases. This test needs a more deep investigation.") def test_fast_is_faster_than_slow(self): diff --git a/tests/test_image_processing_common.py b/tests/test_image_processing_common.py index ce0bd4181be5..4ab674051ec8 100644 --- a/tests/test_image_processing_common.py +++ b/tests/test_image_processing_common.py @@ -514,8 +514,8 @@ def test_call_numpy_4_channels(self): image_inputs[0], return_tensors="pt", input_data_format="channels_last", - image_mean=0, - image_std=1, + image_mean=[0.0, 0.0, 0.0, 0.0], + image_std=[1.0, 1.0, 1.0, 1.0], ).pixel_values expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) @@ -525,8 +525,8 @@ def test_call_numpy_4_channels(self): image_inputs, return_tensors="pt", input_data_format="channels_last", - image_mean=0, - image_std=1, + image_mean=[0.0, 0.0, 0.0, 0.0], + image_std=[1.0, 1.0, 1.0, 1.0], ).pixel_values expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) self.assertEqual( From 3233a703a3ce30fcab2ab0ad634575e793eacefe Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 25 Sep 2025 12:39:31 +0200 Subject: [PATCH 18/28] fix more tests --- .../image_processing_utils_fast.py | 20 +++++----- src/transformers/processing_utils.py | 38 +++++++++++++------ .../test_image_processing_cohere2_vision.py | 8 ++-- .../glm4v/test_image_processing_glm4v.py | 8 ++-- .../glm4v/test_video_processing_glm4v.py | 8 ++-- .../nougat/test_image_processing_nougat.py | 8 ++-- .../test_video_processing_qwen2_vl.py | 8 ++-- 
.../test_video_processing_qwen3_vl.py | 8 ++-- tests/models/tvp/test_image_processing_tvp.py | 12 +++++- .../test_image_processing_videomae.py | 12 +++++- .../test_image_processing_vitmatte.py | 4 +- .../vitpose/test_image_processing_vitpose.py | 8 ++-- .../vivit/test_image_processing_vivit.py | 12 +++++- 13 files changed, 97 insertions(+), 57 deletions(-) diff --git a/src/transformers/image_processing_utils_fast.py b/src/transformers/image_processing_utils_fast.py index 0607d4fac550..01aab0eee3e7 100644 --- a/src/transformers/image_processing_utils_fast.py +++ b/src/transformers/image_processing_utils_fast.py @@ -15,7 +15,7 @@ from collections.abc import Iterable from copy import deepcopy from functools import lru_cache, partial -from typing import Any, Optional, TypedDict, Union +from typing import Annotated, Any, Optional, TypedDict, Union import numpy as np @@ -51,7 +51,7 @@ logging, ) from .utils.import_utils import is_rocm_platform -from .utils.type_validators import TypedDictAdapter +from .utils.type_validators import TypedDictAdapter, device_validator, image_size_validator, tensor_type_validator if is_vision_available(): @@ -168,23 +168,23 @@ def divide_to_patches( class DefaultFastImageProcessorKwargs(TypedDict, total=False): do_resize: Optional[bool] - size: Optional[dict[str, int]] + size: Annotated[Optional[Union[int, list[int], tuple[int, ...], dict[str, int]]], image_size_validator()] default_to_square: Optional[bool] resample: Optional[Union["PILImageResampling", "F.InterpolationMode", int]] do_center_crop: Optional[bool] - crop_size: Optional[dict[str, int]] + crop_size: Annotated[Optional[Union[int, list[int], tuple[int, ...], dict[str, int]]], image_size_validator()] do_rescale: Optional[bool] rescale_factor: Optional[Union[int, float]] do_normalize: Optional[bool] - image_mean: Optional[Union[float, list[float], tuple[float, float, float]]] - image_std: Optional[Union[float, list[float], tuple[float, float, float]]] + image_mean: 
Optional[Union[float, list[float], tuple[float, ...]]] + image_std: Optional[Union[float, list[float], tuple[float, ...]]] do_pad: Optional[bool] - pad_size: Optional[dict[str, int]] + crop_size: Annotated[Optional[Union[int, list[int], tuple[int, ...], dict[str, int]]], image_size_validator()] do_convert_rgb: Optional[bool] - return_tensors: Optional[Union[str, TensorType]] - data_format: Optional[ChannelDimension] + return_tensors: Annotated[Optional[Union[str, TensorType]], tensor_type_validator()] + data_format: Optional[Union[str, ChannelDimension]] input_data_format: Optional[Union[str, ChannelDimension]] - device: Optional[Union[str, "torch.device"]] + device: Annotated[Optional[str], device_validator()] disable_grouping: Optional[bool] diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 89dea69b000d..f3d92420691f 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -147,6 +147,10 @@ class TextKwargs(TypedDict, total=False): The side on which padding will be applied. return_mm_token_type_ids (`bool`, *optional*): Whether to return multimodal token type ids indicating mm placeholder token positions. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. """ text_pair: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] @@ -209,20 +213,24 @@ class methods and docstrings. The channel dimension format for the input image. device (`str`, *optional*): The device to use for processing (e.g. "cpu", "cuda"), only relevant for fast image processing. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + - `'pt'`: Return PyTorch `torch.Tensor` objects. 
+ - `'np'`: Return NumPy `np.ndarray` objects. """ do_convert_rgb: Optional[bool] do_resize: Optional[bool] - size: Annotated[Optional[dict[str, int]], image_size_validator()] - crop_size: Annotated[Optional[dict[str, int]], image_size_validator()] + size: Annotated[Optional[Union[int, list[int], tuple[int, ...], dict[str, int]]], image_size_validator()] + crop_size: Annotated[Optional[Union[int, list[int], tuple[int, ...], dict[str, int]]], image_size_validator()] resample: Annotated[Optional[Union["PILImageResampling", int]], resampling_validator()] do_rescale: Optional[bool] rescale_factor: Optional[float] do_normalize: Optional[bool] - image_mean: Optional[Union[float, list[float], tuple[float, float, float]]] - image_std: Optional[Union[float, list[float], tuple[float, float, float]]] + image_mean: Optional[Union[float, list[float], tuple[float, ...]]] + image_std: Optional[Union[float, list[float], tuple[float, ...]]] do_pad: Optional[bool] - pad_size: Annotated[Optional[dict[str, int]], image_size_validator()] + pad_size: Annotated[Optional[Union[int, list[int], tuple[int, ...], dict[str, int]]], image_size_validator()] do_center_crop: Optional[bool] data_format: Optional[Union[str, ChannelDimension]] input_data_format: Optional[Union[str, ChannelDimension]] @@ -271,23 +279,27 @@ class VideosKwargs(TypedDict, total=False): The channel dimension format for the output video. input_data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format for the input video. - return_metadata (`ChannelDimension` or `str`, *optional*): + return_metadata (`bool`, *optional*): Whether to return video metadata or not. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. 
""" do_convert_rgb: Optional[bool] do_resize: Optional[bool] - size: Annotated[Optional[dict[str, int]], image_size_validator()] + size: Annotated[Optional[Union[int, list[int], tuple[int, ...], dict[str, int]]], image_size_validator()] default_to_square: Optional[bool] resample: Annotated[Optional[Union["PILImageResampling", int]], resampling_validator()] do_rescale: Optional[bool] rescale_factor: Optional[float] do_normalize: Optional[bool] - image_mean: Optional[Union[float, list[float], tuple[float, float, float]]] - image_std: Optional[Union[float, list[float], tuple[float, float, float]]] + image_mean: Optional[Union[float, list[float], tuple[float, ...]]] + image_std: Optional[Union[float, list[float], tuple[float, ...]]] do_center_crop: Optional[bool] - crop_size: Annotated[Optional[dict[str, int]], image_size_validator()] - data_format: Optional[ChannelDimension] + crop_size: Annotated[Optional[Union[int, list[int], tuple[int, ...], dict[str, int]]], image_size_validator()] + data_format: Optional[Union[str, ChannelDimension]] input_data_format: Optional[Union[str, ChannelDimension]] device: Annotated[Optional[str], device_validator()] do_sample_frames: Optional[bool] @@ -326,6 +338,10 @@ class AudioKwargs(TypedDict, total=False): If set, will pad the sequence to a multiple of the provided value. return_attention_mask (`bool`, *optional*): Whether or not [`~ASTFeatureExtractor.__call__`] should return `attention_mask`. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. 
""" sampling_rate: Annotated[Optional[int], positive_int()] diff --git a/tests/models/cohere2_vision/test_image_processing_cohere2_vision.py b/tests/models/cohere2_vision/test_image_processing_cohere2_vision.py index 7ab3bf70d57b..81a16ba39c14 100644 --- a/tests/models/cohere2_vision/test_image_processing_cohere2_vision.py +++ b/tests/models/cohere2_vision/test_image_processing_cohere2_vision.py @@ -176,8 +176,8 @@ def test_call_numpy_4_channels(self): image_inputs[0], return_tensors="pt", input_data_format="channels_last", - image_mean=0, - image_std=1, + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), ).pixel_values self.assertEqual(tuple(encoded_images.shape), (10, 4, 30, 30)) @@ -186,7 +186,7 @@ def test_call_numpy_4_channels(self): image_inputs, return_tensors="pt", input_data_format="channels_last", - image_mean=0, - image_std=1, + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), ).pixel_values self.assertEqual(tuple(encoded_images.shape), (70, 4, 30, 30)) diff --git a/tests/models/glm4v/test_image_processing_glm4v.py b/tests/models/glm4v/test_image_processing_glm4v.py index cb5af4b275d2..1226fe473db9 100644 --- a/tests/models/glm4v/test_image_processing_glm4v.py +++ b/tests/models/glm4v/test_image_processing_glm4v.py @@ -236,8 +236,8 @@ def test_call_numpy_4_channels(self): image_inputs[0], return_tensors="pt", input_data_format="channels_last", - image_mean=0, - image_std=1, + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), ).pixel_values expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) @@ -247,8 +247,8 @@ def test_call_numpy_4_channels(self): image_inputs, return_tensors="pt", input_data_format="channels_last", - image_mean=0, - image_std=1, + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), ).pixel_values expected_output_image_shape = 
self.image_processor_tester.expected_output_image_shape(image_inputs) self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) diff --git a/tests/models/glm4v/test_video_processing_glm4v.py b/tests/models/glm4v/test_video_processing_glm4v.py index 1dcd4bdecca6..8443c728f2f2 100644 --- a/tests/models/glm4v/test_video_processing_glm4v.py +++ b/tests/models/glm4v/test_video_processing_glm4v.py @@ -250,8 +250,8 @@ def test_call_numpy_4_channels(self): video_inputs[0], return_tensors="pt", input_data_format="channels_last", - image_mean=0, - image_std=1, + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), )[self.input_name] expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]]) self.assertEqual(list(encoded_videos.shape), expected_output_video_shape) @@ -261,8 +261,8 @@ def test_call_numpy_4_channels(self): video_inputs, return_tensors="pt", input_data_format="channels_last", - image_mean=0, - image_std=1, + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), )[self.input_name] expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs) self.assertEqual(list(encoded_videos.shape), expected_output_video_shape) diff --git a/tests/models/nougat/test_image_processing_nougat.py b/tests/models/nougat/test_image_processing_nougat.py index c014c21828f4..68a71a6dfb8c 100644 --- a/tests/models/nougat/test_image_processing_nougat.py +++ b/tests/models/nougat/test_image_processing_nougat.py @@ -282,8 +282,8 @@ def test_call_numpy_4_channels(self): image_inputs[0], return_tensors="pt", input_data_format="channels_last", - image_mean=0, - image_std=1, + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), ).pixel_values expected_output_image_shape = self.image_processor_tester.expected_output_image_shape( [image_inputs[0]] @@ -295,8 +295,8 @@ def test_call_numpy_4_channels(self): image_inputs, return_tensors="pt", 
input_data_format="channels_last", - image_mean=0, - image_std=1, + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), ).pixel_values expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) self.assertEqual( diff --git a/tests/models/qwen2_vl/test_video_processing_qwen2_vl.py b/tests/models/qwen2_vl/test_video_processing_qwen2_vl.py index 4d6026a06289..b80adebbd9ab 100644 --- a/tests/models/qwen2_vl/test_video_processing_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_video_processing_qwen2_vl.py @@ -265,8 +265,8 @@ def test_call_numpy_4_channels(self): video_inputs[0], return_tensors="pt", input_data_format="channels_last", - image_mean=0, - image_std=1, + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), )[self.input_name] expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]]) self.assertEqual(list(encoded_videos.shape), expected_output_video_shape) @@ -276,8 +276,8 @@ def test_call_numpy_4_channels(self): video_inputs, return_tensors="pt", input_data_format="channels_last", - image_mean=0, - image_std=1, + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), )[self.input_name] expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs) self.assertEqual(list(encoded_videos.shape), expected_output_video_shape) diff --git a/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py b/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py index 9230f0f9502e..60f4023938bb 100644 --- a/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py +++ b/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py @@ -249,8 +249,8 @@ def test_call_numpy_4_channels(self): video_inputs[0], return_tensors="pt", input_data_format="channels_last", - image_mean=0, - image_std=1, + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), )[self.input_name] expected_output_video_shape = 
self.video_processor_tester.expected_output_video_shape([video_inputs[0]]) self.assertEqual(list(encoded_videos.shape), expected_output_video_shape) @@ -260,8 +260,8 @@ def test_call_numpy_4_channels(self): video_inputs, return_tensors="pt", input_data_format="channels_last", - image_mean=0, - image_std=1, + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), )[self.input_name] expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs) self.assertEqual(list(encoded_videos.shape), expected_output_video_shape) diff --git a/tests/models/tvp/test_image_processing_tvp.py b/tests/models/tvp/test_image_processing_tvp.py index c2c8b81dfc0a..390ed75a913b 100644 --- a/tests/models/tvp/test_image_processing_tvp.py +++ b/tests/models/tvp/test_image_processing_tvp.py @@ -274,7 +274,11 @@ def test_call_numpy_4_channels(self): # Test not batched input expected_height, expected_width = self.image_processor_tester.get_expected_values(video_inputs) encoded_videos = image_processing( - test_inputs[0], return_tensors="pt", image_mean=0, image_std=1, input_data_format="channels_first" + test_inputs[0], + return_tensors="pt", + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), + input_data_format="channels_first", ).pixel_values self.assertListEqual( list(encoded_videos.shape), @@ -292,7 +296,11 @@ def test_call_numpy_4_channels(self): video_inputs, batched=True ) encoded_videos = image_processing( - test_inputs, return_tensors="pt", image_mean=0, image_std=1, input_data_format="channels_first" + test_inputs, + return_tensors="pt", + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), + input_data_format="channels_first", ).pixel_values self.assertListEqual( list(encoded_videos.shape), diff --git a/tests/models/videomae/test_image_processing_videomae.py b/tests/models/videomae/test_image_processing_videomae.py index 2fe9303f3705..f8576a7bc8af 100644 --- 
a/tests/models/videomae/test_image_processing_videomae.py +++ b/tests/models/videomae/test_image_processing_videomae.py @@ -177,14 +177,22 @@ def test_call_numpy_4_channels(self): # Test not batched input encoded_videos = image_processing( - video_inputs[0], return_tensors="pt", image_mean=0, image_std=1, input_data_format="channels_first" + video_inputs[0], + return_tensors="pt", + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), + input_data_format="channels_first", ).pixel_values expected_output_video_shape = self.image_processor_tester.expected_output_image_shape([encoded_videos[0]]) self.assertEqual(tuple(encoded_videos.shape), (1, *expected_output_video_shape)) # Test batched encoded_videos = image_processing( - video_inputs, return_tensors="pt", image_mean=0, image_std=1, input_data_format="channels_first" + video_inputs, + return_tensors="pt", + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), + input_data_format="channels_first", ).pixel_values expected_output_video_shape = self.image_processor_tester.expected_output_image_shape(encoded_videos) self.assertEqual( diff --git a/tests/models/vitmatte/test_image_processing_vitmatte.py b/tests/models/vitmatte/test_image_processing_vitmatte.py index e57fce61747d..a17968a1d567 100644 --- a/tests/models/vitmatte/test_image_processing_vitmatte.py +++ b/tests/models/vitmatte/test_image_processing_vitmatte.py @@ -220,8 +220,8 @@ def test_call_numpy_4_channels(self): images=image, trimaps=trimap, input_data_format="channels_last", - image_mean=0, - image_std=1, + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), return_tensors="pt", ).pixel_values diff --git a/tests/models/vitpose/test_image_processing_vitpose.py b/tests/models/vitpose/test_image_processing_vitpose.py index 44d9ddf8eb59..c0ede8e22de0 100644 --- a/tests/models/vitpose/test_image_processing_vitpose.py +++ b/tests/models/vitpose/test_image_processing_vitpose.py @@ -205,8 +205,8 @@ def 
test_call_numpy_4_channels(self): boxes=boxes, return_tensors="pt", input_data_format="channels_last", - image_mean=0, - image_std=1, + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), ).pixel_values expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) self.assertEqual(tuple(encoded_images.shape), (len(boxes[0]), *expected_output_image_shape)) @@ -218,8 +218,8 @@ def test_call_numpy_4_channels(self): boxes=boxes, return_tensors="pt", input_data_format="channels_last", - image_mean=0, - image_std=1, + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), ).pixel_values expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) self.assertEqual( diff --git a/tests/models/vivit/test_image_processing_vivit.py b/tests/models/vivit/test_image_processing_vivit.py index bf61fc1082b2..323dbd3cc55f 100644 --- a/tests/models/vivit/test_image_processing_vivit.py +++ b/tests/models/vivit/test_image_processing_vivit.py @@ -191,14 +191,22 @@ def test_call_numpy_4_channels(self): # Test not batched input encoded_videos = image_processing( - video_inputs[0], return_tensors="pt", image_mean=0, image_std=1, input_data_format="channels_first" + video_inputs[0], + return_tensors="pt", + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), + input_data_format="channels_first", ).pixel_values expected_output_video_shape = self.image_processor_tester.expected_output_image_shape([encoded_videos[0]]) self.assertEqual(tuple(encoded_videos.shape), (1, *expected_output_video_shape)) # Test batched encoded_videos = image_processing( - video_inputs, return_tensors="pt", image_mean=0, image_std=1, input_data_format="channels_first" + video_inputs, + return_tensors="pt", + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), + input_data_format="channels_first", ).pixel_values expected_output_video_shape = 
self.image_processor_tester.expected_output_image_shape(encoded_videos) self.assertEqual( From 909b98ed81fcab8fba0fdc26da3a84330c4107e4 Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 25 Sep 2025 12:56:42 +0200 Subject: [PATCH 19/28] typo.and fix copies --- src/transformers/image_processing_utils_fast.py | 2 +- src/transformers/models/lfm2_vl/processing_lfm2_vl.py | 2 -- src/transformers/utils/type_validators.py | 4 ++-- tests/models/tvp/test_image_processing_tvp.py | 8 ++++---- 4 files changed, 7 insertions(+), 9 deletions(-) diff --git a/src/transformers/image_processing_utils_fast.py b/src/transformers/image_processing_utils_fast.py index 01aab0eee3e7..975ddb282061 100644 --- a/src/transformers/image_processing_utils_fast.py +++ b/src/transformers/image_processing_utils_fast.py @@ -179,7 +179,7 @@ class DefaultFastImageProcessorKwargs(TypedDict, total=False): image_mean: Optional[Union[float, list[float], tuple[float, ...]]] image_std: Optional[Union[float, list[float], tuple[float, ...]]] do_pad: Optional[bool] - crop_size: Annotated[Optional[Union[int, list[int], tuple[int, ...], dict[str, int]]], image_size_validator()] + pad_size: Annotated[Optional[Union[int, list[int], tuple[int, ...], dict[str, int]]], image_size_validator()] do_convert_rgb: Optional[bool] return_tensors: Annotated[Optional[Union[str, TensorType]], tensor_type_validator()] data_format: Optional[Union[str, ChannelDimension]] diff --git a/src/transformers/models/lfm2_vl/processing_lfm2_vl.py b/src/transformers/models/lfm2_vl/processing_lfm2_vl.py index 32965e6fc7ac..9495a1a3bda2 100755 --- a/src/transformers/models/lfm2_vl/processing_lfm2_vl.py +++ b/src/transformers/models/lfm2_vl/processing_lfm2_vl.py @@ -81,8 +81,6 @@ class Lfm2VlProcessor(ProcessorMixin): An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input. 
chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string. - use_image_special_tokens (`bool`, *optional*, defaults to `True`): - Whether to use image special tokens or not when processing. """ attributes = ["image_processor", "tokenizer"] diff --git a/src/transformers/utils/type_validators.py b/src/transformers/utils/type_validators.py index ee6ff2be956f..4b56f7842480 100644 --- a/src/transformers/utils/type_validators.py +++ b/src/transformers/utils/type_validators.py @@ -135,11 +135,11 @@ def truncation_validator(value: Optional[Union[bool, str, TruncationStrategy]] = @as_validated_field -def image_size_validator(value: Optional[dict[str, int]] = None): +def image_size_validator(value: Optional[Union[int, Sequence[int], dict[str, int]]] = None): possible_keys = ["height", "width", "longest_edge", "shortest_edge", "max_height", "max_width"] if value is None: pass - elif not isinstance(value, dict) or any(k not in possible_keys for k in value.keys()): + elif isinstance(value, dict) and any(k not in possible_keys for k in value.keys()): raise ValueError(f"Value for size must be a dict with keys {possible_keys} but got size={value}") diff --git a/tests/models/tvp/test_image_processing_tvp.py b/tests/models/tvp/test_image_processing_tvp.py index 390ed75a913b..6d454daf9e4b 100644 --- a/tests/models/tvp/test_image_processing_tvp.py +++ b/tests/models/tvp/test_image_processing_tvp.py @@ -276,8 +276,8 @@ def test_call_numpy_4_channels(self): encoded_videos = image_processing( test_inputs[0], return_tensors="pt", - image_mean=(0.0, 0.0, 0.0, 0.0), - image_std=(1.0, 1.0, 1.0, 1.0), + image_mean=(0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0), input_data_format="channels_first", ).pixel_values self.assertListEqual( @@ -298,8 +298,8 @@ def test_call_numpy_4_channels(self): encoded_videos = image_processing( test_inputs, return_tensors="pt", - image_mean=(0.0, 0.0, 0.0, 0.0), - image_std=(1.0, 1.0, 
1.0, 1.0), + image_mean=(0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0), input_data_format="channels_first", ).pixel_values self.assertListEqual( From 4410dd3aa632066b7a7677f7bd43b3984b2dbefd Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 25 Sep 2025 17:21:55 +0200 Subject: [PATCH 20/28] bump --- setup.py | 2 +- src/transformers/dependency_versions_table.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 3945537c49ff..a498aed58805 100644 --- a/setup.py +++ b/setup.py @@ -114,7 +114,7 @@ "GitPython<3.1.19", "hf-doc-builder>=0.3.0", "hf_xet", - "huggingface-hub==1.0.0.rc1", + "huggingface-hub==1.0.0.rc2", "importlib_metadata", "ipadic>=1.0.0,<2.0", "jinja2>=3.1.0", diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 80b107d93c4d..4f6c65966713 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -23,7 +23,7 @@ "GitPython": "GitPython<3.1.19", "hf-doc-builder": "hf-doc-builder>=0.3.0", "hf_xet": "hf_xet", - "huggingface-hub": "huggingface-hub==1.0.0.rc1", + "huggingface-hub": "huggingface-hub==1.0.0.rc2", "importlib_metadata": "importlib_metadata", "ipadic": "ipadic>=1.0.0,<2.0", "jinja2": "jinja2>=3.1.0", From 1daa883fe176da4a0c614452484e26a357cc5c76 Mon Sep 17 00:00:00 2001 From: raushan Date: Fri, 3 Oct 2025 15:38:08 +0200 Subject: [PATCH 21/28] style --- src/transformers/image_processing_utils_fast.py | 4 ++-- .../models/llava_onevision/modular_llava_onevision.py | 2 +- src/transformers/processing_utils.py | 2 -- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/transformers/image_processing_utils_fast.py b/src/transformers/image_processing_utils_fast.py index 157ad6b6acdd..09da3e263470 100644 --- a/src/transformers/image_processing_utils_fast.py +++ b/src/transformers/image_processing_utils_fast.py @@ -15,7 +15,7 @@ from collections.abc import Iterable from copy import deepcopy from 
functools import lru_cache, partial -from typing import Annotated, Any, Optional, TypedDict, Union +from typing import Any, Optional, Union import numpy as np @@ -50,7 +50,7 @@ logging, ) from .utils.import_utils import is_rocm_platform -from .utils.type_validators import TypedDictAdapter, device_validator, image_size_validator, tensor_type_validator +from .utils.type_validators import TypedDictAdapter if is_vision_available(): diff --git a/src/transformers/models/llava_onevision/modular_llava_onevision.py b/src/transformers/models/llava_onevision/modular_llava_onevision.py index f2d78dfaf81a..88d1c10ab122 100644 --- a/src/transformers/models/llava_onevision/modular_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modular_llava_onevision.py @@ -35,7 +35,7 @@ from ...cache_utils import Cache from ...image_processing_utils import BatchFeature -from ...image_processing_utils_fast import group_images_by_shape, reorder_images +from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images from ...image_utils import ( OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index c67d9a49b76d..8ad4d57d1d10 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -84,8 +84,6 @@ if is_torch_available(): - import torch - from .modeling_utils import PreTrainedAudioTokenizerBase From b8385a201276d2cb31d4c38f13efed01176f7903 Mon Sep 17 00:00:00 2001 From: raushan Date: Tue, 7 Oct 2025 19:19:17 +0200 Subject: [PATCH 22/28] fix some tests --- .../models/deepseek_vl/image_processing_deepseek_vl.py | 2 +- .../image_processing_deepseek_vl_hybrid.py | 10 +++++----- .../deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py | 10 +++++----- .../efficientnet/image_processing_efficientnet.py | 4 ++-- src/transformers/models/eomt/image_processing_eomt.py | 4 ++-- .../models/janus/image_processing_janus.py | 2 +- 
.../models/pixtral/image_processing_pixtral.py | 4 ++-- .../models/qwen2_5_omni/processing_qwen2_5_omni.py | 2 +- .../models/qwen3_omni_moe/processing_qwen3_omni_moe.py | 4 ++-- src/transformers/processing_utils.py | 10 +++++++++- 10 files changed, 30 insertions(+), 22 deletions(-) diff --git a/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py b/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py index c41ac586753e..88dcbff6c416 100644 --- a/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py +++ b/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py @@ -56,7 +56,7 @@ class DeepseekVLImageProcessorKwargs(ImagesKwargs): falls below this value after resizing. """ - min_size: int + min_size: Optional[int] class DeepseekVLImageProcessor(BaseImageProcessor): diff --git a/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py index 8b93f7fa6c94..091e02742d89 100644 --- a/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py @@ -69,11 +69,11 @@ class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs): number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method. 
""" - min_size: int - high_res_size: dict - high_res_resample: "PILImageResampling" - high_res_image_mean: list[float] - high_res_image_std: list[float] + min_size: Optional[int] + high_res_size: Optional[dict] + high_res_resample: Optional[Union["PILImageResampling", int]] + high_res_image_mean: Optional[Union[float, list[float], tuple[float, ...]]] + high_res_image_std: Optional[Union[float, list[float], tuple[float, ...]]] class DeepseekVLHybridImageProcessor(BaseImageProcessor): diff --git a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py index 4135623743ae..7a745a351c9e 100644 --- a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py @@ -448,11 +448,11 @@ class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs): number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method. 
""" - min_size: int - high_res_size: dict - high_res_resample: "PILImageResampling" - high_res_image_mean: list[float] - high_res_image_std: list[float] + min_size: Optional[int] + high_res_size: Optional[dict] + high_res_resample: Optional[Union["PILImageResampling", int]] + high_res_image_mean: Optional[Union[float, list[float], tuple[float, ...]]] + high_res_image_std: Optional[Union[float, list[float], tuple[float, ...]]] class DeepseekVLHybridImageProcessor(DeepseekVLImageProcessor): diff --git a/src/transformers/models/efficientnet/image_processing_efficientnet.py b/src/transformers/models/efficientnet/image_processing_efficientnet.py index f5a69eff70e4..4868f573a517 100644 --- a/src/transformers/models/efficientnet/image_processing_efficientnet.py +++ b/src/transformers/models/efficientnet/image_processing_efficientnet.py @@ -52,8 +52,8 @@ class EfficientNetImageProcessorKwargs(ImagesKwargs): Normalize the image again with the standard deviation only for image classification if set to True. """ - rescale_offset: bool - include_top: bool + rescale_offset: Optional[bool] + include_top: Optional[bool] class EfficientNetImageProcessor(BaseImageProcessor): diff --git a/src/transformers/models/eomt/image_processing_eomt.py b/src/transformers/models/eomt/image_processing_eomt.py index 3381e5bcac50..55e57e9b2343 100644 --- a/src/transformers/models/eomt/image_processing_eomt.py +++ b/src/transformers/models/eomt/image_processing_eomt.py @@ -66,8 +66,8 @@ class EomtImageProcessorKwargs(ImagesKwargs): denoted with 0 (background) will be replaced with `ignore_index`. 
""" - do_split_image: bool - ignore_index: Optional[int] = None + do_split_image: Optional[bool] + ignore_index: Optional[int] # Adapted from transformers.models.maskformer.image_processing_maskformer.convert_segmentation_map_to_binary_masks diff --git a/src/transformers/models/janus/image_processing_janus.py b/src/transformers/models/janus/image_processing_janus.py index 06ea0fe0e4d1..b342966ece55 100644 --- a/src/transformers/models/janus/image_processing_janus.py +++ b/src/transformers/models/janus/image_processing_janus.py @@ -58,7 +58,7 @@ class JanusImageProcessorKwargs(ImagesKwargs): falls below this value after resizing. """ - min_size: int + min_size: Optional[int] class JanusImageProcessor(BaseImageProcessor): diff --git a/src/transformers/models/pixtral/image_processing_pixtral.py b/src/transformers/models/pixtral/image_processing_pixtral.py index f5df895e66a4..864e76eb0e57 100644 --- a/src/transformers/models/pixtral/image_processing_pixtral.py +++ b/src/transformers/models/pixtral/image_processing_pixtral.py @@ -52,11 +52,11 @@ class PixtralImageProcessorKwargs(ImagesKwargs): """ - patch_size (`dict[str, int]` *optional*, defaults to `{"height": 16, "width": 16}`): + patch_size (`Union[dict[str, int], int]` *optional*, defaults to `{"height": 16, "width": 16}`): Size of the patches in the model, used to calculate the output image size. Can be overridden by `patch_size` in the `preprocess` method. """ - patch_size: Optional[dict[str, int]] + patch_size: Optional[Union[dict[str, int], int]] # Adapted from function in image_transforms.py to ensure any transparent pixels are converted to white. 
diff --git a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py index 95f687e1414a..fb0ec1c89420 100644 --- a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py @@ -42,7 +42,7 @@ class Qwen2_5_OmniVideosKwargs(VideosKwargs): max_frames: Optional[int] use_audio_in_video: Optional[bool] seconds_per_chunk: Optional[float] - position_id_per_seconds: Optional[int] + position_id_per_seconds: Optional[Union[int, float]] class Qwen2_5OmniProcessorKwargs(ProcessingKwargs, total=False): diff --git a/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py index a118f7d2260b..70988b3b77c5 100644 --- a/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py @@ -20,7 +20,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import re -from typing import Optional +from typing import Optional, Union import numpy as np @@ -44,7 +44,7 @@ class Qwen3OmniMoeVideosKwargs(VideosKwargs): max_frames: Optional[int] use_audio_in_video: Optional[bool] seconds_per_chunk: Optional[float] - position_id_per_seconds: Optional[int] + position_id_per_seconds: Optional[Union[int, float]] class Qwen3OmniMoeProcessorKwargs(ProcessingKwargs, total=False): diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 8ad4d57d1d10..17af2698ae56 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -1389,8 +1389,16 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg for kwarg in output_kwargs.values(): kwarg.update(common_kwargs) - # Finally perform type validation on collected kwargs for key, typed_dict_obj in ModelProcessorKwargs.__annotations__.items(): + if key in map_preprocessor_kwargs: + preprocessor = getattr(self, map_preprocessor_kwargs[key], None) + if preprocessor is None or getattr(preprocessor, "valid_kwargs", None) is None: + continue + preprocessor_typed_dict_obj = getattr(preprocessor, "valid_kwargs") + typed_dict_obj = TypedDict( + "merged_typed_dict", + {**preprocessor_typed_dict_obj.__annotations__, **typed_dict_obj.__annotations__}, + ) type_validator = TypedDictAdapter(typed_dict_obj) type_validator.validate_fields(**output_kwargs[key]) return output_kwargs From 69448bb1341f2895090670763383d33c4ba5d08d Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 8 Oct 2025 11:31:07 +0200 Subject: [PATCH 23/28] fix copies --- src/transformers/models/janus/modular_janus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/janus/modular_janus.py b/src/transformers/models/janus/modular_janus.py index a2f2541d84fa..307bbbd38890 100644 --- a/src/transformers/models/janus/modular_janus.py +++ b/src/transformers/models/janus/modular_janus.py @@ -1296,7 +1296,7 @@ 
class JanusImageProcessorKwargs(ImagesKwargs): falls below this value after resizing. """ - min_size: int + min_size: Optional[int] class JanusImageProcessor(BlipImageProcessor): From d25361509f358c01a008a492f52f2093f9cae31b Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 8 Oct 2025 13:42:33 +0200 Subject: [PATCH 24/28] pin rc4 and mark all TypedDict as non-total --- setup.py | 2 +- src/transformers/dependency_versions_table.py | 2 +- src/transformers/image_processing_utils_fast.py | 5 ++--- src/transformers/models/beit/image_processing_beit.py | 2 +- .../bridgetower/image_processing_bridgetower.py | 2 +- .../image_processing_cohere2_vision_fast.py | 2 +- .../models/cohere2_vision/modular_cohere2_vision.py | 2 +- .../image_processing_conditional_detr.py | 2 +- .../models/convnext/image_processing_convnext.py | 2 +- .../deepseek_vl/image_processing_deepseek_vl.py | 2 +- .../image_processing_deepseek_vl_hybrid.py | 2 +- .../deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py | 2 +- .../image_processing_deformable_detr.py | 2 +- src/transformers/models/detr/image_processing_detr.py | 2 +- .../models/donut/image_processing_donut.py | 2 +- src/transformers/models/dpt/image_processing_dpt.py | 2 +- .../efficientloftr/image_processing_efficientloftr.py | 2 +- .../efficientnet/image_processing_efficientnet.py | 2 +- src/transformers/models/emu3/image_processing_emu3.py | 2 +- src/transformers/models/eomt/image_processing_eomt.py | 2 +- .../models/flava/image_processing_flava.py | 2 +- .../models/gemma3/image_processing_gemma3.py | 2 +- .../models/glm4v/image_processing_glm4v.py | 2 +- .../models/glm4v/video_processing_glm4v.py | 2 +- .../models/got_ocr2/image_processing_got_ocr2.py | 2 +- .../grounding_dino/image_processing_grounding_dino.py | 2 +- .../models/idefics/image_processing_idefics.py | 2 +- .../models/idefics2/image_processing_idefics2.py | 2 +- .../models/idefics3/image_processing_idefics3.py | 2 +- .../models/imagegpt/image_processing_imagegpt.py | 2 +- 
.../video_processing_instructblipvideo.py | 2 +- .../models/internvl/video_processing_internvl.py | 2 +- .../models/janus/image_processing_janus.py | 2 +- src/transformers/models/janus/modular_janus.py | 2 +- .../models/kosmos2_5/image_processing_kosmos2_5.py | 2 +- .../models/layoutlmv2/image_processing_layoutlmv2.py | 2 +- .../models/layoutlmv3/image_processing_layoutlmv3.py | 2 +- .../models/lfm2_vl/image_processing_lfm2_vl_fast.py | 2 +- .../models/llama4/image_processing_llama4_fast.py | 2 +- .../models/llava_next/image_processing_llava_next.py | 2 +- .../image_processing_llava_onevision.py | 2 +- .../mask2former/image_processing_mask2former.py | 2 +- .../models/maskformer/image_processing_maskformer.py | 2 +- .../models/mllama/image_processing_mllama.py | 2 +- .../mobilenet_v2/image_processing_mobilenet_v2.py | 2 +- .../models/mobilevit/image_processing_mobilevit.py | 2 +- .../models/nougat/image_processing_nougat.py | 2 +- .../models/oneformer/image_processing_oneformer.py | 2 +- .../models/ovis2/image_processing_ovis2.py | 2 +- .../image_processing_perception_lm_fast.py | 2 +- .../image_processing_phi4_multimodal_fast.py | 2 +- .../models/pix2struct/image_processing_pix2struct.py | 2 +- .../models/pixtral/image_processing_pixtral.py | 2 +- .../models/poolformer/image_processing_poolformer.py | 2 +- .../image_processing_prompt_depth_anything.py | 2 +- .../models/qwen2_5_omni/processing_qwen2_5_omni.py | 2 +- .../models/qwen2_vl/image_processing_qwen2_vl.py | 2 +- .../models/qwen2_vl/video_processing_qwen2_vl.py | 2 +- .../qwen3_omni_moe/processing_qwen3_omni_moe.py | 2 +- .../models/qwen3_vl/video_processing_qwen3_vl.py | 2 +- .../models/rt_detr/image_processing_rt_detr.py | 2 +- src/transformers/models/sam/image_processing_sam.py | 2 +- src/transformers/models/sam/processing_sam.py | 2 +- .../models/sam2/image_processing_sam2_fast.py | 2 +- src/transformers/models/sam2/modular_sam2.py | 2 +- src/transformers/models/sam_hq/processing_samhq.py | 2 +- 
.../models/segformer/image_processing_segformer.py | 2 +- .../models/siglip2/image_processing_siglip2.py | 2 +- .../models/smolvlm/image_processing_smolvlm.py | 2 +- .../models/smolvlm/video_processing_smolvlm.py | 2 +- .../models/superpoint/image_processing_superpoint.py | 2 +- .../models/swin2sr/image_processing_swin2sr.py | 2 +- .../models/textnet/image_processing_textnet.py | 2 +- src/transformers/models/tvp/image_processing_tvp.py | 2 +- src/transformers/models/vilt/image_processing_vilt.py | 2 +- .../models/vitmatte/image_processing_vitmatte.py | 2 +- .../models/yolos/image_processing_yolos.py | 2 +- .../models/zoedepth/image_processing_zoedepth.py | 2 +- src/transformers/processing_utils.py | 6 +++--- src/transformers/utils/type_validators.py | 11 +---------- src/transformers/video_processing_utils.py | 5 ++--- 81 files changed, 85 insertions(+), 96 deletions(-) diff --git a/setup.py b/setup.py index 41743d8e2ef1..462a97c44930 100644 --- a/setup.py +++ b/setup.py @@ -114,7 +114,7 @@ "GitPython<3.1.19", "hf-doc-builder>=0.3.0", "hf_xet", - "huggingface-hub==1.0.0.rc2", + "huggingface-hub==1.0.0.rc4", "importlib_metadata", "ipadic>=1.0.0,<2.0", "jinja2>=3.1.0", diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index a6b6a9c445e6..1caefce16c3e 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -23,7 +23,7 @@ "GitPython": "GitPython<3.1.19", "hf-doc-builder": "hf-doc-builder>=0.3.0", "hf_xet": "hf_xet", - "huggingface-hub": "huggingface-hub==1.0.0.rc2", + "huggingface-hub": "huggingface-hub==1.0.0.rc4", "importlib_metadata": "importlib_metadata", "ipadic": "ipadic>=1.0.0,<2.0", "jinja2": "jinja2>=3.1.0", diff --git a/src/transformers/image_processing_utils_fast.py b/src/transformers/image_processing_utils_fast.py index 09da3e263470..8d4c79afbb1d 100644 --- a/src/transformers/image_processing_utils_fast.py +++ 
b/src/transformers/image_processing_utils_fast.py @@ -18,6 +18,7 @@ from typing import Any, Optional, Union import numpy as np +from huggingface_hub.dataclasses import validate_typed_dict from .image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict from .image_transforms import ( @@ -50,7 +51,6 @@ logging, ) from .utils.import_utils import is_rocm_platform -from .utils.type_validators import TypedDictAdapter if is_vision_available(): @@ -713,8 +713,7 @@ def preprocess(self, images: ImageInput, *args, **kwargs: Unpack[ImagesKwargs]) validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_kwargs_names) # Perform type validation on received kwargs - type_validator = TypedDictAdapter(self.valid_kwargs) - type_validator.validate_fields(**kwargs) + validate_typed_dict(self.valid_kwargs, kwargs) # Set default kwargs from self. This ensures that if a kwarg is not provided # by the user, it gets its default value from the instance, or is set to None. diff --git a/src/transformers/models/beit/image_processing_beit.py b/src/transformers/models/beit/image_processing_beit.py index f65709168379..9025e2868df8 100644 --- a/src/transformers/models/beit/image_processing_beit.py +++ b/src/transformers/models/beit/image_processing_beit.py @@ -55,7 +55,7 @@ logger = logging.get_logger(__name__) -class BeitImageProcessorKwargs(ImagesKwargs): +class BeitImageProcessorKwargs(ImagesKwargs, total=False): r""" do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): Whether or not to reduce all label values of segmentation maps by 1. 
Usually used for datasets where 0 diff --git a/src/transformers/models/bridgetower/image_processing_bridgetower.py b/src/transformers/models/bridgetower/image_processing_bridgetower.py index cad23d02893f..c44cf6a7ee3d 100644 --- a/src/transformers/models/bridgetower/image_processing_bridgetower.py +++ b/src/transformers/models/bridgetower/image_processing_bridgetower.py @@ -123,7 +123,7 @@ def get_resize_output_image_size( return new_height, new_width -class BridgeTowerImageProcessorKwargs(ImagesKwargs): +class BridgeTowerImageProcessorKwargs(ImagesKwargs, total=False): size_divisor: Optional[int] diff --git a/src/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py b/src/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py index 358d84ac6d7c..638b7549bfae 100644 --- a/src/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +++ b/src/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py @@ -33,7 +33,7 @@ from ...utils import TensorType, auto_docstring -class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs): +class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs, total=False): """ crop_to_patches (`bool`, *optional*, defaults to `False`): Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the diff --git a/src/transformers/models/cohere2_vision/modular_cohere2_vision.py b/src/transformers/models/cohere2_vision/modular_cohere2_vision.py index 997a6f2d638e..5df3e075ed20 100644 --- a/src/transformers/models/cohere2_vision/modular_cohere2_vision.py +++ b/src/transformers/models/cohere2_vision/modular_cohere2_vision.py @@ -303,7 +303,7 @@ def get_optimal_tiled_canvas( return best_grid -class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs): +class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs, total=False): """ crop_to_patches (`bool`, *optional*, defaults to `False`): Whether to crop the image to patches. 
Can be overridden by the `crop_to_patches` parameter in the diff --git a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py index 6df784585e9b..efc532d413c4 100644 --- a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py +++ b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py @@ -729,7 +729,7 @@ def compute_segments( return segmentation, segments -class ConditionalDetrImageProcessorKwargs(ImagesKwargs): +class ConditionalDetrImageProcessorKwargs(ImagesKwargs, total=False): r""" format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): Data format of the annotations. One of "coco_detection" or "coco_panoptic". diff --git a/src/transformers/models/convnext/image_processing_convnext.py b/src/transformers/models/convnext/image_processing_convnext.py index d2e180de2464..95e9cd91bd4a 100644 --- a/src/transformers/models/convnext/image_processing_convnext.py +++ b/src/transformers/models/convnext/image_processing_convnext.py @@ -50,7 +50,7 @@ logger = logging.get_logger(__name__) -class ConvNextImageProcessorKwargs(ImagesKwargs): +class ConvNextImageProcessorKwargs(ImagesKwargs, total=False): """ crop_pct (`float`, *optional*): Percentage of the image to crop. Only has an effect if size < 384. 
Can be diff --git a/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py b/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py index 88dcbff6c416..f3f4c1ac6e34 100644 --- a/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py +++ b/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py @@ -49,7 +49,7 @@ logger = logging.get_logger(__name__) -class DeepseekVLImageProcessorKwargs(ImagesKwargs): +class DeepseekVLImageProcessorKwargs(ImagesKwargs, total=False): r""" min_size (`int`, *optional*, defaults to 14): The minimum allowed size for the resized image. Ensures that neither the height nor width diff --git a/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py index 091e02742d89..7837cff2d33b 100644 --- a/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py @@ -50,7 +50,7 @@ logger = logging.get_logger(__name__) -class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs): +class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs, total=False): r""" min_size (`int`, *optional*, defaults to 14): The minimum allowed size for the resized image. 
Ensures that neither the height nor width diff --git a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py index 7a745a351c9e..343ba01ac874 100644 --- a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py @@ -429,7 +429,7 @@ def prepare_inputs_for_generation( return model_inputs -class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs): +class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs, total=False): r""" min_size (`int`, *optional*, defaults to 14): The minimum allowed size for the resized image. Ensures that neither the height nor width diff --git a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py index eabdb536ff70..9fd63b6340c6 100644 --- a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py +++ b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py @@ -82,7 +82,7 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -class DeformableDetrImageProcessorKwargs(ImagesKwargs): +class DeformableDetrImageProcessorKwargs(ImagesKwargs, total=False): r""" format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): Data format of the annotations. One of "coco_detection" or "coco_panoptic". 
diff --git a/src/transformers/models/detr/image_processing_detr.py b/src/transformers/models/detr/image_processing_detr.py index 02261fc2a129..1cd636a0cb72 100644 --- a/src/transformers/models/detr/image_processing_detr.py +++ b/src/transformers/models/detr/image_processing_detr.py @@ -84,7 +84,7 @@ SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) -class DetrImageProcessorKwargs(ImagesKwargs): +class DetrImageProcessorKwargs(ImagesKwargs, total=False): r""" format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): Data format of the annotations. One of "coco_detection" or "coco_panoptic". diff --git a/src/transformers/models/donut/image_processing_donut.py b/src/transformers/models/donut/image_processing_donut.py index 5af365099724..802b05c776fd 100644 --- a/src/transformers/models/donut/image_processing_donut.py +++ b/src/transformers/models/donut/image_processing_donut.py @@ -52,7 +52,7 @@ import PIL -class DonutImageProcessorKwargs(ImagesKwargs): +class DonutImageProcessorKwargs(ImagesKwargs, total=False): """ do_thumbnail (`bool`, *optional*, defaults to `self.do_thumbnail`): Whether to resize the image using thumbnail method. diff --git a/src/transformers/models/dpt/image_processing_dpt.py b/src/transformers/models/dpt/image_processing_dpt.py index 3ba5a6e30c21..71e930a7bfcf 100644 --- a/src/transformers/models/dpt/image_processing_dpt.py +++ b/src/transformers/models/dpt/image_processing_dpt.py @@ -64,7 +64,7 @@ logger = logging.get_logger(__name__) -class DPTImageProcessorKwargs(ImagesKwargs): +class DPTImageProcessorKwargs(ImagesKwargs, total=False): """ ensure_multiple_of (`int`, *optional*, defaults to 1): If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. 
Can be overridden diff --git a/src/transformers/models/efficientloftr/image_processing_efficientloftr.py b/src/transformers/models/efficientloftr/image_processing_efficientloftr.py index d1beabb6c2b9..a5c06be89e98 100644 --- a/src/transformers/models/efficientloftr/image_processing_efficientloftr.py +++ b/src/transformers/models/efficientloftr/image_processing_efficientloftr.py @@ -50,7 +50,7 @@ logger = logging.get_logger(__name__) -class EfficientLoFTRImageProcessorKwargs(ImagesKwargs): +class EfficientLoFTRImageProcessorKwargs(ImagesKwargs, total=False): r""" do_grayscale (`bool`, *optional*, defaults to `True`): Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method. diff --git a/src/transformers/models/efficientnet/image_processing_efficientnet.py b/src/transformers/models/efficientnet/image_processing_efficientnet.py index 4868f573a517..0a3a7542ff67 100644 --- a/src/transformers/models/efficientnet/image_processing_efficientnet.py +++ b/src/transformers/models/efficientnet/image_processing_efficientnet.py @@ -44,7 +44,7 @@ logger = logging.get_logger(__name__) -class EfficientNetImageProcessorKwargs(ImagesKwargs): +class EfficientNetImageProcessorKwargs(ImagesKwargs, total=False): """ rescale_offset (`bool`, *optional*, defaults to `self.rescale_offset`): Whether to rescale the image between [-max_range/2, scale_range/2] instead of [0, scale_range]. 
diff --git a/src/transformers/models/emu3/image_processing_emu3.py b/src/transformers/models/emu3/image_processing_emu3.py index fca5316a3fca..ec270fef2c87 100644 --- a/src/transformers/models/emu3/image_processing_emu3.py +++ b/src/transformers/models/emu3/image_processing_emu3.py @@ -47,7 +47,7 @@ logger = logging.get_logger(__name__) -class Emu3ImageProcessorKwargs(ImagesKwargs): +class Emu3ImageProcessorKwargs(ImagesKwargs, total=False): ratio: Optional[str] image_area: Optional[int] diff --git a/src/transformers/models/eomt/image_processing_eomt.py b/src/transformers/models/eomt/image_processing_eomt.py index 55e57e9b2343..b422ca3020d9 100644 --- a/src/transformers/models/eomt/image_processing_eomt.py +++ b/src/transformers/models/eomt/image_processing_eomt.py @@ -55,7 +55,7 @@ import torch.nn.functional as F -class EomtImageProcessorKwargs(ImagesKwargs): +class EomtImageProcessorKwargs(ImagesKwargs, total=False): """ do_split_image (`bool`, *optional*, defaults to `False`): Whether to split the input images into overlapping patches for semantic segmentation. If set to `True`, the diff --git a/src/transformers/models/flava/image_processing_flava.py b/src/transformers/models/flava/image_processing_flava.py index 3c19a2405169..2005011dcb2b 100644 --- a/src/transformers/models/flava/image_processing_flava.py +++ b/src/transformers/models/flava/image_processing_flava.py @@ -57,7 +57,7 @@ LOGIT_LAPLACE_EPS: float = 0.1 -class FlavaImageProcessorKwargs(ImagesKwargs): +class FlavaImageProcessorKwargs(ImagesKwargs, total=False): """ return_image_mask (`bool`, *optional*, defaults to `False`): Whether to return the image mask. Can be overridden by the `return_image_mask` parameter in `preprocess`. 
diff --git a/src/transformers/models/gemma3/image_processing_gemma3.py b/src/transformers/models/gemma3/image_processing_gemma3.py index 5206a13a04a3..4b32a1f31a05 100644 --- a/src/transformers/models/gemma3/image_processing_gemma3.py +++ b/src/transformers/models/gemma3/image_processing_gemma3.py @@ -51,7 +51,7 @@ import PIL -class Gemma3ImageProcessorKwargs(ImagesKwargs): +class Gemma3ImageProcessorKwargs(ImagesKwargs, total=False): """ do_pan_and_scan (`bool`, *optional*): Whether to apply `pan_and_scan` to images. diff --git a/src/transformers/models/glm4v/image_processing_glm4v.py b/src/transformers/models/glm4v/image_processing_glm4v.py index 13f4472e61f3..e58d295ca465 100644 --- a/src/transformers/models/glm4v/image_processing_glm4v.py +++ b/src/transformers/models/glm4v/image_processing_glm4v.py @@ -47,7 +47,7 @@ logger = logging.get_logger(__name__) -class Glm4vImageProcessorKwargs(ImagesKwargs): +class Glm4vImageProcessorKwargs(ImagesKwargs, total=False): """ patch_size (`int`, *optional*, defaults to 14): The spatial patch size of the vision encoder. 
diff --git a/src/transformers/models/glm4v/video_processing_glm4v.py b/src/transformers/models/glm4v/video_processing_glm4v.py index 8324ad482baa..95ea8160c606 100644 --- a/src/transformers/models/glm4v/video_processing_glm4v.py +++ b/src/transformers/models/glm4v/video_processing_glm4v.py @@ -36,7 +36,7 @@ from .image_processing_glm4v import smart_resize -class Glm4vVideoProcessorInitKwargs(VideosKwargs): +class Glm4vVideoProcessorInitKwargs(VideosKwargs, total=False): max_image_size: Optional[dict[str, int]] patch_size: Optional[int] temporal_patch_size: Optional[int] diff --git a/src/transformers/models/got_ocr2/image_processing_got_ocr2.py b/src/transformers/models/got_ocr2/image_processing_got_ocr2.py index 3424020c65b3..5882f22b1bd0 100644 --- a/src/transformers/models/got_ocr2/image_processing_got_ocr2.py +++ b/src/transformers/models/got_ocr2/image_processing_got_ocr2.py @@ -49,7 +49,7 @@ logger = logging.get_logger(__name__) -class GotOcr2ImageProcessorKwargs(ImagesKwargs): +class GotOcr2ImageProcessorKwargs(ImagesKwargs, total=False): """ crop_to_patches (`bool`, *optional*, defaults to `False`): Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py index c099d44e3d58..f556b2d295c3 100644 --- a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py @@ -93,7 +93,7 @@ class AnnotationFormat(ExplicitEnum): SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) -class GroundingDinoImageProcessorKwargs(ImagesKwargs): +class GroundingDinoImageProcessorKwargs(ImagesKwargs, total=False): r""" format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): Data format of the annotations. 
One of "coco_detection" or "coco_panoptic". diff --git a/src/transformers/models/idefics/image_processing_idefics.py b/src/transformers/models/idefics/image_processing_idefics.py index 7fda46e3a990..6a53e4d9b7d4 100644 --- a/src/transformers/models/idefics/image_processing_idefics.py +++ b/src/transformers/models/idefics/image_processing_idefics.py @@ -36,7 +36,7 @@ IDEFICS_STANDARD_STD = [0.26862954, 0.26130258, 0.27577711] -class IdeficsImageProcessorKwargs(ImagesKwargs): +class IdeficsImageProcessorKwargs(ImagesKwargs, total=False): """ transform (`Callable`, *optional*): A custom transform function that accepts a single image can be passed for training. For example, diff --git a/src/transformers/models/idefics2/image_processing_idefics2.py b/src/transformers/models/idefics2/image_processing_idefics2.py index b9b741a9704b..1baec594bc90 100644 --- a/src/transformers/models/idefics2/image_processing_idefics2.py +++ b/src/transformers/models/idefics2/image_processing_idefics2.py @@ -47,7 +47,7 @@ from PIL import Image -class Idefics2ImageProcessorKwargs(ImagesKwargs): +class Idefics2ImageProcessorKwargs(ImagesKwargs, total=False): """ do_image_splitting (`bool`, *optional*, defaults to `False`): Whether to split the image into a sequence 4 equal sub-images concatenated with the original image. diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py index f098a9f54dc1..d53a75596fea 100644 --- a/src/transformers/models/idefics3/image_processing_idefics3.py +++ b/src/transformers/models/idefics3/image_processing_idefics3.py @@ -48,7 +48,7 @@ from PIL import Image -class Idefics3ImageProcessorKwargs(ImagesKwargs): +class Idefics3ImageProcessorKwargs(ImagesKwargs, total=False): """ do_image_splitting (`bool`, *optional*, defaults to `True`): Whether to split the image into sub-images concatenated with the original image. 
They are split into patches diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt.py b/src/transformers/models/imagegpt/image_processing_imagegpt.py index 8f79cd58ec5f..f5a1682d9be3 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt.py @@ -45,7 +45,7 @@ logger = logging.get_logger(__name__) -class ImageGPTImageProcessorKwargs(ImagesKwargs): +class ImageGPTImageProcessorKwargs(ImagesKwargs, total=False): """ clusters (`np.ndarray` or `list[list[int]]` or `torch.Tensor`, *optional*): The color clusters to use, of shape `(n_clusters, 3)` when color quantizing. Can be overridden by `clusters` diff --git a/src/transformers/models/instructblipvideo/video_processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/video_processing_instructblipvideo.py index d2fe3cc7f343..d89d9069d495 100644 --- a/src/transformers/models/instructblipvideo/video_processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/video_processing_instructblipvideo.py @@ -30,7 +30,7 @@ from ...video_utils import group_videos_by_shape, reorder_videos -class InstructBlipVideoVideoProcessorInitKwargs(VideosKwargs): ... +class InstructBlipVideoVideoProcessorInitKwargs(VideosKwargs, total=False): ... 
class InstructBlipVideoVideoProcessor(BaseVideoProcessor): diff --git a/src/transformers/models/internvl/video_processing_internvl.py b/src/transformers/models/internvl/video_processing_internvl.py index 0e3f0469fe5e..703c40b94f0c 100644 --- a/src/transformers/models/internvl/video_processing_internvl.py +++ b/src/transformers/models/internvl/video_processing_internvl.py @@ -27,7 +27,7 @@ from ...video_utils import VideoMetadata, group_videos_by_shape, reorder_videos -class InternVLVideoProcessorInitKwargs(VideosKwargs): +class InternVLVideoProcessorInitKwargs(VideosKwargs, total=False): initial_shift: Optional[Union[bool, float, int]] diff --git a/src/transformers/models/janus/image_processing_janus.py b/src/transformers/models/janus/image_processing_janus.py index b342966ece55..6b83ecf8eb5c 100644 --- a/src/transformers/models/janus/image_processing_janus.py +++ b/src/transformers/models/janus/image_processing_janus.py @@ -51,7 +51,7 @@ logger = logging.get_logger(__name__) -class JanusImageProcessorKwargs(ImagesKwargs): +class JanusImageProcessorKwargs(ImagesKwargs, total=False): r""" min_size (`int`, *optional*, defaults to 14): The minimum allowed size for the resized image. Ensures that neither the height nor width diff --git a/src/transformers/models/janus/modular_janus.py b/src/transformers/models/janus/modular_janus.py index 307bbbd38890..36fe7abae438 100644 --- a/src/transformers/models/janus/modular_janus.py +++ b/src/transformers/models/janus/modular_janus.py @@ -1289,7 +1289,7 @@ def generate( return generated_tokens -class JanusImageProcessorKwargs(ImagesKwargs): +class JanusImageProcessorKwargs(ImagesKwargs, total=False): r""" min_size (`int`, *optional*, defaults to 14): The minimum allowed size for the resized image. 
Ensures that neither the height nor width diff --git a/src/transformers/models/kosmos2_5/image_processing_kosmos2_5.py b/src/transformers/models/kosmos2_5/image_processing_kosmos2_5.py index 5f337e4b04c9..ca80cb978ec9 100644 --- a/src/transformers/models/kosmos2_5/image_processing_kosmos2_5.py +++ b/src/transformers/models/kosmos2_5/image_processing_kosmos2_5.py @@ -46,7 +46,7 @@ DEFAULT_FONT_PATH = "ybelkada/fonts" -class Kosmos2_5ImageProcessorKwargs(ImagesKwargs): +class Kosmos2_5ImageProcessorKwargs(ImagesKwargs, total=False): r""" patch_size (`Dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`): The patch size to use for the image. According to Kosmos2_5 paper and code, the patch size is 16x16. diff --git a/src/transformers/models/layoutlmv2/image_processing_layoutlmv2.py b/src/transformers/models/layoutlmv2/image_processing_layoutlmv2.py index d5a7e95537c5..5e9289c2701b 100644 --- a/src/transformers/models/layoutlmv2/image_processing_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/image_processing_layoutlmv2.py @@ -52,7 +52,7 @@ logger = logging.get_logger(__name__) -class LayoutLMv2ImageProcessorKwargs(ImagesKwargs): +class LayoutLMv2ImageProcessorKwargs(ImagesKwargs, total=False): r""" apply_ocr (`bool`, *optional*, defaults to `True`): Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. 
Can be overridden by diff --git a/src/transformers/models/layoutlmv3/image_processing_layoutlmv3.py b/src/transformers/models/layoutlmv3/image_processing_layoutlmv3.py index b9273dc75cad..9ba4f5507fc1 100644 --- a/src/transformers/models/layoutlmv3/image_processing_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/image_processing_layoutlmv3.py @@ -56,7 +56,7 @@ logger = logging.get_logger(__name__) -class LayoutLMv3ImageProcessorKwargs(ImagesKwargs): +class LayoutLMv3ImageProcessorKwargs(ImagesKwargs, total=False): r""" apply_ocr (`bool`, *optional*, defaults to `True`): Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by diff --git a/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py b/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py index ad99504fcad6..3d16a4d0a273 100755 --- a/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +++ b/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py @@ -169,7 +169,7 @@ def pad_along_first_dim( return images, pixel_mask -class Lfm2VlImageProcessorKwargs(ImagesKwargs): +class Lfm2VlImageProcessorKwargs(ImagesKwargs, total=False): """ downsample_factor (`int`, *optional*, defaults to `2`): The downsampling factor for images used when resizing the image. diff --git a/src/transformers/models/llama4/image_processing_llama4_fast.py b/src/transformers/models/llama4/image_processing_llama4_fast.py index e2678f556d02..42b2221d6a18 100644 --- a/src/transformers/models/llama4/image_processing_llama4_fast.py +++ b/src/transformers/models/llama4/image_processing_llama4_fast.py @@ -308,7 +308,7 @@ def get_best_fit( return optimal_canvas -class Llama4ImageProcessorKwargs(ImagesKwargs): +class Llama4ImageProcessorKwargs(ImagesKwargs, total=False): r""" max_patches (`int`, *optional*, defaults to 16): The maximum number of patches to be extracted from the image. 
diff --git a/src/transformers/models/llava_next/image_processing_llava_next.py b/src/transformers/models/llava_next/image_processing_llava_next.py index 56ebc10f391d..26ca94dad6e9 100644 --- a/src/transformers/models/llava_next/image_processing_llava_next.py +++ b/src/transformers/models/llava_next/image_processing_llava_next.py @@ -59,7 +59,7 @@ from PIL import Image -class LlavaNextImageProcessorKwargs(ImagesKwargs): +class LlavaNextImageProcessorKwargs(ImagesKwargs, total=False): r""" image_grid_pinpoints (`list[list[int]]`, *optional*): A list of possible resolutions to use for processing high resolution images. The best resolution is selected diff --git a/src/transformers/models/llava_onevision/image_processing_llava_onevision.py b/src/transformers/models/llava_onevision/image_processing_llava_onevision.py index 119df9550a2a..3654edcdbf71 100644 --- a/src/transformers/models/llava_onevision/image_processing_llava_onevision.py +++ b/src/transformers/models/llava_onevision/image_processing_llava_onevision.py @@ -58,7 +58,7 @@ from PIL import Image -class LlavaOnevisionImageProcessorKwargs(ImagesKwargs): +class LlavaOnevisionImageProcessorKwargs(ImagesKwargs, total=False): r""" image_grid_pinpoints (`list[list[int]]`, *optional*): A list of possible resolutions to use for processing high resolution images. The best resolution is selected diff --git a/src/transformers/models/mask2former/image_processing_mask2former.py b/src/transformers/models/mask2former/image_processing_mask2former.py index ce79107f05b3..235dbe8039f1 100644 --- a/src/transformers/models/mask2former/image_processing_mask2former.py +++ b/src/transformers/models/mask2former/image_processing_mask2former.py @@ -61,7 +61,7 @@ from torch import nn -class Mask2FormerImageProcessorKwargs(ImagesKwargs): +class Mask2FormerImageProcessorKwargs(ImagesKwargs, total=False): r""" ignore_index (`int`, *optional*): Label to be assigned to background pixels in segmentation maps. 
If provided, segmentation map pixels diff --git a/src/transformers/models/maskformer/image_processing_maskformer.py b/src/transformers/models/maskformer/image_processing_maskformer.py index 60e703405605..4046fcafb07a 100644 --- a/src/transformers/models/maskformer/image_processing_maskformer.py +++ b/src/transformers/models/maskformer/image_processing_maskformer.py @@ -67,7 +67,7 @@ from torch import nn -class MaskFormerImageProcessorKwargs(ImagesKwargs): +class MaskFormerImageProcessorKwargs(ImagesKwargs, total=False): r""" ignore_index (`int`, *optional*): Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels diff --git a/src/transformers/models/mllama/image_processing_mllama.py b/src/transformers/models/mllama/image_processing_mllama.py index cd79f7de3121..50579703b905 100644 --- a/src/transformers/models/mllama/image_processing_mllama.py +++ b/src/transformers/models/mllama/image_processing_mllama.py @@ -50,7 +50,7 @@ logger = logging.get_logger(__name__) -class MllamaImageProcessorKwargs(ImagesKwargs): +class MllamaImageProcessorKwargs(ImagesKwargs, total=False): """ max_image_tiles (`int`, *optional*): The maximum number of tiles allowed. diff --git a/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py b/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py index e8dfe992544a..ad4c6937b76a 100644 --- a/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py +++ b/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py @@ -51,7 +51,7 @@ logger = logging.get_logger(__name__) -class MobileNetV2ImageProcessorKwargs(ImagesKwargs): +class MobileNetV2ImageProcessorKwargs(ImagesKwargs, total=False): """ do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): Whether or not to reduce all label values of segmentation maps by 1. 
Usually used for datasets where 0 diff --git a/src/transformers/models/mobilevit/image_processing_mobilevit.py b/src/transformers/models/mobilevit/image_processing_mobilevit.py index 576ef9f449dc..8a914608295f 100644 --- a/src/transformers/models/mobilevit/image_processing_mobilevit.py +++ b/src/transformers/models/mobilevit/image_processing_mobilevit.py @@ -53,7 +53,7 @@ logger = logging.get_logger(__name__) -class MobileVitImageProcessorKwargs(ImagesKwargs): +class MobileVitImageProcessorKwargs(ImagesKwargs, total=False): """ do_flip_channel_order (`bool`, *optional*, defaults to `self.do_flip_channel_order`): Whether to flip the color channels from RGB to BGR or vice versa. diff --git a/src/transformers/models/nougat/image_processing_nougat.py b/src/transformers/models/nougat/image_processing_nougat.py index 0a5c445645e0..35db47bdf0c9 100644 --- a/src/transformers/models/nougat/image_processing_nougat.py +++ b/src/transformers/models/nougat/image_processing_nougat.py @@ -52,7 +52,7 @@ import PIL -class NougatImageProcessorKwargs(ImagesKwargs): +class NougatImageProcessorKwargs(ImagesKwargs, total=False): r""" do_crop_margin (`bool`, *optional*, defaults to `True`): Whether to crop the image margins. diff --git a/src/transformers/models/oneformer/image_processing_oneformer.py b/src/transformers/models/oneformer/image_processing_oneformer.py index 86ce8abf084e..058c655559a8 100644 --- a/src/transformers/models/oneformer/image_processing_oneformer.py +++ b/src/transformers/models/oneformer/image_processing_oneformer.py @@ -64,7 +64,7 @@ from torch import nn -class OneFormerImageProcessorKwargs(ImagesKwargs): +class OneFormerImageProcessorKwargs(ImagesKwargs, total=False): r""" repo_path (`str`, *optional*, defaults to `shi-labs/oneformer_demo`): Path to a local directory or Hugging Face Hub repository containing model metadata. 
diff --git a/src/transformers/models/ovis2/image_processing_ovis2.py b/src/transformers/models/ovis2/image_processing_ovis2.py index 2bc883f95e73..ab4ae57f0cb8 100644 --- a/src/transformers/models/ovis2/image_processing_ovis2.py +++ b/src/transformers/models/ovis2/image_processing_ovis2.py @@ -44,7 +44,7 @@ logger = logging.get_logger(__name__) -class Ovis2ImageProcessorKwargs(ImagesKwargs): +class Ovis2ImageProcessorKwargs(ImagesKwargs, total=False): """ crop_to_patches (`bool`, *optional*, defaults to `False`): Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py index ec721499917c..8409be51e034 100644 --- a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py +++ b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py @@ -42,7 +42,7 @@ ) -class PerceptionLMImageProcessorKwargs(ImagesKwargs): +class PerceptionLMImageProcessorKwargs(ImagesKwargs, total=False): r""" vision_input_type (`str`, *optional*, defaults to `"thumb+tile"`): Vision processing strategy. `"thumb+tile"` uses both thumbnails and multiple tiles for diff --git a/src/transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py b/src/transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py index 33b10915354f..6dcbf61f38f6 100644 --- a/src/transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +++ b/src/transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py @@ -35,7 +35,7 @@ logger = logging.get_logger(__name__) -class Phi4MultimodalImageProcessorKwargs(ImagesKwargs): +class Phi4MultimodalImageProcessorKwargs(ImagesKwargs, total=False): r""" patch_size (`int`, *optional*): The size of the patch. 
diff --git a/src/transformers/models/pix2struct/image_processing_pix2struct.py b/src/transformers/models/pix2struct/image_processing_pix2struct.py index e0c630369029..66cc7cd0b04a 100644 --- a/src/transformers/models/pix2struct/image_processing_pix2struct.py +++ b/src/transformers/models/pix2struct/image_processing_pix2struct.py @@ -49,7 +49,7 @@ DEFAULT_FONT_PATH = "ybelkada/fonts" -class Pix2StructImageProcessorKwargs(ImagesKwargs): +class Pix2StructImageProcessorKwargs(ImagesKwargs, total=False): """ max_patches (`int`, *optional*): Maximum number of patches to extract. diff --git a/src/transformers/models/pixtral/image_processing_pixtral.py b/src/transformers/models/pixtral/image_processing_pixtral.py index 864e76eb0e57..387c82f4e0a0 100644 --- a/src/transformers/models/pixtral/image_processing_pixtral.py +++ b/src/transformers/models/pixtral/image_processing_pixtral.py @@ -50,7 +50,7 @@ import PIL -class PixtralImageProcessorKwargs(ImagesKwargs): +class PixtralImageProcessorKwargs(ImagesKwargs, total=False): """ patch_size (`Union[dict[str, int], int]` *optional*, defaults to `{"height": 16, "width": 16}`): Size of the patches in the model, used to calculate the output image size. Can be overridden by `patch_size` in the `preprocess` method. diff --git a/src/transformers/models/poolformer/image_processing_poolformer.py b/src/transformers/models/poolformer/image_processing_poolformer.py index 7d03f8281285..c7bb38c4340c 100644 --- a/src/transformers/models/poolformer/image_processing_poolformer.py +++ b/src/transformers/models/poolformer/image_processing_poolformer.py @@ -48,7 +48,7 @@ logger = logging.get_logger(__name__) -class PoolFormerImageProcessorKwargs(ImagesKwargs): +class PoolFormerImageProcessorKwargs(ImagesKwargs, total=False): r""" crop_pct (`float`, *optional*, defaults to `self.crop_pct`): Percentage of the image to crop. Only has an effect if `do_resize` is set to `True`. 
diff --git a/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py b/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py index b8220a30fa42..cb8c0e37af5f 100644 --- a/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py +++ b/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py @@ -54,7 +54,7 @@ logger = logging.get_logger(__name__) -class PromptDepthAnythingImageProcessorKwargs(ImagesKwargs): +class PromptDepthAnythingImageProcessorKwargs(ImagesKwargs, total=False): r""" keep_aspect_ratio (`bool`, *optional*): If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. diff --git a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py index fb0ec1c89420..f522f63beb33 100644 --- a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py @@ -32,7 +32,7 @@ # Redefine kwargs for videos because Qwen-Omni uses some kwargs for processing omni # and does not use them in video processor class -class Qwen2_5_OmniVideosKwargs(VideosKwargs): +class Qwen2_5_OmniVideosKwargs(VideosKwargs, total=False): min_pixels: Optional[int] max_pixels: Optional[int] patch_size: Optional[int] diff --git a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py index fe218bd05b9d..82b7bce43fe9 100644 --- a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py @@ -52,7 +52,7 @@ logger = logging.get_logger(__name__) -class Qwen2VLImageProcessorKwargs(ImagesKwargs): +class Qwen2VLImageProcessorKwargs(ImagesKwargs, total=False): r""" min_pixels (`int`, *optional*, defaults to `56 * 56`): The min pixels of the image to resize 
the image. diff --git a/src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py b/src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py index 84bcd827f02e..2fc8bf1ac5b3 100644 --- a/src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py @@ -41,7 +41,7 @@ from .image_processing_qwen2_vl import smart_resize -class Qwen2VLVideoProcessorInitKwargs(VideosKwargs): +class Qwen2VLVideoProcessorInitKwargs(VideosKwargs, total=False): min_pixels: Optional[int] max_pixels: Optional[int] patch_size: Optional[int] diff --git a/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py index 70988b3b77c5..ba54b3a0c1f6 100644 --- a/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py @@ -34,7 +34,7 @@ # Redefine kwargs for videos because Qwen-Omni uses some kwargs for processing omni # and does not use them in video processor class -class Qwen3OmniMoeVideosKwargs(VideosKwargs): +class Qwen3OmniMoeVideosKwargs(VideosKwargs, total=False): min_pixels: Optional[int] max_pixels: Optional[int] patch_size: Optional[int] diff --git a/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py b/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py index c4648788c9dc..3bb06d7f2b08 100644 --- a/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py @@ -64,7 +64,7 @@ def smart_resize( return h_bar, w_bar -class Qwen3VLVideoProcessorInitKwargs(VideosKwargs): +class Qwen3VLVideoProcessorInitKwargs(VideosKwargs, total=False): patch_size: Optional[int] temporal_patch_size: Optional[int] merge_size: Optional[int] diff --git a/src/transformers/models/rt_detr/image_processing_rt_detr.py b/src/transformers/models/rt_detr/image_processing_rt_detr.py index 
b3c77a8920cd..5e91e3c4fc01 100644 --- a/src/transformers/models/rt_detr/image_processing_rt_detr.py +++ b/src/transformers/models/rt_detr/image_processing_rt_detr.py @@ -68,7 +68,7 @@ SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION,) -class RTDetrImageProcessorKwargs(ImagesKwargs): +class RTDetrImageProcessorKwargs(ImagesKwargs, total=False): r""" format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): Data format of the annotations. One of "coco_detection" or "coco_panoptic". diff --git a/src/transformers/models/sam/image_processing_sam.py b/src/transformers/models/sam/image_processing_sam.py index e9da260a6e9c..3ba27f63d993 100644 --- a/src/transformers/models/sam/image_processing_sam.py +++ b/src/transformers/models/sam/image_processing_sam.py @@ -58,7 +58,7 @@ logger = logging.get_logger(__name__) -class SamImageProcessorKwargs(ImagesKwargs): +class SamImageProcessorKwargs(ImagesKwargs, total=False): r""" mask_size (`dict[str, int]`, *optional*): The size `{"longest_edge": int}` to resize the segmentation maps to. 
diff --git a/src/transformers/models/sam/processing_sam.py b/src/transformers/models/sam/processing_sam.py index bc82daf2034d..18d812fc6825 100644 --- a/src/transformers/models/sam/processing_sam.py +++ b/src/transformers/models/sam/processing_sam.py @@ -31,7 +31,7 @@ import torch -class SamImagesKwargs(ImagesKwargs): +class SamImagesKwargs(ImagesKwargs, total=False): segmentation_maps: Optional[ImageInput] input_points: Optional[list[list[float]]] input_labels: Optional[list[list[int]]] diff --git a/src/transformers/models/sam2/image_processing_sam2_fast.py b/src/transformers/models/sam2/image_processing_sam2_fast.py index c468f6400d54..b00fba952973 100644 --- a/src/transformers/models/sam2/image_processing_sam2_fast.py +++ b/src/transformers/models/sam2/image_processing_sam2_fast.py @@ -43,7 +43,7 @@ from ...utils import TensorType, auto_docstring -class Sam2FastImageProcessorKwargs(ImagesKwargs): +class Sam2FastImageProcessorKwargs(ImagesKwargs, total=False): r""" mask_size (`dict[str, int]`, *optional*): The size `{"height": int, "width": int}` to resize the segmentation maps to. diff --git a/src/transformers/models/sam2/modular_sam2.py b/src/transformers/models/sam2/modular_sam2.py index 8fcfe36a759e..eac3cc232c1a 100644 --- a/src/transformers/models/sam2/modular_sam2.py +++ b/src/transformers/models/sam2/modular_sam2.py @@ -70,7 +70,7 @@ logger = logging.get_logger(__name__) -class Sam2FastImageProcessorKwargs(ImagesKwargs): +class Sam2FastImageProcessorKwargs(ImagesKwargs, total=False): r""" mask_size (`dict[str, int]`, *optional*): The size `{"height": int, "width": int}` to resize the segmentation maps to. 
diff --git a/src/transformers/models/sam_hq/processing_samhq.py b/src/transformers/models/sam_hq/processing_samhq.py index f2852b8623c4..8e1945971ebc 100644 --- a/src/transformers/models/sam_hq/processing_samhq.py +++ b/src/transformers/models/sam_hq/processing_samhq.py @@ -31,7 +31,7 @@ import torch -class SamHQImagesKwargs(ImagesKwargs): +class SamHQImagesKwargs(ImagesKwargs, total=False): segmentation_maps: Optional[ImageInput] input_points: Optional[list[list[float]]] input_labels: Optional[list[list[int]]] diff --git a/src/transformers/models/segformer/image_processing_segformer.py b/src/transformers/models/segformer/image_processing_segformer.py index ce9ace8115a4..bedb4ff54651 100644 --- a/src/transformers/models/segformer/image_processing_segformer.py +++ b/src/transformers/models/segformer/image_processing_segformer.py @@ -55,7 +55,7 @@ logger = logging.get_logger(__name__) -class SegformerImageProcessorKwargs(ImagesKwargs): +class SegformerImageProcessorKwargs(ImagesKwargs, total=False): r""" do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 diff --git a/src/transformers/models/siglip2/image_processing_siglip2.py b/src/transformers/models/siglip2/image_processing_siglip2.py index caff1bce0bc9..d71b66464223 100644 --- a/src/transformers/models/siglip2/image_processing_siglip2.py +++ b/src/transformers/models/siglip2/image_processing_siglip2.py @@ -48,7 +48,7 @@ from PIL import Image -class Siglip2ImageProcessorKwargs(ImagesKwargs): +class Siglip2ImageProcessorKwargs(ImagesKwargs, total=False): """ patch_size (`int`, *optional*, defaults to 16): The size (resolution) of each patch the image will be split to. 
diff --git a/src/transformers/models/smolvlm/image_processing_smolvlm.py b/src/transformers/models/smolvlm/image_processing_smolvlm.py index e231c1ec6b07..c12c08182a94 100644 --- a/src/transformers/models/smolvlm/image_processing_smolvlm.py +++ b/src/transformers/models/smolvlm/image_processing_smolvlm.py @@ -53,7 +53,7 @@ logger = logging.get_logger(__name__) -class SmolVLMImageProcessorKwargs(ImagesKwargs): +class SmolVLMImageProcessorKwargs(ImagesKwargs, total=False): """ do_image_splitting (`bool`, *optional*, defaults to `True`): Whether to split the image into sub-images concatenated with the original image. They are split into patches diff --git a/src/transformers/models/smolvlm/video_processing_smolvlm.py b/src/transformers/models/smolvlm/video_processing_smolvlm.py index ce73dfb4a82e..d8cecb6c0c5c 100644 --- a/src/transformers/models/smolvlm/video_processing_smolvlm.py +++ b/src/transformers/models/smolvlm/video_processing_smolvlm.py @@ -90,7 +90,7 @@ def get_resize_output_image_size( return height, width -class SmolVLMVideoProcessorInitKwargs(VideosKwargs): +class SmolVLMVideoProcessorInitKwargs(VideosKwargs, total=False): max_image_size: Optional[dict[str, int]] diff --git a/src/transformers/models/superpoint/image_processing_superpoint.py b/src/transformers/models/superpoint/image_processing_superpoint.py index 633d9b0b16b9..9c810c450ad7 100644 --- a/src/transformers/models/superpoint/image_processing_superpoint.py +++ b/src/transformers/models/superpoint/image_processing_superpoint.py @@ -46,7 +46,7 @@ logger = logging.get_logger(__name__) -class SuperPointImageProcessorKwargs(ImagesKwargs): +class SuperPointImageProcessorKwargs(ImagesKwargs, total=False): r""" do_grayscale (`bool`, *optional*, defaults to `True`): Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method. 
diff --git a/src/transformers/models/swin2sr/image_processing_swin2sr.py b/src/transformers/models/swin2sr/image_processing_swin2sr.py index 018a1bf0f4df..d9de6c684959 100644 --- a/src/transformers/models/swin2sr/image_processing_swin2sr.py +++ b/src/transformers/models/swin2sr/image_processing_swin2sr.py @@ -38,7 +38,7 @@ logger = logging.get_logger(__name__) -class Swin2SRImageProcessorKwargs(ImagesKwargs): +class Swin2SRImageProcessorKwargs(ImagesKwargs, total=False): size_divisor: Optional[int] diff --git a/src/transformers/models/textnet/image_processing_textnet.py b/src/transformers/models/textnet/image_processing_textnet.py index 1a4d68522205..e5e127d987e0 100644 --- a/src/transformers/models/textnet/image_processing_textnet.py +++ b/src/transformers/models/textnet/image_processing_textnet.py @@ -49,7 +49,7 @@ import PIL -class TextNetImageProcessorKwargs(ImagesKwargs): +class TextNetImageProcessorKwargs(ImagesKwargs, total=False): size_divisor: Optional[int] diff --git a/src/transformers/models/tvp/image_processing_tvp.py b/src/transformers/models/tvp/image_processing_tvp.py index 67c1ffe4fae8..42834287e110 100644 --- a/src/transformers/models/tvp/image_processing_tvp.py +++ b/src/transformers/models/tvp/image_processing_tvp.py @@ -50,7 +50,7 @@ logger = logging.get_logger(__name__) -class TvpImageProcessorKwargs(ImagesKwargs): +class TvpImageProcessorKwargs(ImagesKwargs, total=False): r""" do_flip_channel_order (`bool`, *optional*): Whether to flip the channel order of the image from RGB to BGR. 
diff --git a/src/transformers/models/vilt/image_processing_vilt.py b/src/transformers/models/vilt/image_processing_vilt.py index bb29e1d1ee30..fadb5302d4ee 100644 --- a/src/transformers/models/vilt/image_processing_vilt.py +++ b/src/transformers/models/vilt/image_processing_vilt.py @@ -47,7 +47,7 @@ logger = logging.get_logger(__name__) -class ViltImageProcessorKwargs(ImagesKwargs): +class ViltImageProcessorKwargs(ImagesKwargs, total=False): size_divisor: Optional[int] diff --git a/src/transformers/models/vitmatte/image_processing_vitmatte.py b/src/transformers/models/vitmatte/image_processing_vitmatte.py index 95933c053ce5..eb994b641962 100644 --- a/src/transformers/models/vitmatte/image_processing_vitmatte.py +++ b/src/transformers/models/vitmatte/image_processing_vitmatte.py @@ -41,7 +41,7 @@ logger = logging.get_logger(__name__) -class VitMatteImageProcessorKwargs(ImagesKwargs): +class VitMatteImageProcessorKwargs(ImagesKwargs, total=False): size_divisor: Optional[int] diff --git a/src/transformers/models/yolos/image_processing_yolos.py b/src/transformers/models/yolos/image_processing_yolos.py index 21aac76adac8..19fda87897ae 100644 --- a/src/transformers/models/yolos/image_processing_yolos.py +++ b/src/transformers/models/yolos/image_processing_yolos.py @@ -81,7 +81,7 @@ SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) -class YolosImageProcessorKwargs(ImagesKwargs): +class YolosImageProcessorKwargs(ImagesKwargs, total=False): r""" format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): Data format of the annotations. One of "coco_detection" or "coco_panoptic". 
diff --git a/src/transformers/models/zoedepth/image_processing_zoedepth.py b/src/transformers/models/zoedepth/image_processing_zoedepth.py index e8ad44dd76c3..3fdf414bc20c 100644 --- a/src/transformers/models/zoedepth/image_processing_zoedepth.py +++ b/src/transformers/models/zoedepth/image_processing_zoedepth.py @@ -62,7 +62,7 @@ logger = logging.get_logger(__name__) -class ZoeDepthImageProcessorKwargs(ImagesKwargs): +class ZoeDepthImageProcessorKwargs(ImagesKwargs, total=False): """ keep_aspect_ratio (`bool`, *optional*, defaults to `True`): If `True`, the image is resized by choosing the smaller of the height and width scaling factors and using it diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 17af2698ae56..0d49db5fa229 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -30,6 +30,7 @@ import numpy as np import typing_extensions +from huggingface_hub.dataclasses import validate_typed_dict from huggingface_hub.errors import EntryNotFoundError from .audio_utils import AudioInput, load_audio @@ -38,7 +39,6 @@ from .image_utils import ChannelDimension, ImageInput, is_vision_available from .utils.chat_template_utils import render_jinja_template from .utils.type_validators import ( - TypedDictAdapter, device_validator, image_size_validator, padding_validator, @@ -1398,9 +1398,9 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg typed_dict_obj = TypedDict( "merged_typed_dict", {**preprocessor_typed_dict_obj.__annotations__, **typed_dict_obj.__annotations__}, + total=False, ) - type_validator = TypedDictAdapter(typed_dict_obj) - type_validator.validate_fields(**output_kwargs[key]) + validate_typed_dict(typed_dict_obj, output_kwargs[key]) return output_kwargs @classmethod diff --git a/src/transformers/utils/type_validators.py b/src/transformers/utils/type_validators.py index 4b56f7842480..57c839151a76 100644 --- 
a/src/transformers/utils/type_validators.py +++ b/src/transformers/utils/type_validators.py @@ -2,7 +2,7 @@ from dataclasses import MISSING, field, make_dataclass from typing import Annotated, ForwardRef, Optional, TypedDict, Union, get_args, get_origin -from huggingface_hub.dataclasses import as_validated_field, strict +from huggingface_hub.dataclasses import strict from ..tokenization_utils_base import PaddingStrategy, TruncationStrategy from ..video_utils import VideoMetadataType @@ -100,19 +100,16 @@ class TokenizerKwargs(TypedDict, total=False): return make_dataclass(self.type.__name__ + "Config", fields) -@as_validated_field def positive_any_number(value: Optional[Union[int, float]] = None): if value is not None and (not isinstance(value, (int, float)) or not value >= 0): raise ValueError(f"Value must be a positive integer or floating number, got {value}") -@as_validated_field def positive_int(value: Optional[int] = None): if value is not None and (not isinstance(value, int) or not value >= 0): raise ValueError(f"Value must be a positive integer, got {value}") -@as_validated_field def padding_validator(value: Optional[Union[bool, str, PaddingStrategy]] = None): possible_names = ["longest", "max_length", "do_not_pad"] if value is None: @@ -123,7 +120,6 @@ def padding_validator(value: Optional[Union[bool, str, PaddingStrategy]] = None) raise ValueError(f"If padding is a string, the value must be one of {possible_names}") -@as_validated_field def truncation_validator(value: Optional[Union[bool, str, TruncationStrategy]] = None): possible_names = ["only_first", "only_second", "longest_first", "do_not_truncate"] if value is None: @@ -134,7 +130,6 @@ def truncation_validator(value: Optional[Union[bool, str, TruncationStrategy]] = raise ValueError(f"If truncation is a string, value must be one of {possible_names}") -@as_validated_field def image_size_validator(value: Optional[Union[int, Sequence[int], dict[str, int]]] = None): possible_keys = ["height", "width", 
"longest_edge", "shortest_edge", "max_height", "max_width"] if value is None: @@ -143,7 +138,6 @@ def image_size_validator(value: Optional[Union[int, Sequence[int], dict[str, int raise ValueError(f"Value for size must be a dict with keys {possible_keys} but got size={value}") -@as_validated_field def device_validator(value: Optional[Union[str, int]] = None): possible_names = ["cpu", "cuda", "xla", "xpu", "mps", "meta"] if value is None: @@ -160,7 +154,6 @@ def device_validator(value: Optional[Union[str, int]] = None): ) -@as_validated_field def resampling_validator(value: Optional[Union[int, "PILImageResampling"]] = None): if value is None: pass @@ -172,7 +165,6 @@ def resampling_validator(value: Optional[Union[int, "PILImageResampling"]] = Non raise ValueError(f"The resampling should an integer or `PIL.Image.Resampling`, but got resampling={value}") -@as_validated_field def video_metadata_validator(value: Optional[VideoMetadataType] = None): if value is None: return @@ -204,7 +196,6 @@ def check_dict_keys(d: dict) -> bool: ) -@as_validated_field def tensor_type_validator(value: Optional[Union[str, TensorType]] = None): possible_names = ["pt", "np", "mlx"] if value is None: diff --git a/src/transformers/video_processing_utils.py b/src/transformers/video_processing_utils.py index e5f842611a7b..4283c163c574 100644 --- a/src/transformers/video_processing_utils.py +++ b/src/transformers/video_processing_utils.py @@ -21,6 +21,7 @@ from typing import Any, Callable, Optional, Union import numpy as np +from huggingface_hub.dataclasses import validate_typed_dict from .dynamic_module_utils import custom_object_save from .image_processing_utils import ( @@ -51,7 +52,6 @@ ) from .utils.hub import cached_file from .utils.import_utils import requires -from .utils.type_validators import TypedDictAdapter from .video_utils import ( VideoInput, VideoMetadata, @@ -361,8 +361,7 @@ def preprocess( ) # Perform type validation on received kwargs - type_validator = 
TypedDictAdapter(self.valid_kwargs) - type_validator.validate_fields(**kwargs) + validate_typed_dict(self.valid_kwargs, kwargs) # Set default kwargs from self. This ensures that if a kwarg is not provided # by the user, it gets its default value from the instance, or is set to None. From 7a4e79f59d91b9aa95d692a5d491cfc601042ad4 Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 8 Oct 2025 13:57:20 +0200 Subject: [PATCH 25/28] delete typed dict adaptor --- src/transformers/utils/type_validators.py | 91 +---------------------- 1 file changed, 1 insertion(+), 90 deletions(-) diff --git a/src/transformers/utils/type_validators.py b/src/transformers/utils/type_validators.py index 57c839151a76..6e6ccdc4c8e9 100644 --- a/src/transformers/utils/type_validators.py +++ b/src/transformers/utils/type_validators.py @@ -1,8 +1,5 @@ from collections.abc import Sequence -from dataclasses import MISSING, field, make_dataclass -from typing import Annotated, ForwardRef, Optional, TypedDict, Union, get_args, get_origin - -from huggingface_hub.dataclasses import strict +from typing import Optional, Union from ..tokenization_utils_base import PaddingStrategy, TruncationStrategy from ..video_utils import VideoMetadataType @@ -14,92 +11,6 @@ from ..image_utils import PILImageResampling -def unpack_annotated_type(type): - if get_origin(type) is Annotated: - base, *meta = get_args(type) - return base, meta[0] - return type, field(default=MISSING) - - -def get_type_hints_from_typed_dict(obj: type[TypedDict]): - """ - Same as `typing.get_type_hints` but does not perform evaluation - on the ForwardRefs. Evaluating might fails if the package is not imported - or installed, therefore we will have our own "guarded" type validations. 
- All `ForwardRef` will be ignored by the hub validator - """ - raw_annots = obj.__dict__.get("__annotations__", {}) - type_hints = {} - for name, value in raw_annots.items(): - if value is None: - value = type(None) - if isinstance(value, str): - value = ForwardRef(value, is_argument=False) - type_hints[name] = value - return type_hints - - -# Minimalistic version of pydantic.TypeAdapter tailored for `TypedDict` -class TypedDictAdapter: - """ - A utility class used to convert a TypedDict object to dataclass and attach - a hub validator on top based on TypedDict annotations. - - We don't want to replace `TypedDict` by dataclasses in the codebase because - with dataclasses we will lose typing hints that `Unpack[TypedDict]` gives. - So this utility is a sweet spot to keep the balance between DevX and strong - typing`validation. - - Args: - type: The TypedDict object that needs to be validated. - """ - - def __init__( - self, - type: type[TypedDict], - ): - self.type = type - self.dataclass = self.create_dataclass() - self.dataclass = strict(self.dataclass) - - def validate_fields(self, **kwargs): - # If not all kwargs are set, dataclass raises an error in python <= 3.9 - # In newer python we can bypass by creating a dataclass with `kw_only=True` - for field in self.fields: - if field[0] not in kwargs: - kwargs[field[0]] = None - self.dataclass(**kwargs) - - def create_dataclass(self): - """ - Creates a dataclass object dynamically from `TypedDict`, so that - we can use strict type validation from typing hints with `TypedDict`. 
- - Example: - - @as_validated_field - def padding_validator(value: Union[bool, str, PaddingStrategy] = None): - if value is None: - return - if not isinstance(value, (bool, str, PaddingStrategy)): - raise ValueError(f"Value must be one of '[bool, string, PaddingStrategy]'") - if isinstance(value, str) and value not in ["longest", "max_length", "do_not_pad"]: - raise ValueError(f'Value for padding must be one of `["longest", "max_length", "do_not_pad"]`') - - class TokenizerKwargs(TypedDict, total=False): - text: str - padding: Annotated[Union[bool, str, PaddingStrategy], padding_validator()] - - # Now we can create a dataclass and warp it with hub validators for type constraints - # The dataclass can also be used as a simple config class for easier kwarg management - dataclass = dataclass_from_typed_dict(TokenizerKwargs) - """ - hints = get_type_hints_from_typed_dict(self.type) - fields = [(k, *unpack_annotated_type(v)) for k, v in hints.items()] - self.fields = fields - return make_dataclass(self.type.__name__ + "Config", fields) - - def positive_any_number(value: Optional[Union[int, float]] = None): if value is not None and (not isinstance(value, (int, float)) or not value >= 0): raise ValueError(f"Value must be a positive integer or floating number, got {value}") From 0395b54b2ed57df75b96888364b93a583ac92fe2 Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 8 Oct 2025 14:04:26 +0200 Subject: [PATCH 26/28] address comments --- src/transformers/models/kosmos2/processing_kosmos2.py | 2 +- tests/models/vitmatte/test_image_processing_vitmatte.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/kosmos2/processing_kosmos2.py b/src/transformers/models/kosmos2/processing_kosmos2.py index c839bbd6cfb4..2bc653ab3276 100644 --- a/src/transformers/models/kosmos2/processing_kosmos2.py +++ b/src/transformers/models/kosmos2/processing_kosmos2.py @@ -37,7 +37,7 @@ class Kosmos2ImagesKwargs(ImagesKwargs, total=False): - bboxes: 
Optional[NestedList] + bboxes: Optional[NestedList] # NOTE: hub validators can't accept `Sequence` num_image_tokens: Optional[int] first_image_token_id: Optional[int] diff --git a/tests/models/vitmatte/test_image_processing_vitmatte.py b/tests/models/vitmatte/test_image_processing_vitmatte.py index a17968a1d567..b100fb3c30b6 100644 --- a/tests/models/vitmatte/test_image_processing_vitmatte.py +++ b/tests/models/vitmatte/test_image_processing_vitmatte.py @@ -261,7 +261,7 @@ def test_image_processor_preprocess_arguments(self): trimap = np.random.randint(0, 3, size=image.size[::-1]) # Type validation will fail for fast processors only (for now) - if i == 1: + if image_processing_class.__name__.endswith("Fast"): with self.assertRaises(TypeError): image_processor(image, trimaps=trimap, extra_argument=True) else: From 34c9ec71945965a6e18279c2669f8681daed2945 Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 8 Oct 2025 15:44:59 +0200 Subject: [PATCH 27/28] delete optionals --- src/transformers/models/aria/modular_aria.py | 6 +-- .../models/beit/image_processing_beit.py | 2 +- .../image_processing_bridgetower.py | 2 +- .../cohere2_vision/modular_cohere2_vision.py | 6 +-- .../image_processing_conditional_detr.py | 6 +-- .../convnext/image_processing_convnext.py | 2 +- .../image_processing_deepseek_vl.py | 2 +- .../modular_deepseek_vl_hybrid.py | 10 ++--- .../image_processing_deformable_detr.py | 6 +-- .../models/detr/image_processing_detr.py | 6 +-- .../models/donut/image_processing_donut.py | 4 +- .../models/dpt/image_processing_dpt.py | 8 ++-- .../image_processing_efficientloftr.py | 2 +- .../image_processing_efficientnet.py | 4 +- .../models/emu3/image_processing_emu3.py | 4 +- .../models/eomt/image_processing_eomt.py | 2 +- .../models/flava/image_processing_flava.py | 38 +++++++++---------- .../models/gemma3/image_processing_gemma3.py | 8 ++-- .../models/glm4v/image_processing_glm4v.py | 6 +-- .../models/glm4v/video_processing_glm4v.py | 10 ++--- 
.../got_ocr2/image_processing_got_ocr2.py | 6 +-- .../models/got_ocr2/processing_got_ocr2.py | 10 ++--- .../image_processing_grounding_dino.py | 6 +-- .../idefics/image_processing_idefics.py | 2 +- .../idefics2/image_processing_idefics2.py | 2 +- .../idefics3/image_processing_idefics3.py | 6 +-- .../imagegpt/image_processing_imagegpt.py | 2 +- .../video_processing_instructblipvideo.py | 8 ---- .../internvl/video_processing_internvl.py | 2 +- .../models/janus/modular_janus.py | 2 +- .../models/kosmos2/processing_kosmos2.py | 4 +- .../kosmos2_5/image_processing_kosmos2_5.py | 4 +- .../layoutlmv2/image_processing_layoutlmv2.py | 2 +- .../layoutlmv3/image_processing_layoutlmv3.py | 2 +- .../lfm2_vl/image_processing_lfm2_vl_fast.py | 26 ++++++------- .../models/lfm2_vl/processing_lfm2_vl.py | 19 ---------- .../llama4/image_processing_llama4_fast.py | 4 +- .../llava_next/image_processing_llava_next.py | 2 +- .../image_processing_llava_onevision.py | 2 +- .../image_processing_mask2former.py | 4 +- .../maskformer/image_processing_maskformer.py | 4 +- .../models/mllama/image_processing_mllama.py | 2 +- .../image_processing_mobilenet_v2.py | 2 +- .../mobilevit/image_processing_mobilevit.py | 4 +- .../models/nougat/image_processing_nougat.py | 6 +-- .../oneformer/image_processing_oneformer.py | 2 +- .../models/ovis2/image_processing_ovis2.py | 8 ++-- .../image_processing_perception_lm_fast.py | 4 +- .../image_processing_phi4_multimodal_fast.py | 4 +- .../pix2struct/image_processing_pix2struct.py | 2 +- .../pixtral/image_processing_pixtral.py | 2 +- .../poolformer/image_processing_poolformer.py | 2 +- .../image_processing_prompt_depth_anything.py | 8 ++-- .../qwen2_5_omni/processing_qwen2_5_omni.py | 20 +++++----- .../qwen2_vl/image_processing_qwen2_vl.py | 10 ++--- .../qwen2_vl/video_processing_qwen2_vl.py | 14 +++---- .../processing_qwen3_omni_moe.py | 22 +++++------ .../qwen3_vl/video_processing_qwen3_vl.py | 10 ++--- .../rt_detr/image_processing_rt_detr.py | 6 +-- 
.../models/sam/image_processing_sam.py | 4 +- src/transformers/models/sam/processing_sam.py | 6 +-- .../models/sam2/image_processing_sam2_fast.py | 2 +- src/transformers/models/sam2/modular_sam2.py | 2 +- .../models/sam_hq/processing_samhq.py | 6 +-- .../segformer/image_processing_segformer.py | 2 +- .../siglip2/image_processing_siglip2.py | 4 +- .../smolvlm/image_processing_smolvlm.py | 6 +-- .../smolvlm/video_processing_smolvlm.py | 2 +- .../superpoint/image_processing_superpoint.py | 2 +- .../swin2sr/image_processing_swin2sr.py | 2 +- .../textnet/image_processing_textnet.py | 2 +- .../models/tvp/image_processing_tvp.py | 2 +- .../models/vilt/image_processing_vilt.py | 2 +- .../vitmatte/image_processing_vitmatte.py | 2 +- .../models/yolos/image_processing_yolos.py | 6 +-- .../zoedepth/image_processing_zoedepth.py | 4 +- src/transformers/processing_utils.py | 1 - 77 files changed, 205 insertions(+), 233 deletions(-) diff --git a/src/transformers/models/aria/modular_aria.py b/src/transformers/models/aria/modular_aria.py index a4dda1eee8f4..46e35911c1f1 100644 --- a/src/transformers/models/aria/modular_aria.py +++ b/src/transformers/models/aria/modular_aria.py @@ -905,9 +905,9 @@ def get_number_of_image_patches(self, height: int, width: int, images_kwargs=Non class AriaImagesKwargs(ImagesKwargs, total=False): - split_image: Optional[bool] - max_image_size: Optional[int] - min_image_size: Optional[int] + split_image: bool + max_image_size: int + min_image_size: int class AriaProcessorKwargs(ProcessingKwargs, total=False): diff --git a/src/transformers/models/beit/image_processing_beit.py b/src/transformers/models/beit/image_processing_beit.py index 9025e2868df8..884619f12b13 100644 --- a/src/transformers/models/beit/image_processing_beit.py +++ b/src/transformers/models/beit/image_processing_beit.py @@ -63,7 +63,7 @@ class BeitImageProcessorKwargs(ImagesKwargs, total=False): ADE20k). The background label will be replaced by 255. 
""" - do_reduce_labels: Optional[bool] + do_reduce_labels: bool @requires(backends=("vision",)) diff --git a/src/transformers/models/bridgetower/image_processing_bridgetower.py b/src/transformers/models/bridgetower/image_processing_bridgetower.py index c44cf6a7ee3d..73bfc7407666 100644 --- a/src/transformers/models/bridgetower/image_processing_bridgetower.py +++ b/src/transformers/models/bridgetower/image_processing_bridgetower.py @@ -124,7 +124,7 @@ def get_resize_output_image_size( class BridgeTowerImageProcessorKwargs(ImagesKwargs, total=False): - size_divisor: Optional[int] + size_divisor: int class BridgeTowerImageProcessor(BaseImageProcessor): diff --git a/src/transformers/models/cohere2_vision/modular_cohere2_vision.py b/src/transformers/models/cohere2_vision/modular_cohere2_vision.py index 5df3e075ed20..b801c24575ca 100644 --- a/src/transformers/models/cohere2_vision/modular_cohere2_vision.py +++ b/src/transformers/models/cohere2_vision/modular_cohere2_vision.py @@ -316,9 +316,9 @@ class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs, total=False): set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method. """ - crop_to_patches: Optional[bool] - min_patches: Optional[int] - max_patches: Optional[int] + crop_to_patches: bool + min_patches: int + max_patches: int @auto_docstring diff --git a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py index efc532d413c4..3f639e0c1ae3 100644 --- a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py +++ b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py @@ -745,9 +745,9 @@ class ConditionalDetrImageProcessorKwargs(ImagesKwargs, total=False): Path to the directory containing the segmentation masks. 
""" - format: Optional[Union[str, AnnotationFormat]] - do_convert_annotations: Optional[bool] - return_segmentation_masks: Optional[bool] + format: Union[str, AnnotationFormat] + do_convert_annotations: bool + return_segmentation_masks: bool annotations: Optional[Union[AnnotationType, list[AnnotationType]]] masks_path: Optional[Union[str, pathlib.Path]] diff --git a/src/transformers/models/convnext/image_processing_convnext.py b/src/transformers/models/convnext/image_processing_convnext.py index 95e9cd91bd4a..c4e279346f3c 100644 --- a/src/transformers/models/convnext/image_processing_convnext.py +++ b/src/transformers/models/convnext/image_processing_convnext.py @@ -57,7 +57,7 @@ class ConvNextImageProcessorKwargs(ImagesKwargs, total=False): overridden by `crop_pct` in the`preprocess` method. """ - crop_pct: Optional[float] + crop_pct: float @requires(backends=("vision",)) diff --git a/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py b/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py index f3f4c1ac6e34..763182de4039 100644 --- a/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py +++ b/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py @@ -56,7 +56,7 @@ class DeepseekVLImageProcessorKwargs(ImagesKwargs, total=False): falls below this value after resizing. """ - min_size: Optional[int] + min_size: int class DeepseekVLImageProcessor(BaseImageProcessor): diff --git a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py index 343ba01ac874..43af7d43dfb3 100644 --- a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py @@ -448,11 +448,11 @@ class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs, total=False): number of channels in the image. 
Can be overridden by the `high_res_image_std` parameter in the `preprocess` method. """ - min_size: Optional[int] - high_res_size: Optional[dict] - high_res_resample: Optional[Union["PILImageResampling", int]] - high_res_image_mean: Optional[Union[float, list[float], tuple[float, ...]]] - high_res_image_std: Optional[Union[float, list[float], tuple[float, ...]]] + min_size: int + high_res_size: dict + high_res_resample: Union["PILImageResampling", int] + high_res_image_mean: Union[float, list[float], tuple[float, ...]] + high_res_image_std: Union[float, list[float], tuple[float, ...]] class DeepseekVLHybridImageProcessor(DeepseekVLImageProcessor): diff --git a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py index 9fd63b6340c6..83587f45c295 100644 --- a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py +++ b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py @@ -98,9 +98,9 @@ class DeformableDetrImageProcessorKwargs(ImagesKwargs, total=False): Path to the directory containing the segmentation masks. """ - format: Optional[Union[str, AnnotationFormat]] - do_convert_annotations: Optional[bool] - return_segmentation_masks: Optional[bool] + format: Union[str, AnnotationFormat] + do_convert_annotations: bool + return_segmentation_masks: bool annotations: Optional[Union[AnnotationType, list[AnnotationType]]] masks_path: Optional[Union[str, pathlib.Path]] diff --git a/src/transformers/models/detr/image_processing_detr.py b/src/transformers/models/detr/image_processing_detr.py index 1cd636a0cb72..2f149b662ec2 100644 --- a/src/transformers/models/detr/image_processing_detr.py +++ b/src/transformers/models/detr/image_processing_detr.py @@ -100,9 +100,9 @@ class DetrImageProcessorKwargs(ImagesKwargs, total=False): Path to the directory containing the segmentation masks. 
""" - format: Optional[Union[str, AnnotationFormat]] - do_convert_annotations: Optional[bool] - return_segmentation_masks: Optional[bool] + format: Union[str, AnnotationFormat] + do_convert_annotations: bool + return_segmentation_masks: bool annotations: Optional[Union[AnnotationType, list[AnnotationType]]] masks_path: Optional[Union[str, pathlib.Path]] diff --git a/src/transformers/models/donut/image_processing_donut.py b/src/transformers/models/donut/image_processing_donut.py index 802b05c776fd..0f74ac62ec92 100644 --- a/src/transformers/models/donut/image_processing_donut.py +++ b/src/transformers/models/donut/image_processing_donut.py @@ -60,8 +60,8 @@ class DonutImageProcessorKwargs(ImagesKwargs, total=False): Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees. """ - do_thumbnail: Optional[bool] - do_align_long_axis: Optional[bool] + do_thumbnail: bool + do_align_long_axis: bool @requires(backends=("vision",)) diff --git a/src/transformers/models/dpt/image_processing_dpt.py b/src/transformers/models/dpt/image_processing_dpt.py index 71e930a7bfcf..6246b1f3f7c0 100644 --- a/src/transformers/models/dpt/image_processing_dpt.py +++ b/src/transformers/models/dpt/image_processing_dpt.py @@ -78,10 +78,10 @@ class DPTImageProcessorKwargs(ImagesKwargs, total=False): ADE20k). The background label will be replaced by 255. 
""" - ensure_multiple_of: Optional[int] - size_divisor: Optional[int] - keep_aspect_ratio: Optional[bool] - do_reduce_labels: Optional[bool] + ensure_multiple_of: int + size_divisor: int + keep_aspect_ratio: bool + do_reduce_labels: bool def get_resize_output_image_size( diff --git a/src/transformers/models/efficientloftr/image_processing_efficientloftr.py b/src/transformers/models/efficientloftr/image_processing_efficientloftr.py index a5c06be89e98..acf9105fe77a 100644 --- a/src/transformers/models/efficientloftr/image_processing_efficientloftr.py +++ b/src/transformers/models/efficientloftr/image_processing_efficientloftr.py @@ -56,7 +56,7 @@ class EfficientLoFTRImageProcessorKwargs(ImagesKwargs, total=False): Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method. """ - do_grayscale: Optional[bool] = True + do_grayscale: bool # Copied from transformers.models.superpoint.image_processing_superpoint.is_grayscale diff --git a/src/transformers/models/efficientnet/image_processing_efficientnet.py b/src/transformers/models/efficientnet/image_processing_efficientnet.py index 0a3a7542ff67..2a5b5c93749b 100644 --- a/src/transformers/models/efficientnet/image_processing_efficientnet.py +++ b/src/transformers/models/efficientnet/image_processing_efficientnet.py @@ -52,8 +52,8 @@ class EfficientNetImageProcessorKwargs(ImagesKwargs, total=False): Normalize the image again with the standard deviation only for image classification if set to True. 
""" - rescale_offset: Optional[bool] - include_top: Optional[bool] + rescale_offset: bool + include_top: bool class EfficientNetImageProcessor(BaseImageProcessor): diff --git a/src/transformers/models/emu3/image_processing_emu3.py b/src/transformers/models/emu3/image_processing_emu3.py index ec270fef2c87..0c550937581f 100644 --- a/src/transformers/models/emu3/image_processing_emu3.py +++ b/src/transformers/models/emu3/image_processing_emu3.py @@ -48,8 +48,8 @@ class Emu3ImageProcessorKwargs(ImagesKwargs, total=False): - ratio: Optional[str] - image_area: Optional[int] + ratio: str + image_area: int def smart_resize( diff --git a/src/transformers/models/eomt/image_processing_eomt.py b/src/transformers/models/eomt/image_processing_eomt.py index b422ca3020d9..3459911cde1f 100644 --- a/src/transformers/models/eomt/image_processing_eomt.py +++ b/src/transformers/models/eomt/image_processing_eomt.py @@ -66,7 +66,7 @@ class EomtImageProcessorKwargs(ImagesKwargs, total=False): denoted with 0 (background) will be replaced with `ignore_index`. 
""" - do_split_image: Optional[bool] + do_split_image: bool ignore_index: Optional[int] diff --git a/src/transformers/models/flava/image_processing_flava.py b/src/transformers/models/flava/image_processing_flava.py index 2005011dcb2b..b62717ae2cd6 100644 --- a/src/transformers/models/flava/image_processing_flava.py +++ b/src/transformers/models/flava/image_processing_flava.py @@ -118,26 +118,26 @@ class FlavaImageProcessorKwargs(ImagesKwargs, total=False): """ # Mask related params - return_image_mask: Optional[bool] - input_size_patches: Optional[int] - total_mask_patches: Optional[int] - mask_group_min_patches: Optional[int] - mask_group_max_patches: Optional[int] - mask_group_min_aspect_ratio: Optional[float] - mask_group_max_aspect_ratio: Optional[float] + return_image_mask: bool + input_size_patches: int + total_mask_patches: int + mask_group_min_patches: int + mask_group_max_patches: int + mask_group_min_aspect_ratio: float + mask_group_max_aspect_ratio: float # Codebook related params - return_codebook_pixels: Optional[bool] - codebook_do_resize: Optional[bool] - codebook_size: Optional[bool] - codebook_resample: Optional[int] - codebook_do_center_crop: Optional[bool] - codebook_crop_size: Optional[int] - codebook_do_rescale: Optional[bool] - codebook_rescale_factor: Optional[Union[int, float]] - codebook_do_map_pixels: Optional[bool] - codebook_do_normalize: Optional[bool] - codebook_image_mean: Optional[Union[float, Iterable[float]]] - codebook_image_std: Optional[Union[float, Iterable[float]]] + return_codebook_pixels: bool + codebook_do_resize: bool + codebook_size: bool + codebook_resample: int + codebook_do_center_crop: bool + codebook_crop_size: int + codebook_do_rescale: bool + codebook_rescale_factor: Union[int, float] + codebook_do_map_pixels: bool + codebook_do_normalize: bool + codebook_image_mean: Union[float, Iterable[float]] + codebook_image_std: Union[float, Iterable[float]] # Inspired from 
https://github.com/microsoft/unilm/blob/master/beit/masking_generator.py diff --git a/src/transformers/models/gemma3/image_processing_gemma3.py b/src/transformers/models/gemma3/image_processing_gemma3.py index 4b32a1f31a05..d4bd7a00000e 100644 --- a/src/transformers/models/gemma3/image_processing_gemma3.py +++ b/src/transformers/models/gemma3/image_processing_gemma3.py @@ -63,10 +63,10 @@ class Gemma3ImageProcessorKwargs(ImagesKwargs, total=False): Minimum aspect ratio to activate pan and scan. """ - do_pan_and_scan: Optional[bool] - pan_and_scan_min_crop_size: Optional[int] - pan_and_scan_max_num_crops: Optional[int] - pan_and_scan_min_ratio_to_activate: Optional[float] + do_pan_and_scan: bool + pan_and_scan_min_crop_size: int + pan_and_scan_max_num_crops: int + pan_and_scan_min_ratio_to_activate: float class Gemma3ImageProcessor(BaseImageProcessor): diff --git a/src/transformers/models/glm4v/image_processing_glm4v.py b/src/transformers/models/glm4v/image_processing_glm4v.py index e58d295ca465..9a4348010750 100644 --- a/src/transformers/models/glm4v/image_processing_glm4v.py +++ b/src/transformers/models/glm4v/image_processing_glm4v.py @@ -57,9 +57,9 @@ class Glm4vImageProcessorKwargs(ImagesKwargs, total=False): The merge size of the vision encoder to llm encoder. 
""" - patch_size: Optional[int] - temporal_patch_size: Optional[int] - merge_size: Optional[int] + patch_size: int + temporal_patch_size: int + merge_size: int def smart_resize( diff --git a/src/transformers/models/glm4v/video_processing_glm4v.py b/src/transformers/models/glm4v/video_processing_glm4v.py index 95ea8160c606..f27adfc7e25e 100644 --- a/src/transformers/models/glm4v/video_processing_glm4v.py +++ b/src/transformers/models/glm4v/video_processing_glm4v.py @@ -37,11 +37,11 @@ class Glm4vVideoProcessorInitKwargs(VideosKwargs, total=False): - max_image_size: Optional[dict[str, int]] - patch_size: Optional[int] - temporal_patch_size: Optional[int] - merge_size: Optional[int] - max_duration: Optional[int] + max_image_size: dict[str, int] + patch_size: int + temporal_patch_size: int + merge_size: int + max_duration: int @add_start_docstrings( diff --git a/src/transformers/models/got_ocr2/image_processing_got_ocr2.py b/src/transformers/models/got_ocr2/image_processing_got_ocr2.py index 5882f22b1bd0..3fd5f7d512c1 100644 --- a/src/transformers/models/got_ocr2/image_processing_got_ocr2.py +++ b/src/transformers/models/got_ocr2/image_processing_got_ocr2.py @@ -62,9 +62,9 @@ class GotOcr2ImageProcessorKwargs(ImagesKwargs, total=False): set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method. 
""" - crop_to_patches: Optional[bool] - min_patches: Optional[int] - max_patches: Optional[int] + crop_to_patches: bool + min_patches: int + max_patches: int # Similar to image_processing_mllama.get_all_supported_aspect_ratios diff --git a/src/transformers/models/got_ocr2/processing_got_ocr2.py b/src/transformers/models/got_ocr2/processing_got_ocr2.py index 447122e18c22..1843b7f28830 100644 --- a/src/transformers/models/got_ocr2/processing_got_ocr2.py +++ b/src/transformers/models/got_ocr2/processing_got_ocr2.py @@ -36,13 +36,13 @@ class GotOcr2TextKwargs(TextKwargs, total=False): class GotOcr2ImagesKwargs(ImagesKwargs, total=False): - crop_to_patches: Optional[bool] - min_patches: Optional[int] - max_patches: Optional[int] + crop_to_patches: bool + min_patches: int + max_patches: int box: Optional[Union[list, tuple[float, float], tuple[float, float, float, float]]] color: Optional[str] - num_image_tokens: Optional[int] - multi_page: Optional[bool] + num_image_tokens: int + multi_page: bool class GotOcr2ProcessorKwargs(ProcessingKwargs, total=False): diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py index f556b2d295c3..eb21ea3b376e 100644 --- a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py @@ -109,9 +109,9 @@ class GroundingDinoImageProcessorKwargs(ImagesKwargs, total=False): Path to the directory containing the segmentation masks. 
""" - format: Optional[Union[str, AnnotationFormat]] - do_convert_annotations: Optional[bool] - return_segmentation_masks: Optional[bool] + format: Union[str, AnnotationFormat] + do_convert_annotations: bool + return_segmentation_masks: bool annotations: Optional[Union[AnnotationType, list[AnnotationType]]] masks_path: Optional[Union[str, pathlib.Path]] diff --git a/src/transformers/models/idefics/image_processing_idefics.py b/src/transformers/models/idefics/image_processing_idefics.py index 6a53e4d9b7d4..870c741b826d 100644 --- a/src/transformers/models/idefics/image_processing_idefics.py +++ b/src/transformers/models/idefics/image_processing_idefics.py @@ -47,7 +47,7 @@ class IdeficsImageProcessorKwargs(ImagesKwargs, total=False): """ transform: Optional[Callable] - image_size: Optional[dict[str, int]] + image_size: dict[str, int] def convert_to_rgb(image): diff --git a/src/transformers/models/idefics2/image_processing_idefics2.py b/src/transformers/models/idefics2/image_processing_idefics2.py index 1baec594bc90..e068ac42f403 100644 --- a/src/transformers/models/idefics2/image_processing_idefics2.py +++ b/src/transformers/models/idefics2/image_processing_idefics2.py @@ -53,7 +53,7 @@ class Idefics2ImageProcessorKwargs(ImagesKwargs, total=False): Whether to split the image into a sequence 4 equal sub-images concatenated with the original image. """ - do_image_splitting: Optional[bool] + do_image_splitting: bool def get_resize_output_image_size(image, size, input_data_format) -> tuple[int, int]: diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py index d53a75596fea..65e17ef4b776 100644 --- a/src/transformers/models/idefics3/image_processing_idefics3.py +++ b/src/transformers/models/idefics3/image_processing_idefics3.py @@ -59,9 +59,9 @@ class Idefics3ImageProcessorKwargs(ImagesKwargs, total=False): Whether to return the row and column information of the images. 
""" - do_image_splitting: Optional[bool] - max_image_size: Optional[dict[str, int]] - return_row_col_info: Optional[bool] + do_image_splitting: bool + max_image_size: dict[str, int] + return_row_col_info: bool def _resize_output_size_rescale_to_max_len( diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt.py b/src/transformers/models/imagegpt/image_processing_imagegpt.py index f5a1682d9be3..ab7057f7d407 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt.py @@ -56,7 +56,7 @@ class ImageGPTImageProcessorKwargs(ImagesKwargs, total=False): """ clusters: Optional[Union[np.ndarray, list[list[int]], "torch.Tensor"]] - do_color_quantize: Optional[bool] + do_color_quantize: bool def squared_euclidean_distance(a, b): diff --git a/src/transformers/models/instructblipvideo/video_processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/video_processing_instructblipvideo.py index d89d9069d495..f2c49925ef19 100644 --- a/src/transformers/models/instructblipvideo/video_processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/video_processing_instructblipvideo.py @@ -24,15 +24,11 @@ from ...image_processing_utils import BatchFeature from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling, SizeDict -from ...processing_utils import Unpack, VideosKwargs from ...utils import TensorType from ...video_processing_utils import BaseVideoProcessor from ...video_utils import group_videos_by_shape, reorder_videos -class InstructBlipVideoVideoProcessorInitKwargs(VideosKwargs, total=False): ... 
- - class InstructBlipVideoVideoProcessor(BaseVideoProcessor): resample = PILImageResampling.BICUBIC image_mean = OPENAI_CLIP_MEAN @@ -44,12 +40,8 @@ class InstructBlipVideoVideoProcessor(BaseVideoProcessor): do_normalize = True do_convert_rgb = True do_sample_frames = False # Set to False for BC, recommended to set `True` in new models - valid_kwargs = InstructBlipVideoVideoProcessorInitKwargs model_input_names = ["pixel_values"] - def __init__(self, **kwargs: Unpack[InstructBlipVideoVideoProcessorInitKwargs]): - super().__init__(**kwargs) - def _preprocess( self, videos: list["torch.Tensor"], diff --git a/src/transformers/models/internvl/video_processing_internvl.py b/src/transformers/models/internvl/video_processing_internvl.py index 703c40b94f0c..a544bb08815a 100644 --- a/src/transformers/models/internvl/video_processing_internvl.py +++ b/src/transformers/models/internvl/video_processing_internvl.py @@ -28,7 +28,7 @@ class InternVLVideoProcessorInitKwargs(VideosKwargs, total=False): - initial_shift: Optional[Union[bool, float, int]] + initial_shift: Union[bool, float, int] class InternVLVideoProcessor(BaseVideoProcessor): diff --git a/src/transformers/models/janus/modular_janus.py b/src/transformers/models/janus/modular_janus.py index 36fe7abae438..6a1742b44362 100644 --- a/src/transformers/models/janus/modular_janus.py +++ b/src/transformers/models/janus/modular_janus.py @@ -1296,7 +1296,7 @@ class JanusImageProcessorKwargs(ImagesKwargs, total=False): falls below this value after resizing. 
""" - min_size: Optional[int] + min_size: int class JanusImageProcessor(BlipImageProcessor): diff --git a/src/transformers/models/kosmos2/processing_kosmos2.py b/src/transformers/models/kosmos2/processing_kosmos2.py index 2bc653ab3276..f9fb98df6ac2 100644 --- a/src/transformers/models/kosmos2/processing_kosmos2.py +++ b/src/transformers/models/kosmos2/processing_kosmos2.py @@ -38,12 +38,12 @@ class Kosmos2ImagesKwargs(ImagesKwargs, total=False): bboxes: Optional[NestedList] # NOTE: hub validators can't accept `Sequence` - num_image_tokens: Optional[int] + num_image_tokens: int first_image_token_id: Optional[int] class Kosmos2TextKwargs(TextKwargs, total=False): - add_eos_token: Optional[bool] + add_eos_token: bool class Kosmos2ProcessorKwargs(ProcessingKwargs, total=False): diff --git a/src/transformers/models/kosmos2_5/image_processing_kosmos2_5.py b/src/transformers/models/kosmos2_5/image_processing_kosmos2_5.py index ca80cb978ec9..fed17e08e1a7 100644 --- a/src/transformers/models/kosmos2_5/image_processing_kosmos2_5.py +++ b/src/transformers/models/kosmos2_5/image_processing_kosmos2_5.py @@ -55,8 +55,8 @@ class Kosmos2_5ImageProcessorKwargs(ImagesKwargs, total=False): [KOSMOS 2.5 paper](https://huggingface.co/papers/2309.11419). """ - patch_size: Optional[dict[str, int]] - max_patches: Optional[int] + patch_size: dict[str, int] + max_patches: int # Copied from transformers.models.pix2struct.image_processing_pix2struct.torch_extract_patches diff --git a/src/transformers/models/layoutlmv2/image_processing_layoutlmv2.py b/src/transformers/models/layoutlmv2/image_processing_layoutlmv2.py index 5e9289c2701b..6f53698f30b2 100644 --- a/src/transformers/models/layoutlmv2/image_processing_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/image_processing_layoutlmv2.py @@ -66,7 +66,7 @@ class LayoutLMv2ImageProcessorKwargs(ImagesKwargs, total=False): `preprocess` method. 
""" - apply_ocr: Optional[bool] + apply_ocr: bool ocr_lang: Optional[str] tesseract_config: Optional[str] diff --git a/src/transformers/models/layoutlmv3/image_processing_layoutlmv3.py b/src/transformers/models/layoutlmv3/image_processing_layoutlmv3.py index 9ba4f5507fc1..44d4b33e11d9 100644 --- a/src/transformers/models/layoutlmv3/image_processing_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/image_processing_layoutlmv3.py @@ -70,7 +70,7 @@ class LayoutLMv3ImageProcessorKwargs(ImagesKwargs, total=False): `preprocess` method. """ - apply_ocr: Optional[bool] + apply_ocr: bool ocr_lang: Optional[str] tesseract_config: Optional[str] diff --git a/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py b/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py index 3d16a4d0a273..85d8fcd11b92 100755 --- a/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +++ b/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py @@ -14,7 +14,7 @@ # limitations under the License. import math from functools import lru_cache -from typing import Optional, Union +from typing import Union import torch from torchvision.transforms.v2 import functional as F @@ -175,18 +175,18 @@ class Lfm2VlImageProcessorKwargs(ImagesKwargs, total=False): The downsampling factor for images used when resizing the image. 
""" - downsample_factor: Optional[int] - do_image_splitting: Optional[bool] - min_tiles: Optional[int] - max_tiles: Optional[int] - use_thumbnail: Optional[bool] - min_image_tokens: Optional[int] - max_image_tokens: Optional[int] - encoder_patch_size: Optional[int] - tile_size: Optional[int] - max_pixels_tolerance: Optional[float] - do_pad: Optional[bool] - return_row_col_info: Optional[bool] + downsample_factor: int + do_image_splitting: bool + min_tiles: int + max_tiles: int + use_thumbnail: bool + min_image_tokens: int + max_image_tokens: int + encoder_patch_size: int + tile_size: int + max_pixels_tolerance: float + do_pad: bool + return_row_col_info: bool @auto_docstring diff --git a/src/transformers/models/lfm2_vl/processing_lfm2_vl.py b/src/transformers/models/lfm2_vl/processing_lfm2_vl.py index b00ee3d05d04..311dfdc3b123 100755 --- a/src/transformers/models/lfm2_vl/processing_lfm2_vl.py +++ b/src/transformers/models/lfm2_vl/processing_lfm2_vl.py @@ -18,7 +18,6 @@ from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput, make_nested_list_of_images from ...processing_utils import ( - ImagesKwargs, ProcessingKwargs, ProcessorMixin, TextKwargs, @@ -31,30 +30,12 @@ logger = logging.get_logger(__name__) -class Lfm2VlImagesKwargs(ImagesKwargs, total=False): - downsample_factor: Optional[int] - do_image_splitting: Optional[bool] - min_tiles: Optional[int] - max_tiles: Optional[int] - use_thumbnail: Optional[bool] - min_image_tokens: Optional[int] - max_image_tokens: Optional[int] - encoder_patch_size: Optional[int] - tile_size: Optional[int] - max_pixels_tolerance: Optional[float] - patch_size: Optional[int] - do_pad: Optional[bool] - return_row_col_info: Optional[bool] - - class Lfm2VlTextKwargs(TextKwargs, total=False): use_image_special_tokens: Optional[bool] class Lfm2VlProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: Lfm2VlImagesKwargs text_kwargs: Lfm2VlTextKwargs - _defaults = { "images_kwargs": { 
"return_row_col_info": True, diff --git a/src/transformers/models/llama4/image_processing_llama4_fast.py b/src/transformers/models/llama4/image_processing_llama4_fast.py index 42b2221d6a18..ccbb60585b0b 100644 --- a/src/transformers/models/llama4/image_processing_llama4_fast.py +++ b/src/transformers/models/llama4/image_processing_llama4_fast.py @@ -320,8 +320,8 @@ class Llama4ImageProcessorKwargs(ImagesKwargs, total=False): but never upsample, unless the image is smaller than the patch size. """ - max_patches: Optional[int] - resize_to_max_canvas: Optional[bool] + max_patches: int + resize_to_max_canvas: bool @auto_docstring diff --git a/src/transformers/models/llava_next/image_processing_llava_next.py b/src/transformers/models/llava_next/image_processing_llava_next.py index 26ca94dad6e9..c4bc1ed07287 100644 --- a/src/transformers/models/llava_next/image_processing_llava_next.py +++ b/src/transformers/models/llava_next/image_processing_llava_next.py @@ -67,7 +67,7 @@ class LlavaNextImageProcessorKwargs(ImagesKwargs, total=False): method. """ - image_grid_pinpoints: Optional[list[list[int]]] + image_grid_pinpoints: list[list[int]] def divide_to_patches(image: np.ndarray, patch_size: int, input_data_format) -> list[np.ndarray]: diff --git a/src/transformers/models/llava_onevision/image_processing_llava_onevision.py b/src/transformers/models/llava_onevision/image_processing_llava_onevision.py index 3654edcdbf71..4b0f399e4959 100644 --- a/src/transformers/models/llava_onevision/image_processing_llava_onevision.py +++ b/src/transformers/models/llava_onevision/image_processing_llava_onevision.py @@ -66,7 +66,7 @@ class LlavaOnevisionImageProcessorKwargs(ImagesKwargs, total=False): method. 
""" - image_grid_pinpoints: Optional[list[list[int]]] + image_grid_pinpoints: list[list[int]] # Copied from transformers.models.llava_next.image_processing_llava_next.divide_to_patches diff --git a/src/transformers/models/mask2former/image_processing_mask2former.py b/src/transformers/models/mask2former/image_processing_mask2former.py index 235dbe8039f1..79b449eae416 100644 --- a/src/transformers/models/mask2former/image_processing_mask2former.py +++ b/src/transformers/models/mask2former/image_processing_mask2former.py @@ -74,9 +74,9 @@ class Mask2FormerImageProcessorKwargs(ImagesKwargs, total=False): The number of labels in the segmentation map. """ - size_divisor: Optional[int] + size_divisor: int ignore_index: Optional[int] - do_reduce_labels: Optional[bool] + do_reduce_labels: bool num_labels: Optional[int] diff --git a/src/transformers/models/maskformer/image_processing_maskformer.py b/src/transformers/models/maskformer/image_processing_maskformer.py index 4046fcafb07a..7d83809ced66 100644 --- a/src/transformers/models/maskformer/image_processing_maskformer.py +++ b/src/transformers/models/maskformer/image_processing_maskformer.py @@ -80,9 +80,9 @@ class MaskFormerImageProcessorKwargs(ImagesKwargs, total=False): The number of labels in the segmentation map. """ - size_divisor: Optional[int] + size_divisor: int ignore_index: Optional[int] - do_reduce_labels: Optional[bool] + do_reduce_labels: bool num_labels: Optional[int] diff --git a/src/transformers/models/mllama/image_processing_mllama.py b/src/transformers/models/mllama/image_processing_mllama.py index 50579703b905..1a1d76774868 100644 --- a/src/transformers/models/mllama/image_processing_mllama.py +++ b/src/transformers/models/mllama/image_processing_mllama.py @@ -56,7 +56,7 @@ class MllamaImageProcessorKwargs(ImagesKwargs, total=False): The maximum number of tiles allowed. 
""" - max_image_tiles: Optional[int] + max_image_tiles: int @lru_cache(maxsize=10) diff --git a/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py b/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py index ad4c6937b76a..876d9c6be444 100644 --- a/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py +++ b/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py @@ -59,7 +59,7 @@ class MobileNetV2ImageProcessorKwargs(ImagesKwargs, total=False): ADE20k). The background label will be replaced by 255. """ - do_reduce_labels: Optional[bool] + do_reduce_labels: bool @requires(backends=("vision",)) diff --git a/src/transformers/models/mobilevit/image_processing_mobilevit.py b/src/transformers/models/mobilevit/image_processing_mobilevit.py index 8a914608295f..0a9b6bc64423 100644 --- a/src/transformers/models/mobilevit/image_processing_mobilevit.py +++ b/src/transformers/models/mobilevit/image_processing_mobilevit.py @@ -63,8 +63,8 @@ class MobileVitImageProcessorKwargs(ImagesKwargs, total=False): ADE20k). The background label will be replaced by 255. """ - do_flip_channel_order: Optional[bool] - do_reduce_labels: Optional[bool] + do_flip_channel_order: bool + do_reduce_labels: bool @requires(backends=("vision",)) diff --git a/src/transformers/models/nougat/image_processing_nougat.py b/src/transformers/models/nougat/image_processing_nougat.py index 35db47bdf0c9..a9178ab43e07 100644 --- a/src/transformers/models/nougat/image_processing_nougat.py +++ b/src/transformers/models/nougat/image_processing_nougat.py @@ -62,9 +62,9 @@ class NougatImageProcessorKwargs(ImagesKwargs, total=False): Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees. 
""" - do_crop_margin: Optional[bool] - do_thumbnail: Optional[bool] - do_align_long_axis: Optional[bool] + do_crop_margin: bool + do_thumbnail: bool + do_align_long_axis: bool class NougatImageProcessor(BaseImageProcessor): diff --git a/src/transformers/models/oneformer/image_processing_oneformer.py b/src/transformers/models/oneformer/image_processing_oneformer.py index 058c655559a8..00d4989fdf28 100644 --- a/src/transformers/models/oneformer/image_processing_oneformer.py +++ b/src/transformers/models/oneformer/image_processing_oneformer.py @@ -85,7 +85,7 @@ class OneFormerImageProcessorKwargs(ImagesKwargs, total=False): num_text: Optional[int] num_labels: Optional[int] ignore_index: Optional[int] - do_reduce_labels: Optional[bool] + do_reduce_labels: bool # Copied from transformers.models.detr.image_processing_detr.max_across_indices diff --git a/src/transformers/models/ovis2/image_processing_ovis2.py b/src/transformers/models/ovis2/image_processing_ovis2.py index ab4ae57f0cb8..4598e9f3f521 100644 --- a/src/transformers/models/ovis2/image_processing_ovis2.py +++ b/src/transformers/models/ovis2/image_processing_ovis2.py @@ -61,10 +61,10 @@ class Ovis2ImageProcessorKwargs(ImagesKwargs, total=False): `preprocess` method. 
""" - crop_to_patches: Optional[bool] - min_patches: Optional[int] - max_patches: Optional[int] - use_covering_area_grid: Optional[bool] + crop_to_patches: bool + min_patches: int + max_patches: int + use_covering_area_grid: bool # Similar to image_processing_mllama.get_all_supported_aspect_ratios diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py index 8409be51e034..03ff515e63af 100644 --- a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py +++ b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py @@ -54,8 +54,8 @@ class PerceptionLMImageProcessorKwargs(ImagesKwargs, total=False): """ vision_input_type: Optional[str] - tile_size: Optional[int] - max_num_tiles: Optional[int] + tile_size: int + max_num_tiles: int @auto_docstring diff --git a/src/transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py b/src/transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py index 6dcbf61f38f6..98f160a1fd5e 100644 --- a/src/transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +++ b/src/transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py @@ -43,8 +43,8 @@ class Phi4MultimodalImageProcessorKwargs(ImagesKwargs, total=False): The maximum number of crops per image. """ - patch_size: Optional[int] - dynamic_hd: Optional[int] + patch_size: int + dynamic_hd: int @auto_docstring diff --git a/src/transformers/models/pix2struct/image_processing_pix2struct.py b/src/transformers/models/pix2struct/image_processing_pix2struct.py index 66cc7cd0b04a..3ec36ebda440 100644 --- a/src/transformers/models/pix2struct/image_processing_pix2struct.py +++ b/src/transformers/models/pix2struct/image_processing_pix2struct.py @@ -57,7 +57,7 @@ class Pix2StructImageProcessorKwargs(ImagesKwargs, total=False): Text to render as a header. 
Only has an effect if `image_processor.is_vqa` is `True`. """ - max_patches: Optional[int] + max_patches: int header_text: Optional[Union[list[str], str]] diff --git a/src/transformers/models/pixtral/image_processing_pixtral.py b/src/transformers/models/pixtral/image_processing_pixtral.py index 387c82f4e0a0..3cbfaeb41922 100644 --- a/src/transformers/models/pixtral/image_processing_pixtral.py +++ b/src/transformers/models/pixtral/image_processing_pixtral.py @@ -56,7 +56,7 @@ class PixtralImageProcessorKwargs(ImagesKwargs, total=False): Size of the patches in the model, used to calculate the output image size. Can be overridden by `patch_size` in the `preprocess` method. """ - patch_size: Optional[Union[dict[str, int], int]] + patch_size: Union[dict[str, int], int] # Adapted from function in image_transforms.py to ensure any transparent pixels are converted to white. diff --git a/src/transformers/models/poolformer/image_processing_poolformer.py b/src/transformers/models/poolformer/image_processing_poolformer.py index c7bb38c4340c..8d466739638d 100644 --- a/src/transformers/models/poolformer/image_processing_poolformer.py +++ b/src/transformers/models/poolformer/image_processing_poolformer.py @@ -54,7 +54,7 @@ class PoolFormerImageProcessorKwargs(ImagesKwargs, total=False): Percentage of the image to crop. Only has an effect if `do_resize` is set to `True`. 
""" - crop_pct: Optional[float] + crop_pct: float class PoolFormerImageProcessor(BaseImageProcessor): diff --git a/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py b/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py index cb8c0e37af5f..b62ba7994f0a 100644 --- a/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py +++ b/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py @@ -64,10 +64,10 @@ class PromptDepthAnythingImageProcessorKwargs(ImagesKwargs, total=False): Scale factor to convert the prompt depth to meters. """ - keep_aspect_ratio: Optional[bool] - ensure_multiple_of: Optional[int] - size_divisor: Optional[int] - prompt_scale_to_meter: Optional[float] + keep_aspect_ratio: bool + ensure_multiple_of: int + size_divisor: int + prompt_scale_to_meter: float def _constrain_to_multiple_of(val, multiple, min_val=0, max_val=None): diff --git a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py index f522f63beb33..ea60155999e6 100644 --- a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py @@ -33,16 +33,16 @@ # Redefine kwargs for videos because Qwen-Omni uses some kwargs for processing omni # and does not use them in video processor class class Qwen2_5_OmniVideosKwargs(VideosKwargs, total=False): - min_pixels: Optional[int] - max_pixels: Optional[int] - patch_size: Optional[int] - temporal_patch_size: Optional[int] - merge_size: Optional[int] - min_frames: Optional[int] - max_frames: Optional[int] - use_audio_in_video: Optional[bool] - seconds_per_chunk: Optional[float] - position_id_per_seconds: Optional[Union[int, float]] + min_pixels: int + max_pixels: int + patch_size: int + temporal_patch_size: int + merge_size: int + min_frames: int + max_frames: int + 
use_audio_in_video: bool + seconds_per_chunk: float + position_id_per_seconds: Union[int, float] class Qwen2_5OmniProcessorKwargs(ProcessingKwargs, total=False): diff --git a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py index 82b7bce43fe9..e5a1e0a7551e 100644 --- a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py @@ -66,11 +66,11 @@ class Qwen2VLImageProcessorKwargs(ImagesKwargs, total=False): The merge size of the vision encoder to llm encoder. """ - min_pixels: Optional[int] - max_pixels: Optional[int] - patch_size: Optional[int] - temporal_patch_size: Optional[int] - merge_size: Optional[int] + min_pixels: int + max_pixels: int + patch_size: int + temporal_patch_size: int + merge_size: int def smart_resize( diff --git a/src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py b/src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py index 2fc8bf1ac5b3..11b5ff80dade 100644 --- a/src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py @@ -42,13 +42,13 @@ class Qwen2VLVideoProcessorInitKwargs(VideosKwargs, total=False): - min_pixels: Optional[int] - max_pixels: Optional[int] - patch_size: Optional[int] - temporal_patch_size: Optional[int] - merge_size: Optional[int] - min_frames: Optional[int] - max_frames: Optional[int] + min_pixels: int + max_pixels: int + patch_size: int + temporal_patch_size: int + merge_size: int + min_frames: int + max_frames: int @add_start_docstrings( diff --git a/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py index ba54b3a0c1f6..df5629931fa3 100644 --- a/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py @@ -20,7 +20,7 
@@ # See the License for the specific language governing permissions and # limitations under the License. import re -from typing import Optional, Union +from typing import Union import numpy as np @@ -35,16 +35,16 @@ # Redefine kwargs for videos because Qwen-Omni uses some kwargs for processing omni # and does not use them in video processor class class Qwen3OmniMoeVideosKwargs(VideosKwargs, total=False): - min_pixels: Optional[int] - max_pixels: Optional[int] - patch_size: Optional[int] - temporal_patch_size: Optional[int] - merge_size: Optional[int] - min_frames: Optional[int] - max_frames: Optional[int] - use_audio_in_video: Optional[bool] - seconds_per_chunk: Optional[float] - position_id_per_seconds: Optional[Union[int, float]] + min_pixels: int + max_pixels: int + patch_size: int + temporal_patch_size: int + merge_size: int + min_frames: int + max_frames: int + use_audio_in_video: bool + seconds_per_chunk: float + position_id_per_seconds: Union[int, float] class Qwen3OmniMoeProcessorKwargs(ProcessingKwargs, total=False): diff --git a/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py b/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py index 3bb06d7f2b08..e74f55b642dd 100644 --- a/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py @@ -65,11 +65,11 @@ def smart_resize( class Qwen3VLVideoProcessorInitKwargs(VideosKwargs, total=False): - patch_size: Optional[int] - temporal_patch_size: Optional[int] - merge_size: Optional[int] - min_frames: Optional[int] - max_frames: Optional[int] + patch_size: int + temporal_patch_size: int + merge_size: int + min_frames: int + max_frames: int @add_start_docstrings( diff --git a/src/transformers/models/rt_detr/image_processing_rt_detr.py b/src/transformers/models/rt_detr/image_processing_rt_detr.py index 5e91e3c4fc01..b366ca62fabf 100644 --- a/src/transformers/models/rt_detr/image_processing_rt_detr.py +++ 
b/src/transformers/models/rt_detr/image_processing_rt_detr.py @@ -84,9 +84,9 @@ class RTDetrImageProcessorKwargs(ImagesKwargs, total=False): Path to the directory containing the segmentation masks. """ - format: Optional[Union[str, AnnotationFormat]] - do_convert_annotations: Optional[bool] - return_segmentation_masks: Optional[bool] + format: Union[str, AnnotationFormat] + do_convert_annotations: bool + return_segmentation_masks: bool annotations: Optional[Union[AnnotationType, list[AnnotationType]]] masks_path: Optional[Union[str, pathlib.Path]] diff --git a/src/transformers/models/sam/image_processing_sam.py b/src/transformers/models/sam/image_processing_sam.py index 3ba27f63d993..eb2615b3e963 100644 --- a/src/transformers/models/sam/image_processing_sam.py +++ b/src/transformers/models/sam/image_processing_sam.py @@ -67,8 +67,8 @@ class SamImageProcessorKwargs(ImagesKwargs, total=False): map size provided for preprocessing. """ - mask_size: Optional[dict[str, int]] - mask_pad_size: Optional[dict[str, int]] + mask_size: dict[str, int] + mask_pad_size: dict[str, int] class SamImageProcessor(BaseImageProcessor): diff --git a/src/transformers/models/sam/processing_sam.py b/src/transformers/models/sam/processing_sam.py index 18d812fc6825..d6cdd2ab2653 100644 --- a/src/transformers/models/sam/processing_sam.py +++ b/src/transformers/models/sam/processing_sam.py @@ -36,9 +36,9 @@ class SamImagesKwargs(ImagesKwargs, total=False): input_points: Optional[list[list[float]]] input_labels: Optional[list[list[int]]] input_boxes: Optional[list[list[list[float]]]] - point_pad_value: Optional[int] - mask_size: Optional[dict[str, int]] - mask_pad_size: Optional[dict[str, int]] + point_pad_value: int + mask_size: dict[str, int] + mask_pad_size: dict[str, int] class SamProcessorKwargs(ProcessingKwargs, total=False): diff --git a/src/transformers/models/sam2/image_processing_sam2_fast.py b/src/transformers/models/sam2/image_processing_sam2_fast.py index b00fba952973..014354d8c642 
100644 --- a/src/transformers/models/sam2/image_processing_sam2_fast.py +++ b/src/transformers/models/sam2/image_processing_sam2_fast.py @@ -49,7 +49,7 @@ class Sam2FastImageProcessorKwargs(ImagesKwargs, total=False): The size `{"height": int, "width": int}` to resize the segmentation maps to. """ - mask_size: Optional[dict[str, int]] + mask_size: dict[str, int] def _compute_stability_score(masks: "torch.Tensor", mask_threshold: float, stability_score_offset: int): diff --git a/src/transformers/models/sam2/modular_sam2.py b/src/transformers/models/sam2/modular_sam2.py index eac3cc232c1a..d451fc946e6d 100644 --- a/src/transformers/models/sam2/modular_sam2.py +++ b/src/transformers/models/sam2/modular_sam2.py @@ -76,7 +76,7 @@ class Sam2FastImageProcessorKwargs(ImagesKwargs, total=False): The size `{"height": int, "width": int}` to resize the segmentation maps to. """ - mask_size: Optional[dict[str, int]] + mask_size: dict[str, int] @auto_docstring diff --git a/src/transformers/models/sam_hq/processing_samhq.py b/src/transformers/models/sam_hq/processing_samhq.py index 8e1945971ebc..2c3b36d98c0a 100644 --- a/src/transformers/models/sam_hq/processing_samhq.py +++ b/src/transformers/models/sam_hq/processing_samhq.py @@ -36,9 +36,9 @@ class SamHQImagesKwargs(ImagesKwargs, total=False): input_points: Optional[list[list[float]]] input_labels: Optional[list[list[int]]] input_boxes: Optional[list[list[list[float]]]] - point_pad_value: Optional[int] - mask_size: Optional[dict[str, int]] - mask_pad_size: Optional[dict[str, int]] + point_pad_value: int + mask_size: dict[str, int] + mask_pad_size: dict[str, int] class SamHQProcessorKwargs(ProcessingKwargs, total=False): diff --git a/src/transformers/models/segformer/image_processing_segformer.py b/src/transformers/models/segformer/image_processing_segformer.py index bedb4ff54651..ede9d589294b 100644 --- a/src/transformers/models/segformer/image_processing_segformer.py +++ 
b/src/transformers/models/segformer/image_processing_segformer.py @@ -63,7 +63,7 @@ class SegformerImageProcessorKwargs(ImagesKwargs, total=False): ADE20k). The background label will be replaced by 255. """ - do_reduce_labels: Optional[bool] + do_reduce_labels: bool @requires(backends=("vision",)) diff --git a/src/transformers/models/siglip2/image_processing_siglip2.py b/src/transformers/models/siglip2/image_processing_siglip2.py index d71b66464223..85063fc9078a 100644 --- a/src/transformers/models/siglip2/image_processing_siglip2.py +++ b/src/transformers/models/siglip2/image_processing_siglip2.py @@ -57,8 +57,8 @@ class Siglip2ImageProcessorKwargs(ImagesKwargs, total=False): and then padded in "patch" dimension to match this number exactly. """ - patch_size: Optional[int] - max_num_patches: Optional[int] + patch_size: int + max_num_patches: int @lru_cache(maxsize=256) diff --git a/src/transformers/models/smolvlm/image_processing_smolvlm.py b/src/transformers/models/smolvlm/image_processing_smolvlm.py index c12c08182a94..a946cc0c191b 100644 --- a/src/transformers/models/smolvlm/image_processing_smolvlm.py +++ b/src/transformers/models/smolvlm/image_processing_smolvlm.py @@ -64,9 +64,9 @@ class SmolVLMImageProcessorKwargs(ImagesKwargs, total=False): Whether to return the row and column information of the images. 
""" - do_image_splitting: Optional[bool] - max_image_size: Optional[dict[str, int]] - return_row_col_info: Optional[bool] + do_image_splitting: bool + max_image_size: dict[str, int] + return_row_col_info: bool MAX_IMAGE_SIZE = 4096 # 4k resolution as absolute maximum diff --git a/src/transformers/models/smolvlm/video_processing_smolvlm.py b/src/transformers/models/smolvlm/video_processing_smolvlm.py index d8cecb6c0c5c..09751486f0ae 100644 --- a/src/transformers/models/smolvlm/video_processing_smolvlm.py +++ b/src/transformers/models/smolvlm/video_processing_smolvlm.py @@ -91,7 +91,7 @@ def get_resize_output_image_size( class SmolVLMVideoProcessorInitKwargs(VideosKwargs, total=False): - max_image_size: Optional[dict[str, int]] + max_image_size: dict[str, int] class SmolVLMVideoProcessor(BaseVideoProcessor): diff --git a/src/transformers/models/superpoint/image_processing_superpoint.py b/src/transformers/models/superpoint/image_processing_superpoint.py index 9c810c450ad7..57b1a9dc6cb1 100644 --- a/src/transformers/models/superpoint/image_processing_superpoint.py +++ b/src/transformers/models/superpoint/image_processing_superpoint.py @@ -52,7 +52,7 @@ class SuperPointImageProcessorKwargs(ImagesKwargs, total=False): Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method. 
""" - do_grayscale: Optional[bool] = True + do_grayscale: bool def is_grayscale( diff --git a/src/transformers/models/swin2sr/image_processing_swin2sr.py b/src/transformers/models/swin2sr/image_processing_swin2sr.py index d9de6c684959..0ba052e92e05 100644 --- a/src/transformers/models/swin2sr/image_processing_swin2sr.py +++ b/src/transformers/models/swin2sr/image_processing_swin2sr.py @@ -39,7 +39,7 @@ class Swin2SRImageProcessorKwargs(ImagesKwargs, total=False): - size_divisor: Optional[int] + size_divisor: int class Swin2SRImageProcessor(BaseImageProcessor): diff --git a/src/transformers/models/textnet/image_processing_textnet.py b/src/transformers/models/textnet/image_processing_textnet.py index e5e127d987e0..bd7aa6f5086e 100644 --- a/src/transformers/models/textnet/image_processing_textnet.py +++ b/src/transformers/models/textnet/image_processing_textnet.py @@ -50,7 +50,7 @@ class TextNetImageProcessorKwargs(ImagesKwargs, total=False): - size_divisor: Optional[int] + size_divisor: int class TextNetImageProcessor(BaseImageProcessor): diff --git a/src/transformers/models/tvp/image_processing_tvp.py b/src/transformers/models/tvp/image_processing_tvp.py index 42834287e110..d1ae5c374b4b 100644 --- a/src/transformers/models/tvp/image_processing_tvp.py +++ b/src/transformers/models/tvp/image_processing_tvp.py @@ -60,7 +60,7 @@ class TvpImageProcessorKwargs(ImagesKwargs, total=False): Padding mode to use — `'constant'`, `'edge'`, `'reflect'`, or `'symmetric'`. 
""" - do_flip_channel_order: Optional[bool] + do_flip_channel_order: bool constant_values: Optional[Union[float, list[float]]] pad_mode: Optional[str] diff --git a/src/transformers/models/vilt/image_processing_vilt.py b/src/transformers/models/vilt/image_processing_vilt.py index fadb5302d4ee..5c1b2acf6e4b 100644 --- a/src/transformers/models/vilt/image_processing_vilt.py +++ b/src/transformers/models/vilt/image_processing_vilt.py @@ -48,7 +48,7 @@ class ViltImageProcessorKwargs(ImagesKwargs, total=False): - size_divisor: Optional[int] + size_divisor: int def max_across_indices(values: Iterable[Any]) -> list[Any]: diff --git a/src/transformers/models/vitmatte/image_processing_vitmatte.py b/src/transformers/models/vitmatte/image_processing_vitmatte.py index eb994b641962..ea54ba603435 100644 --- a/src/transformers/models/vitmatte/image_processing_vitmatte.py +++ b/src/transformers/models/vitmatte/image_processing_vitmatte.py @@ -42,7 +42,7 @@ class VitMatteImageProcessorKwargs(ImagesKwargs, total=False): - size_divisor: Optional[int] + size_divisor: int class VitMatteImageProcessor(BaseImageProcessor): diff --git a/src/transformers/models/yolos/image_processing_yolos.py b/src/transformers/models/yolos/image_processing_yolos.py index 19fda87897ae..b594c296707b 100644 --- a/src/transformers/models/yolos/image_processing_yolos.py +++ b/src/transformers/models/yolos/image_processing_yolos.py @@ -97,9 +97,9 @@ class YolosImageProcessorKwargs(ImagesKwargs, total=False): Path to the directory containing the segmentation masks. 
""" - format: Optional[Union[str, AnnotationFormat]] - do_convert_annotations: Optional[bool] - return_segmentation_masks: Optional[bool] + format: Union[str, AnnotationFormat] + do_convert_annotations: bool + return_segmentation_masks: bool annotations: Optional[Union[AnnotationType, list[AnnotationType]]] masks_path: Optional[Union[str, pathlib.Path]] diff --git a/src/transformers/models/zoedepth/image_processing_zoedepth.py b/src/transformers/models/zoedepth/image_processing_zoedepth.py index 3fdf414bc20c..d94a2ee088eb 100644 --- a/src/transformers/models/zoedepth/image_processing_zoedepth.py +++ b/src/transformers/models/zoedepth/image_processing_zoedepth.py @@ -77,8 +77,8 @@ class ZoeDepthImageProcessorKwargs(ImagesKwargs, total=False): Can be overridden by `ensure_multiple_of` in `preprocess`. """ - keep_aspect_ratio: Optional[bool] - ensure_multiple_of: Optional[int] + keep_aspect_ratio: bool + ensure_multiple_of: int def get_resize_output_image_size( diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index ebd5ab276c9d..55844c8d9cce 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -14,7 +14,6 @@ """ Processing saving/loading class for common processors. 
""" -# from __future__ import annotations import bisect import copy From 774c2603e748f57e5708126f36dd136300fb39be Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 8 Oct 2025 16:01:13 +0200 Subject: [PATCH 28/28] frigit to fix copies --- src/transformers/models/aria/processing_aria.py | 6 +++--- .../image_processing_cohere2_vision_fast.py | 6 +++--- .../image_processing_deepseek_vl_hybrid.py | 10 +++++----- .../models/janus/image_processing_janus.py | 2 +- src/transformers/models/sam_hq/processing_samhq.py | 2 +- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/aria/processing_aria.py b/src/transformers/models/aria/processing_aria.py index ee8a8fd3f9ef..d0841c96aee2 100644 --- a/src/transformers/models/aria/processing_aria.py +++ b/src/transformers/models/aria/processing_aria.py @@ -31,9 +31,9 @@ class AriaImagesKwargs(ImagesKwargs, total=False): - split_image: Optional[bool] - max_image_size: Optional[int] - min_image_size: Optional[int] + split_image: bool + max_image_size: int + min_image_size: int class AriaProcessorKwargs(ProcessingKwargs, total=False): diff --git a/src/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py b/src/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py index 638b7549bfae..afdd683e2312 100644 --- a/src/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +++ b/src/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py @@ -46,9 +46,9 @@ class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs, total=False): set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method. 
""" - crop_to_patches: Optional[bool] - min_patches: Optional[int] - max_patches: Optional[int] + crop_to_patches: bool + min_patches: int + max_patches: int @lru_cache(maxsize=10) diff --git a/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py index 7837cff2d33b..c91aab91fca5 100644 --- a/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py @@ -69,11 +69,11 @@ class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs, total=False): number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method. """ - min_size: Optional[int] - high_res_size: Optional[dict] - high_res_resample: Optional[Union["PILImageResampling", int]] - high_res_image_mean: Optional[Union[float, list[float], tuple[float, ...]]] - high_res_image_std: Optional[Union[float, list[float], tuple[float, ...]]] + min_size: int + high_res_size: dict + high_res_resample: Union["PILImageResampling", int] + high_res_image_mean: Union[float, list[float], tuple[float, ...]] + high_res_image_std: Union[float, list[float], tuple[float, ...]] class DeepseekVLHybridImageProcessor(BaseImageProcessor): diff --git a/src/transformers/models/janus/image_processing_janus.py b/src/transformers/models/janus/image_processing_janus.py index 6b83ecf8eb5c..c47461174516 100644 --- a/src/transformers/models/janus/image_processing_janus.py +++ b/src/transformers/models/janus/image_processing_janus.py @@ -58,7 +58,7 @@ class JanusImageProcessorKwargs(ImagesKwargs, total=False): falls below this value after resizing. 
""" - min_size: Optional[int] + min_size: int class JanusImageProcessor(BaseImageProcessor): diff --git a/src/transformers/models/sam_hq/processing_samhq.py b/src/transformers/models/sam_hq/processing_samhq.py index 2c3b36d98c0a..d0b11ab06146 100644 --- a/src/transformers/models/sam_hq/processing_samhq.py +++ b/src/transformers/models/sam_hq/processing_samhq.py @@ -36,7 +36,7 @@ class SamHQImagesKwargs(ImagesKwargs, total=False): input_points: Optional[list[list[float]]] input_labels: Optional[list[list[int]]] input_boxes: Optional[list[list[list[float]]]] - point_pad_value: int + point_pad_value: Optional[int] mask_size: dict[str, int] mask_pad_size: dict[str, int]