From 762b6517fdd36b2fbf372c36e3bdd6e9701cbedd Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 10 Sep 2025 14:36:27 +0200 Subject: [PATCH 01/28] initial design draft --- smolvlm.py | 119 ++++++++++++ src/transformers/processing_utils.py | 108 ++++++----- src/transformers/utils/type_validators.py | 174 ++++++++++++++++++ .../models/colpali/test_processing_colpali.py | 16 +- .../colqwen2/test_processing_colqwen2.py | 16 +- tests/models/janus/test_processing_janus.py | 2 +- tests/models/mllama/test_processing_mllama.py | 2 +- .../models/smolvlm/test_processing_smolvlm.py | 2 +- tests/test_processing_common.py | 30 +-- 9 files changed, 389 insertions(+), 80 deletions(-) create mode 100644 smolvlm.py create mode 100644 src/transformers/utils/type_validators.py diff --git a/smolvlm.py b/smolvlm.py new file mode 100644 index 000000000000..ef01fd9f087c --- /dev/null +++ b/smolvlm.py @@ -0,0 +1,119 @@ +from transformers import Qwen2VLProcessor + +if __name__ == "__main__": + + for i in range(1): + processor = Qwen2VLProcessor.from_pretrained(pretrained_model_name_or_path="Qwen/Qwen2.5-VL-3B-Instruct", use_fast=True) + processor + + +from typing_extensions import Unpack +from transformers.tokenization_utils_base import PaddingStrategy +from typing import Union, TypeVar, Generic, get_type_hints, TypedDict, Literal, Annotated, Optional, get_origin, get_args +from dataclasses import make_dataclass, field + +my_int = TypeVar('my_int', bound=int) + + +class Mixin: + def mixin_method(self): + return 0 + +class Stack(Mixin, Generic[my_int]): + def __init__(self) -> None: + # Create an empty list with items of type T + self.items: list[my_int] = [] + + def push(self, item: my_int) -> None: + self.items.append(item) + + +class ModelStack(Stack[str]): + pass + +s = ModelStack() +s.push(0) + + + +from dataclasses import dataclass, MISSING, fields +from huggingface_hub.dataclasses import as_validated_field, strict, validated_field + +def positive_int(value: int): + if not value >= 0: + 
raise ValueError(f"Value must be positive, got {value}") + + +def multiple_of_64(value: int): + if not value % 64 == 0: + raise ValueError(f"Value must be a multiple of 64, got {value}") + + +@as_validated_field +def strictly_positive(value: int = None): + if value is not None and not value > 0: + raise ValueError(f"Value must be strictly positive, got {value}") + +@as_validated_field +def padding_validator(value: Union[bool, str, PaddingStrategy] = None): + if value is None: + return + + if not isinstance(value, (bool, str, PaddingStrategy)): + raise ValueError(f"Value must be padding") + if isinstance(value, str) and value not in ["longest", "max_length", "do_not_pad"]: + raise ValueError(f'Value for padding must be one of ["longest", "max_length", "do_not_pad"]') + +@strict +@dataclass +class Config: + model_type: str + hidden_size: int = validated_field(validator=[positive_int, multiple_of_64]) + vocab_size: int = strictly_positive(default=16) + + +class AnotherKwargs(TypedDict, total=False): + name: Union[str, list[str]] + age: Annotated[Optional[int], strictly_positive()] + padding: Annotated[Optional[Union[bool, str, PaddingStrategy]], padding_validator()] + padding_side: Optional[Literal["right", "left"]] + + +def unpack_annotated_type(type): + if get_origin(type) is Annotated: + base, *meta = get_args(type) + return base, meta[0] + return type, field(default=MISSING) + + +def dataclass_from_typed_dict(td: type[TypedDict]): + hints = get_type_hints(td, include_extras=True) + dc_fields = [ + (k, *unpack_annotated_type(v)) + for k, v in hints.items() + ] + return make_dataclass(td.__name__ + "Config", dc_fields) + + +class HubTypeAdapter(): + def __init__(self, type: type[TypedDict]) -> None: + self.type = type + dataclass = dataclass_from_typed_dict(type) + self.dataclass = strict(dataclass) + + def validate_fields(self, **kwargs): + for f in fields(self.dataclass): + if f.name not in kwargs: + kwargs[f.name] = None + self.dataclass(**kwargs) + + +config = 
Config(model_type="bert", vocab_size=30000, hidden_size=768) +print(config.__dataclass_fields__) +assert config.model_type == "bert" +assert config.vocab_size == 30000 +assert config.hidden_size == 768 + +HubTypeAdapter(AnotherKwargs).validate_fields(name=["BOB", "MARY"], age=100, padding=None) +print(AnotherKwargs.__annotations__['age']) + diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 3130d0ded34f..038c9f5ecaa6 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -23,9 +23,10 @@ import sys import typing import warnings +from collections.abc import Iterable from dataclasses import dataclass from pathlib import Path -from typing import Any, Optional, TypedDict, TypeVar, Union +from typing import Annotated, Any, Literal, Optional, TypedDict, TypeVar, Union import numpy as np import typing_extensions @@ -36,6 +37,18 @@ from .feature_extraction_utils import BatchFeature from .image_utils import ChannelDimension, ImageInput, is_vision_available from .utils.chat_template_utils import render_jinja_template +from .utils.type_validators import ( + TypedDictAdapter, + device_validator, + image_size_validator, + padding_validator, + resampling_validator, + strictly_positive_int, + strictly_positive_number, + tensor_type_validator, + truncation_validator, + video_metadata_validator, +) from .video_utils import VideoInput, VideoMetadata @@ -138,15 +151,15 @@ class TextKwargs(TypedDict, total=False): """ text_pair: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] - text_target: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] + text_target: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] text_pair_target: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] add_special_tokens: Optional[bool] - padding: Union[bool, str, PaddingStrategy] - 
truncation: Union[bool, str, TruncationStrategy] - max_length: Optional[int] - stride: Optional[int] + padding: Annotated[Optional[Union[bool, str, PaddingStrategy]], padding_validator()] + truncation: Annotated[Optional[Union[bool, str, TruncationStrategy]], truncation_validator()] + max_length: Annotated[Optional[int], strictly_positive_int()] + stride: Annotated[Optional[int], strictly_positive_int()] is_split_into_words: Optional[bool] - pad_to_multiple_of: Optional[int] + pad_to_multiple_of: Annotated[Optional[int], strictly_positive_int()] return_token_type_ids: Optional[bool] return_attention_mask: Optional[bool] return_overflowing_tokens: Optional[bool] @@ -154,8 +167,9 @@ class TextKwargs(TypedDict, total=False): return_offsets_mapping: Optional[bool] return_length: Optional[bool] verbose: Optional[bool] - padding_side: Optional[str] + padding_side: Optional[Literal["left", "right"]] return_mm_token_type_ids: Optional[bool] + return_tensors: Annotated[Optional[Union[str, TensorType]], tensor_type_validator()] class ImagesKwargs(TypedDict, total=False): @@ -199,21 +213,22 @@ class methods and docstrings. 
""" do_resize: Optional[bool] - size: Optional[dict[str, int]] - size_divisor: Optional[int] - crop_size: Optional[dict[str, int]] - resample: Optional[Union["PILImageResampling", int]] + size: Annotated[Optional[dict[str, int]], image_size_validator()] + size_divisor: Annotated[Optional[int], strictly_positive_int()] + crop_size: Annotated[Optional[dict[str, int]], image_size_validator()] + resample: Annotated[Optional[Union["PILImageResampling", int]], resampling_validator()] do_rescale: Optional[bool] rescale_factor: Optional[float] do_normalize: Optional[bool] image_mean: Optional[Union[float, list[float]]] image_std: Optional[Union[float, list[float]]] do_pad: Optional[bool] - pad_size: Optional[dict[str, int]] + pad_size: Annotated[Optional[dict[str, int]], image_size_validator()] do_center_crop: Optional[bool] - data_format: Optional[ChannelDimension] + data_format: Optional[Union[str, ChannelDimension]] input_data_format: Optional[Union[str, ChannelDimension]] - device: Optional[str] + device: Annotated[Optional[str], device_validator()] + return_tensors: Annotated[Optional[Union[str, TensorType]], tensor_type_validator()] class VideosKwargs(TypedDict, total=False): @@ -267,10 +282,10 @@ class VideosKwargs(TypedDict, total=False): do_convert_rgb: Optional[bool] do_resize: Optional[bool] - size: Optional[dict[str, int]] - size_divisor: Optional[int] + size: Annotated[Optional[dict[str, int]], image_size_validator()] + size_divisor: Annotated[Optional[int], strictly_positive_int()] default_to_square: Optional[bool] - resample: Optional["PILImageResampling"] + resample: Annotated[Optional[Union["PILImageResampling", int]], resampling_validator()] do_rescale: Optional[bool] rescale_factor: Optional[float] do_normalize: Optional[bool] @@ -278,15 +293,18 @@ class VideosKwargs(TypedDict, total=False): image_std: Optional[Union[float, list[float]]] do_pad: Optional[bool] do_center_crop: Optional[bool] - crop_size: Optional[dict[str, int]] + crop_size: 
Annotated[Optional[dict[str, int]], image_size_validator()] data_format: Optional[ChannelDimension] input_data_format: Optional[Union[str, ChannelDimension]] - device: Optional[str] + device: Annotated[Optional[str], device_validator()] do_sample_frames: Optional[bool] - video_metadata: Optional[Union[VideoMetadata, dict]] - fps: Optional[Union[int, float]] - num_frames: Optional[int] + video_metadata: Annotated[ + Optional[Union[VideoMetadata, dict, Iterable[VideoMetadata, dict]]], video_metadata_validator() + ] + fps: Annotated[Optional[Union[int, float]], strictly_positive_number()] + num_frames: Annotated[Optional[int], strictly_positive_int()] return_metadata: Optional[bool] + return_tensors: Annotated[Optional[Union[str, TensorType]], tensor_type_validator()] class AudioKwargs(TypedDict, total=False): @@ -319,17 +337,14 @@ class AudioKwargs(TypedDict, total=False): Whether or not [`~ASTFeatureExtractor.__call__`] should return `attention_mask`. """ - sampling_rate: Optional[int] + sampling_rate: Annotated[Optional[int], strictly_positive_int()] raw_speech: Optional[Union["np.ndarray", list[float], list["np.ndarray"], list[list[float]]]] - padding: Optional[Union[bool, str, PaddingStrategy]] - max_length: Optional[int] - truncation: Optional[bool] - pad_to_multiple_of: Optional[int] + padding: Annotated[Optional[Union[bool, str, PaddingStrategy]], padding_validator()] + max_length: Annotated[Optional[int], strictly_positive_int()] + truncation: Annotated[Optional[Union[bool, str, TruncationStrategy]], truncation_validator()] + pad_to_multiple_of: Annotated[Optional[int], strictly_positive_int()] return_attention_mask: Optional[bool] - - -class CommonKwargs(TypedDict, total=False): - return_tensors: Optional[Union[str, TensorType]] + return_tensors: Annotated[Optional[Union[str, TensorType]], tensor_type_validator()] class ProcessingKwargs(TypedDict, total=False): @@ -373,9 +388,6 @@ class CustomProcessorKwargs(ProcessingKwargs, total=False): _defaults = {} - 
common_kwargs: CommonKwargs = { - **CommonKwargs.__annotations__, - } text_kwargs: TextKwargs = { **TextKwargs.__annotations__, } @@ -1248,7 +1260,6 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg "images_kwargs": {}, "audio_kwargs": {}, "videos_kwargs": {}, - "common_kwargs": {}, } default_kwargs = { @@ -1256,7 +1267,6 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg "images_kwargs": {}, "audio_kwargs": {}, "videos_kwargs": {}, - "common_kwargs": {}, } possible_modality_keywords = {"text", "audio", "videos", "images"} @@ -1314,17 +1324,21 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg else: # kwargs is a flat dictionary for key, kwarg in kwargs.items(): - if key not in used_keys: - if key in ModelProcessorKwargs.__annotations__["common_kwargs"].__annotations__: - output_kwargs["common_kwargs"][key] = kwarg - elif key not in possible_modality_keywords: - logger.warning_once( - f"Keyword argument `{key}` is not a valid argument for this processor and will be ignored." - ) + if key not in used_keys and key not in possible_modality_keywords: + logger.warning_once( + f"Keyword argument `{key}` is not a valid argument for this processor and will be ignored." 
+ ) + + # BC for `common_kwargs` to update all modality-specific kwargs + common_kwargs = kwargs.get("common_kwargs", {}) + if common_kwargs: + for kwarg in output_kwargs.values(): + kwarg.update(common_kwargs) - # all modality-specific kwargs are updated with common kwargs - for kwarg in output_kwargs.values(): - kwarg.update(output_kwargs["common_kwargs"]) + # Perform type validation on collected kwargs + for key, typed_dict_obj in ModelProcessorKwargs.__annotations__.items(): + type_validator = TypedDictAdapter(typed_dict_obj) + type_validator.validate_fields(**output_kwargs[key]) return output_kwargs @classmethod diff --git a/src/transformers/utils/type_validators.py b/src/transformers/utils/type_validators.py new file mode 100644 index 000000000000..cb846b8a6255 --- /dev/null +++ b/src/transformers/utils/type_validators.py @@ -0,0 +1,174 @@ +from collections.abc import Iterable +from dataclasses import MISSING, field, make_dataclass +from typing import Annotated, Optional, TypedDict, Union, get_args, get_origin, get_type_hints + +from huggingface_hub.dataclasses import as_validated_field, strict + +from ..tokenization_utils_base import PaddingStrategy, TruncationStrategy +from ..video_utils import VideoMetadata +from .generic import TensorType +from .import_utils import is_vision_available + + +if is_vision_available(): + from ..image_utils import PILImageResampling + + +def unpack_annotated_type(type): + if get_origin(type) is Annotated: + base, *meta = get_args(type) + return base, meta[0] + return type, field(default=MISSING) + + +# Minimalistic version on pydantic.TypeAdapter tailored for `TypedDict` +class TypedDictAdapter: + """ + A utility class used to convert a TypedDict object to dataclass and attach + a hub validator on top based on TypedDict annotations. + + Args: + type: The TypedDict object that needs to be validated. 
+ """ + + def __init__(self, type: type[TypedDict]) -> None: + self.type = type + self.dataclass = self.create_dataclass() + self.dataclass = strict(self.dataclass) + + def validate_fields(self, **kwargs): + # If not all kwargs are set, dataclass raises an error in python <= 3.9 + # In newer python we can bypass by creating a dataclass with `kw_only=True` + for field in self.fields: + if field[0] not in kwargs: + kwargs[field[0]] = None + self.dataclass(**kwargs) + + def create_dataclass(self): + """ + Creates a dataclass object dynamically from `TypedDict`, so that + we can use strict type validation from typing hints with `TypedDict`. + + Example: + + @as_validated_field + def padding_validator(value: Union[bool, str, PaddingStrategy] = None): + if value is None: + return + if not isinstance(value, (bool, str, PaddingStrategy)): + raise ValueError(f"Value must be one of '[bool, string, PaddingStrategy]'") + if isinstance(value, str) and value not in ["longest", "max_length", "do_not_pad"]: + raise ValueError(f'Value for padding must be one of `["longest", "max_length", "do_not_pad"]`') + + class TokenizerKwargs(TypedDict, total=False): + text: str + padding: Annotated[Union[bool, str, PaddingStrategy], padding_validator()] + + # Now we can create a dataclass and warp it with hub validators for type constraints + # The dataclass can also be used as a simple config class for easier kwarg management + dataclass = dataclass_from_typed_dict(TokenizerKwargs) + """ + hints = get_type_hints(self.type, include_extras=True) + fields = [(k, *unpack_annotated_type(v)) for k, v in hints.items()] + self.fields = fields + return make_dataclass(self.type.__name__ + "Config", fields) + + +@as_validated_field +def strictly_positive_number(value: Optional[Union[int, float]] = None): + if value is not None and (not isinstance(value, (int, float)) or not value > 0): + raise ValueError(f"Value must be strictly positive, got {value}") + + +@as_validated_field +def 
strictly_positive_int(value: Optional[int] = None): + if value is not None and (not isinstance(value, int) or not value > 0): + raise ValueError(f"Value must be strictly positive integer, got {value}") + + +@as_validated_field +def padding_validator(value: Optional[Union[bool, str, PaddingStrategy]] = None): + possible_names = ["longest", "max_length", "do_not_pad"] + if value is None: + pass + elif not isinstance(value, (bool, str, PaddingStrategy)): + raise ValueError("Value for padding must be either a boolean, a string or a `PaddingStrategy`") + elif isinstance(value, str) and value not in possible_names: + raise ValueError(f"If padding is a string, the value must be one of {possible_names}") + + +@as_validated_field +def truncation_validator(value: Optional[Union[bool, str, TruncationStrategy]] = None): + possible_names = ["only_first", "only_second", "longest_first", "do_not_truncate"] + if value is None: + pass + elif not isinstance(value, (bool, str, TruncationStrategy)): + raise ValueError("Value for truncation must be either a boolean, a string or a `TruncationStrategy`") + elif isinstance(value, str) and value not in possible_names: + raise ValueError(f"If truncation is a string, value must be one of {possible_names}") + + +@as_validated_field +def image_size_validator(value: Optional[dict[str, int]] = None): + possible_keys = ["height", "width", "longest_edge", "shortest_edge", "max_height", "max_width"] + if value is None: + pass + elif not isinstance(value, dict) or any(k not in possible_keys for k in value.keys()): + raise ValueError(f"Value for size must be a dict with keys {possible_keys} but got size={value}") + + +@as_validated_field +def device_validator(value: Optional[Union[str, int]] = None): + possible_names = ["cpu", "cuda", "xla", "xpu", "mps", "meta"] + if value is None: + pass + elif isinstance(value, int) and value < 0: + raise ValueError( + f"If device is an integer, the value must be a strictly positive integer but got device={value}" 
+ ) + elif isinstance(value, str) or value.split(":")[0] not in possible_names: + raise ValueError(f"If device is an integer, the value must be one of {possible_names} but got device={value}") + elif not isinstance(value, (int, str)): + raise ValueError( + f"Device must be either an integer device ID or a string (e.g., 'cpu', 'cuda:0'), but got device={value}" + ) + + +@as_validated_field +def resampling_validator(value: Optional[Union[int, PILImageResampling]] = None): + if value is None: + pass + elif isinstance(value, int) and value not in list(range(6)): + raise ValueError( + f"The resampling should be one of {list(range(6))} when provided as integer, but got resampling={value}" + ) + elif isinstance(value, (PILImageResampling, int)): + raise ValueError(f"The resampling should an integer or `PIL.Image.Resampling`, but got resampling={value}") + + +@as_validated_field +def video_metadata_validator(value: Optional[Union[VideoMetadata, dict, Iterable[VideoMetadata, dict]]] = None): + possible_keys = ["total_num_frames", "fps", "width", "height", "duration", "video_backend", "frames_indices"] + if value is None: + pass + elif isinstance(value, Iterable) and not all(isinstance(item, (VideoMetadata, dict)) for item in value): + raise ValueError( + f"If `video_metadata` is a list, each item in the list should be either a dict or a `VideoMetadata` object but got video_metadata={value}" + ) + elif isinstance(value, dict) and not all(key in possible_keys for key in value.keys()): + raise ValueError( + f"If video_metadata is a dict, the keys should be one of {possible_keys} but got device={value.keys()}" + ) + elif not isinstance(value, (VideoMetadata, dict, Iterable)): + raise ValueError( + f"Video metadata must be either a dict, a VideoMetadata or a batched list of metadata, but got device={value}" + ) + + +@as_validated_field +def tensor_type_validator(value: Optional[Union[str, TensorType]] = None): + possible_names = ["pt", "np", "mlx"] + if value is None: + pass + 
elif not isinstance(value, str) or value not in possible_names: + raise ValueError(f"The tensor type should be one of {possible_names} but got tensor_type={value}") diff --git a/tests/models/colpali/test_processing_colpali.py b/tests/models/colpali/test_processing_colpali.py index 221836db8423..119af1432ce1 100644 --- a/tests/models/colpali/test_processing_colpali.py +++ b/tests/models/colpali/test_processing_colpali.py @@ -133,7 +133,7 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): def test_image_processor_defaults_preserved_by_image_kwargs(self): """ - We use do_rescale=True, rescale_factor=-1 to ensure that image_processor kwargs are preserved in the processor. + We use do_rescale=True, rescale_factor=-1.0 to ensure that image_processor kwargs are preserved in the processor. We then check that the mean of the pixel_values is less than or equal to 0 after processing. Since the original pixel_values are in [0, 255], this is a good indicator that the rescale_factor is indeed applied. 
""" @@ -141,7 +141,7 @@ def test_image_processor_defaults_preserved_by_image_kwargs(self): self.skipTest(f"image_processor attribute not present in {self.processor_class}") processor_components = self.prepare_components() processor_components["image_processor"] = self.get_component( - "image_processor", do_rescale=True, rescale_factor=-1 + "image_processor", do_rescale=True, rescale_factor=-1.0 ) processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") @@ -179,7 +179,7 @@ def test_kwargs_overrides_default_image_processor_kwargs(self): image_input = self.prepare_image_inputs() - inputs = processor(images=image_input, do_rescale=True, rescale_factor=-1, return_tensors="pt") + inputs = processor(images=image_input, do_rescale=True, rescale_factor=-1.0, return_tensors="pt") self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0) def test_unstructured_kwargs(self): @@ -194,7 +194,7 @@ def test_unstructured_kwargs(self): text=input_str, return_tensors="pt", do_rescale=True, - rescale_factor=-1, + rescale_factor=-1.0, padding="max_length", max_length=76, ) @@ -213,7 +213,7 @@ def test_unstructured_kwargs_batched(self): images=image_input, return_tensors="pt", do_rescale=True, - rescale_factor=-1, + rescale_factor=-1.0, padding="longest", max_length=76, ) @@ -231,7 +231,7 @@ def test_doubly_passed_kwargs(self): with self.assertRaises(ValueError): _ = processor( images=image_input, - images_kwargs={"do_rescale": True, "rescale_factor": -1}, + images_kwargs={"do_rescale": True, "rescale_factor": -1.0}, do_rescale=True, return_tensors="pt", ) @@ -248,7 +248,7 @@ def test_structured_kwargs_nested(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"do_rescale": True, "rescale_factor": -1}, + "images_kwargs": {"do_rescale": True, "rescale_factor": -1.0}, "text_kwargs": {"padding": "max_length", "max_length": 76}, } @@ -268,7 +268,7 @@ def 
test_structured_kwargs_nested_from_dict(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"do_rescale": True, "rescale_factor": -1}, + "images_kwargs": {"do_rescale": True, "rescale_factor": -1.0}, "text_kwargs": {"padding": "max_length", "max_length": 76}, } diff --git a/tests/models/colqwen2/test_processing_colqwen2.py b/tests/models/colqwen2/test_processing_colqwen2.py index 7346c0d5079c..236456dd7f88 100644 --- a/tests/models/colqwen2/test_processing_colqwen2.py +++ b/tests/models/colqwen2/test_processing_colqwen2.py @@ -132,7 +132,7 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): def test_image_processor_defaults_preserved_by_image_kwargs(self): """ - We use do_rescale=True, rescale_factor=-1 to ensure that image_processor kwargs are preserved in the processor. + We use do_rescale=True, rescale_factor=-1.0 to ensure that image_processor kwargs are preserved in the processor. We then check that the mean of the pixel_values is less than or equal to 0 after processing. Since the original pixel_values are in [0, 255], this is a good indicator that the rescale_factor is indeed applied. 
""" @@ -140,7 +140,7 @@ def test_image_processor_defaults_preserved_by_image_kwargs(self): self.skipTest(f"image_processor attribute not present in {self.processor_class}") processor_components = self.prepare_components() processor_components["image_processor"] = self.get_component( - "image_processor", do_rescale=True, rescale_factor=-1 + "image_processor", do_rescale=True, rescale_factor=-1.0 ) processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") @@ -178,7 +178,7 @@ def test_kwargs_overrides_default_image_processor_kwargs(self): image_input = self.prepare_image_inputs() - inputs = processor(images=image_input, do_rescale=True, rescale_factor=-1, return_tensors="pt") + inputs = processor(images=image_input, do_rescale=True, rescale_factor=-1.0, return_tensors="pt") self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0) def test_unstructured_kwargs(self): @@ -193,7 +193,7 @@ def test_unstructured_kwargs(self): text=input_str, return_tensors="pt", do_rescale=True, - rescale_factor=-1, + rescale_factor=-1.0, padding="max_length", max_length=76, ) @@ -212,7 +212,7 @@ def test_unstructured_kwargs_batched(self): images=image_input, return_tensors="pt", do_rescale=True, - rescale_factor=-1, + rescale_factor=-1.0, padding="longest", max_length=76, ) @@ -230,7 +230,7 @@ def test_doubly_passed_kwargs(self): with self.assertRaises(ValueError): _ = processor( images=image_input, - images_kwargs={"do_rescale": True, "rescale_factor": -1}, + images_kwargs={"do_rescale": True, "rescale_factor": -1.0}, do_rescale=True, return_tensors="pt", ) @@ -247,7 +247,7 @@ def test_structured_kwargs_nested(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"do_rescale": True, "rescale_factor": -1}, + "images_kwargs": {"do_rescale": True, "rescale_factor": -1.0}, "text_kwargs": {"padding": "max_length", "max_length": 76}, } @@ -267,7 +267,7 @@ def 
test_structured_kwargs_nested_from_dict(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"do_rescale": True, "rescale_factor": -1}, + "images_kwargs": {"do_rescale": True, "rescale_factor": -1.0}, "text_kwargs": {"padding": "max_length", "max_length": 76}, } diff --git a/tests/models/janus/test_processing_janus.py b/tests/models/janus/test_processing_janus.py index 7e1b025721dc..2ebf55862650 100644 --- a/tests/models/janus/test_processing_janus.py +++ b/tests/models/janus/test_processing_janus.py @@ -444,7 +444,7 @@ def test_chat_template_accepts_processing_kwargs(self): tokenize=True, return_dict=True, do_rescale=True, - rescale_factor=-1, + rescale_factor=-1.0, return_tensors="np", ) self.assertLessEqual(out_dict[self.images_input_name][0][0].mean(), 0) diff --git a/tests/models/mllama/test_processing_mllama.py b/tests/models/mllama/test_processing_mllama.py index be1472496823..9481d91f08bb 100644 --- a/tests/models/mllama/test_processing_mllama.py +++ b/tests/models/mllama/test_processing_mllama.py @@ -370,7 +370,7 @@ def test_unstructured_kwargs_batched(self): images=image_input, return_tensors="pt", do_rescale=True, - rescale_factor=-1, + rescale_factor=-1.0, padding="longest", max_length=76, ) diff --git a/tests/models/smolvlm/test_processing_smolvlm.py b/tests/models/smolvlm/test_processing_smolvlm.py index 3a11103d6efb..40aaaf7a6ca2 100644 --- a/tests/models/smolvlm/test_processing_smolvlm.py +++ b/tests/models/smolvlm/test_processing_smolvlm.py @@ -482,7 +482,7 @@ def test_unstructured_kwargs_batched_video(self): videos=video_input, return_tensors="pt", do_rescale=True, - rescale_factor=-1, + rescale_factor=-1.0, padding="max_length", max_length=172, ) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index eef97c5b06c7..8eb30d787c01 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -383,7 +383,7 @@ def 
test_tokenizer_defaults_preserved_by_kwargs(self): def test_image_processor_defaults_preserved_by_image_kwargs(self): """ - We use do_rescale=True, rescale_factor=-1 to ensure that image_processor kwargs are preserved in the processor. + We use do_rescale=True, rescale_factor=-1.0 to ensure that image_processor kwargs are preserved in the processor. We then check that the mean of the pixel_values is less than or equal to 0 after processing. Since the original pixel_values are in [0, 255], this is a good indicator that the rescale_factor is indeed applied. """ @@ -391,7 +391,7 @@ def test_image_processor_defaults_preserved_by_image_kwargs(self): self.skipTest(f"image_processor attribute not present in {self.processor_class}") processor_components = self.prepare_components() processor_components["image_processor"] = self.get_component( - "image_processor", do_rescale=True, rescale_factor=-1 + "image_processor", do_rescale=True, rescale_factor=-1.0 ) processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") processor_kwargs = self.prepare_processor_dict() @@ -437,7 +437,9 @@ def test_kwargs_overrides_default_image_processor_kwargs(self): input_str = self.prepare_text_inputs(modalities="image") image_input = self.prepare_image_inputs() - inputs = processor(text=input_str, images=image_input, do_rescale=True, rescale_factor=-1, return_tensors="pt") + inputs = processor( + text=input_str, images=image_input, do_rescale=True, rescale_factor=-1.0, return_tensors="pt" + ) self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0) def test_unstructured_kwargs(self): @@ -455,7 +457,7 @@ def test_unstructured_kwargs(self): images=image_input, return_tensors="pt", do_rescale=True, - rescale_factor=-1, + rescale_factor=-1.0, padding="max_length", max_length=76, ) @@ -478,7 +480,7 @@ def test_unstructured_kwargs_batched(self): images=image_input, return_tensors="pt", do_rescale=True, - rescale_factor=-1, + 
rescale_factor=-1.0, padding="longest", max_length=76, ) @@ -503,7 +505,7 @@ def test_doubly_passed_kwargs(self): _ = processor( text=input_str, images=image_input, - images_kwargs={"do_rescale": True, "rescale_factor": -1}, + images_kwargs={"do_rescale": True, "rescale_factor": -1.0}, do_rescale=True, return_tensors="pt", ) @@ -534,7 +536,7 @@ def test_structured_kwargs_nested(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"do_rescale": True, "rescale_factor": -1}, + "images_kwargs": {"do_rescale": True, "rescale_factor": -1.0}, "text_kwargs": {"padding": "max_length", "max_length": 76}, } @@ -557,7 +559,7 @@ def test_structured_kwargs_nested_from_dict(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"do_rescale": True, "rescale_factor": -1}, + "images_kwargs": {"do_rescale": True, "rescale_factor": -1.0}, "text_kwargs": {"padding": "max_length", "max_length": 76}, } @@ -683,7 +685,7 @@ def test_tokenizer_defaults_preserved_by_kwargs_video(self): def test_video_processor_defaults_preserved_by_video_kwargs(self): """ - We use do_rescale=True, rescale_factor=-1 to ensure that image_processor kwargs are preserved in the processor. + We use do_rescale=True, rescale_factor=-1.0 to ensure that image_processor kwargs are preserved in the processor. We then check that the mean of the pixel_values is less than or equal to 0 after processing. Since the original pixel_values are in [0, 255], this is a good indicator that the rescale_factor is indeed applied. 
""" @@ -691,7 +693,7 @@ def test_video_processor_defaults_preserved_by_video_kwargs(self): self.skipTest(f"video_processor attribute not present in {self.processor_class}") processor_components = self.prepare_components() processor_components["video_processor"] = self.get_component( - "video_processor", do_rescale=True, rescale_factor=-1 + "video_processor", do_rescale=True, rescale_factor=-1.0 ) processor_components["tokenizer"] = self.get_component("tokenizer", max_length=167, padding="max_length") processor_kwargs = self.prepare_processor_dict() @@ -747,7 +749,7 @@ def test_kwargs_overrides_default_video_processor_kwargs(self): videos=video_input, do_sample_frames=False, do_rescale=True, - rescale_factor=-1, + rescale_factor=-1.0, return_tensors="pt", ) self.assertLessEqual(inputs[self.videos_input_name][0].mean(), 0) @@ -768,7 +770,7 @@ def test_unstructured_kwargs_video(self): do_sample_frames=False, return_tensors="pt", do_rescale=True, - rescale_factor=-1, + rescale_factor=-1.0, padding="max_length", max_length=176, ) @@ -792,7 +794,7 @@ def test_unstructured_kwargs_batched_video(self): do_sample_frames=False, return_tensors="pt", do_rescale=True, - rescale_factor=-1, + rescale_factor=-1.0, padding="longest", max_length=176, ) @@ -818,7 +820,7 @@ def test_doubly_passed_kwargs_video(self): text=input_str, videos=video_input, do_sample_frames=False, - videos_kwargs={"do_rescale": True, "rescale_factor": -1}, + videos_kwargs={"do_rescale": True, "rescale_factor": -1.0}, do_rescale=True, return_tensors="pt", ) From 02e22c611f08a36f8fcb2ff877b2f4536d4bbe04 Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 10 Sep 2025 14:39:54 +0200 Subject: [PATCH 02/28] delete --- smolvlm.py | 119 ----------------------------------------------------- 1 file changed, 119 deletions(-) delete mode 100644 smolvlm.py diff --git a/smolvlm.py b/smolvlm.py deleted file mode 100644 index ef01fd9f087c..000000000000 --- a/smolvlm.py +++ /dev/null @@ -1,119 +0,0 @@ -from transformers import 
Qwen2VLProcessor - -if __name__ == "__main__": - - for i in range(1): - processor = Qwen2VLProcessor.from_pretrained(pretrained_model_name_or_path="Qwen/Qwen2.5-VL-3B-Instruct", use_fast=True) - processor - - -from typing_extensions import Unpack -from transformers.tokenization_utils_base import PaddingStrategy -from typing import Union, TypeVar, Generic, get_type_hints, TypedDict, Literal, Annotated, Optional, get_origin, get_args -from dataclasses import make_dataclass, field - -my_int = TypeVar('my_int', bound=int) - - -class Mixin: - def mixin_method(self): - return 0 - -class Stack(Mixin, Generic[my_int]): - def __init__(self) -> None: - # Create an empty list with items of type T - self.items: list[my_int] = [] - - def push(self, item: my_int) -> None: - self.items.append(item) - - -class ModelStack(Stack[str]): - pass - -s = ModelStack() -s.push(0) - - - -from dataclasses import dataclass, MISSING, fields -from huggingface_hub.dataclasses import as_validated_field, strict, validated_field - -def positive_int(value: int): - if not value >= 0: - raise ValueError(f"Value must be positive, got {value}") - - -def multiple_of_64(value: int): - if not value % 64 == 0: - raise ValueError(f"Value must be a multiple of 64, got {value}") - - -@as_validated_field -def strictly_positive(value: int = None): - if value is not None and not value > 0: - raise ValueError(f"Value must be strictly positive, got {value}") - -@as_validated_field -def padding_validator(value: Union[bool, str, PaddingStrategy] = None): - if value is None: - return - - if not isinstance(value, (bool, str, PaddingStrategy)): - raise ValueError(f"Value must be padding") - if isinstance(value, str) and value not in ["longest", "max_length", "do_not_pad"]: - raise ValueError(f'Value for padding must be one of ["longest", "max_length", "do_not_pad"]') - -@strict -@dataclass -class Config: - model_type: str - hidden_size: int = validated_field(validator=[positive_int, multiple_of_64]) - vocab_size: int = 
strictly_positive(default=16) - - -class AnotherKwargs(TypedDict, total=False): - name: Union[str, list[str]] - age: Annotated[Optional[int], strictly_positive()] - padding: Annotated[Optional[Union[bool, str, PaddingStrategy]], padding_validator()] - padding_side: Optional[Literal["right", "left"]] - - -def unpack_annotated_type(type): - if get_origin(type) is Annotated: - base, *meta = get_args(type) - return base, meta[0] - return type, field(default=MISSING) - - -def dataclass_from_typed_dict(td: type[TypedDict]): - hints = get_type_hints(td, include_extras=True) - dc_fields = [ - (k, *unpack_annotated_type(v)) - for k, v in hints.items() - ] - return make_dataclass(td.__name__ + "Config", dc_fields) - - -class HubTypeAdapter(): - def __init__(self, type: type[TypedDict]) -> None: - self.type = type - dataclass = dataclass_from_typed_dict(type) - self.dataclass = strict(dataclass) - - def validate_fields(self, **kwargs): - for f in fields(self.dataclass): - if f.name not in kwargs: - kwargs[f.name] = None - self.dataclass(**kwargs) - - -config = Config(model_type="bert", vocab_size=30000, hidden_size=768) -print(config.__dataclass_fields__) -assert config.model_type == "bert" -assert config.vocab_size == 30000 -assert config.hidden_size == 768 - -HubTypeAdapter(AnotherKwargs).validate_fields(name=["BOB", "MARY"], age=100, padding=None) -print(AnotherKwargs.__annotations__['age']) - From e74487502acce9d84a89f53f26a3cd21e203091b Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 10 Sep 2025 19:15:33 +0200 Subject: [PATCH 03/28] fix a few tests --- src/transformers/models/aria/modular_aria.py | 10 ++++- .../models/aria/processing_aria.py | 10 ++++- .../models/glm4v/processing_glm4v.py | 2 +- .../models/glm4v/video_processing_glm4v.py | 2 +- .../internvl/video_processing_internvl.py | 2 +- .../models/kosmos2/processing_kosmos2.py | 4 +- .../models/mllama/processing_mllama.py | 16 +++----- .../models/owlv2/processing_owlv2.py | 16 +------- 
.../models/qwen2_5_vl/modular_qwen2_5_vl.py | 2 +- .../qwen2_5_vl/processing_qwen2_5_vl.py | 2 +- .../models/smolvlm/processing_smolvlm.py | 15 +++++++- .../smolvlm/video_processing_smolvlm.py | 2 +- .../models/udop/processing_udop.py | 2 +- src/transformers/processing_utils.py | 38 +++++++++++-------- src/transformers/utils/type_validators.py | 27 ++++++++----- 15 files changed, 88 insertions(+), 62 deletions(-) diff --git a/src/transformers/models/aria/modular_aria.py b/src/transformers/models/aria/modular_aria.py index 790003d853c4..2c3b3e996ee5 100644 --- a/src/transformers/models/aria/modular_aria.py +++ b/src/transformers/models/aria/modular_aria.py @@ -36,7 +36,7 @@ ) from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_utils import PreTrainedModel -from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack +from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils import PreTokenizedInput, TextInput from ...utils import TensorType, TransformersKwargs, auto_docstring, can_return_tuple, logging from ...utils.import_utils import is_torch_available @@ -910,7 +910,15 @@ def get_number_of_image_patches(self, height: int, width: int, images_kwargs=Non return num_patches +class AriaImagesKwargs(ImagesKwargs, total=False): + split_image: Optional[bool] + max_image_size: Optional[int] + min_image_size: Optional[int] + + class AriaProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: AriaImagesKwargs + _defaults = { "text_kwargs": { "padding": False, diff --git a/src/transformers/models/aria/processing_aria.py b/src/transformers/models/aria/processing_aria.py index 9264776e80fd..34b03f126d70 100644 --- a/src/transformers/models/aria/processing_aria.py +++ b/src/transformers/models/aria/processing_aria.py @@ -24,13 +24,21 @@ from ...image_processing_utils import BatchFeature from ...image_utils import ImageInput -from 
...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack +from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils import PreTokenizedInput, TextInput from ...utils import TensorType from ..auto import AutoTokenizer +class AriaImagesKwargs(ImagesKwargs, total=False): + split_image: Optional[bool] + max_image_size: Optional[int] + min_image_size: Optional[int] + + class AriaProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: AriaImagesKwargs + _defaults = { "text_kwargs": { "padding": False, diff --git a/src/transformers/models/glm4v/processing_glm4v.py b/src/transformers/models/glm4v/processing_glm4v.py index 817da3630d52..e0a005b1ad1d 100644 --- a/src/transformers/models/glm4v/processing_glm4v.py +++ b/src/transformers/models/glm4v/processing_glm4v.py @@ -34,7 +34,7 @@ class Glm4vVideosProcessorKwargs(VideosKwargs, total=False): - fps: Union[list[float], float] + fps: Optional[Union[list[float], float]] class Glm4vImagesKwargs(ImagesKwargs): diff --git a/src/transformers/models/glm4v/video_processing_glm4v.py b/src/transformers/models/glm4v/video_processing_glm4v.py index a327ac200507..cf616318df51 100644 --- a/src/transformers/models/glm4v/video_processing_glm4v.py +++ b/src/transformers/models/glm4v/video_processing_glm4v.py @@ -55,7 +55,7 @@ class Glm4vVideoProcessorInitKwargs(VideosKwargs): - max_image_size: dict[str, int] = None + max_image_size: Optional[dict[str, int]] = None patch_size: Optional[int] = None temporal_patch_size: Optional[int] = None merge_size: Optional[int] = None diff --git a/src/transformers/models/internvl/video_processing_internvl.py b/src/transformers/models/internvl/video_processing_internvl.py index 2fc5729119e9..d2fd594ddceb 100644 --- a/src/transformers/models/internvl/video_processing_internvl.py +++ b/src/transformers/models/internvl/video_processing_internvl.py @@ -50,7 +50,7 @@ class 
InternVLVideoProcessorInitKwargs(VideosKwargs): - initial_shift: Union[bool, float, int] + initial_shift: Optional[Union[bool, float, int]] @requires(backends=("torchvision",)) diff --git a/src/transformers/models/kosmos2/processing_kosmos2.py b/src/transformers/models/kosmos2/processing_kosmos2.py index 58b3dff1e07a..152e73e04cdd 100644 --- a/src/transformers/models/kosmos2/processing_kosmos2.py +++ b/src/transformers/models/kosmos2/processing_kosmos2.py @@ -17,7 +17,7 @@ import copy import math import re -from typing import Optional, Union +from typing import Any, Optional, Union from ...image_processing_utils import BatchFeature from ...image_utils import ImageInput @@ -35,7 +35,7 @@ class Kosmos2ImagesKwargs(ImagesKwargs, total=False): - bboxes: Optional[list[float]] + bboxes: Optional[list[Any]] num_image_tokens: Optional[int] first_image_token_id: Optional[int] diff --git a/src/transformers/models/mllama/processing_mllama.py b/src/transformers/models/mllama/processing_mllama.py index 0dae7c834303..0ea3a6c0f1cb 100644 --- a/src/transformers/models/mllama/processing_mllama.py +++ b/src/transformers/models/mllama/processing_mllama.py @@ -269,10 +269,8 @@ def __call__( **kwargs, ) - text_kwargs = output_kwargs["text_kwargs"] - text_kwargs["return_tensors"] = None - images_kwargs = output_kwargs["images_kwargs"] - common_kwargs = output_kwargs["common_kwargs"] + # Pop return_tensors for now because we perform manipulations with token ids below + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) data = {} if text is not None: @@ -282,8 +280,7 @@ def __call__( raise ValueError("Invalid input text. 
Please provide a string, or a list of strings") n_images_in_text = [t.count(self.image_token) for t in text] text = [build_string_from_input(text_item, self.bos_token, self.image_token) for text_item in text] - _ = text_kwargs.pop("padding_side", None) # hack until padding-side is an accepted kwarg by tokenizers - encoding = self.tokenizer(text, **text_kwargs) + encoding = self.tokenizer(text, **output_kwargs["text_kwargs"]) self._check_special_mm_tokens(text, encoding, modalities=["image"]) n_images_in_ids = [token_ids.count(self.image_token_id) for token_ids in encoding["input_ids"]] data.update(encoding) @@ -319,7 +316,7 @@ def __call__( ) if images is not None: - image_features = self.image_processor(images, **images_kwargs) + image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) num_tiles = image_features.pop("num_tiles") data.update(image_features) @@ -336,10 +333,7 @@ def __call__( ) data["cross_attention_mask"] = cross_attention_mask - return_tensors = common_kwargs.pop("return_tensors", None) - batch_feature = BatchFeature(data=data, tensor_type=return_tensors) - - return batch_feature + return BatchFeature(data=data, tensor_type=return_tensors) def post_process_image_text_to_text( self, generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False, **kwargs diff --git a/src/transformers/models/owlv2/processing_owlv2.py b/src/transformers/models/owlv2/processing_owlv2.py index 2e69379af73f..160daa4f5ae2 100644 --- a/src/transformers/models/owlv2/processing_owlv2.py +++ b/src/transformers/models/owlv2/processing_owlv2.py @@ -30,7 +30,7 @@ Unpack, ) from ...tokenization_utils_base import PreTokenizedInput, TextInput -from ...utils import TensorType, is_flax_available, is_tf_available, is_torch_available +from ...utils import TensorType, is_torch_available if TYPE_CHECKING: @@ -125,7 +125,7 @@ def __call__( **kwargs, ) query_images = output_kwargs["images_kwargs"].pop("query_images", None) - return_tensors = 
output_kwargs["common_kwargs"]["return_tensors"] + return_tensors = output_kwargs["text_kwargs"].get("return_tensors", None) if text is None and query_images is None and images is None: raise ValueError( @@ -157,24 +157,12 @@ def __call__( input_ids = np.concatenate([encoding["input_ids"] for encoding in encodings], axis=0) attention_mask = np.concatenate([encoding["attention_mask"] for encoding in encodings], axis=0) - elif return_tensors == "jax" and is_flax_available(): - import jax.numpy as jnp - - input_ids = jnp.concatenate([encoding["input_ids"] for encoding in encodings], axis=0) - attention_mask = jnp.concatenate([encoding["attention_mask"] for encoding in encodings], axis=0) - elif return_tensors == "pt" and is_torch_available(): import torch input_ids = torch.cat([encoding["input_ids"] for encoding in encodings], dim=0) attention_mask = torch.cat([encoding["attention_mask"] for encoding in encodings], dim=0) - elif return_tensors == "tf" and is_tf_available(): - import tensorflow as tf - - input_ids = tf.stack([encoding["input_ids"] for encoding in encodings], axis=0) - attention_mask = tf.stack([encoding["attention_mask"] for encoding in encodings], axis=0) - else: raise ValueError("Target return tensor type could not be returned") diff --git a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py index d62f94f37678..8ad5b7d9ec53 100644 --- a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py @@ -840,7 +840,7 @@ def prepare_inputs_for_generation( class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False): - fps: Union[list[float], float] + fps: Optional[Union[list[float], float]] class Qwen2_5_VLImagesKwargs(Qwen2VLImagesKwargs): diff --git a/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py index b357ba850deb..7be98e7e5023 100644 --- 
a/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py @@ -36,7 +36,7 @@ class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False): - fps: Union[list[float], float] + fps: Optional[Union[list[float], float]] class Qwen2_5_VLImagesKwargs(ImagesKwargs): diff --git a/src/transformers/models/smolvlm/processing_smolvlm.py b/src/transformers/models/smolvlm/processing_smolvlm.py index 97f0eaa9e7b2..1ea922d6a19d 100644 --- a/src/transformers/models/smolvlm/processing_smolvlm.py +++ b/src/transformers/models/smolvlm/processing_smolvlm.py @@ -21,7 +21,14 @@ from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput, make_nested_list_of_images -from ...processing_utils import AllKwargsForChatTemplate, ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack +from ...processing_utils import ( + AllKwargsForChatTemplate, + ImagesKwargs, + ProcessingKwargs, + ProcessorMixin, + Unpack, + VideosKwargs, +) from ...tokenization_utils_base import BatchEncoding, TextInput from ...utils import is_num2words_available, is_vision_available, logging from ...video_utils import VideoInput @@ -108,8 +115,14 @@ class SmolVLMImagesKwargs(ImagesKwargs, total=False): max_image_size: Optional[dict[str, int]] +class SmolVLMVideosKwargs(VideosKwargs, total=False): + return_row_col_info: Optional[bool] + max_image_size: Optional[dict[str, int]] + + class SmolVLMProcessorKwargs(ProcessingKwargs, total=False): images_kwargs: SmolVLMImagesKwargs + videos_kwargs: SmolVLMVideosKwargs _defaults = { "text_kwargs": { diff --git a/src/transformers/models/smolvlm/video_processing_smolvlm.py b/src/transformers/models/smolvlm/video_processing_smolvlm.py index 5ad70d870c63..9613437be85b 100644 --- a/src/transformers/models/smolvlm/video_processing_smolvlm.py +++ b/src/transformers/models/smolvlm/video_processing_smolvlm.py @@ -121,7 +121,7 @@ def get_resize_output_image_size( class 
SmolVLMVideoProcessorInitKwargs(VideosKwargs): - max_image_size: dict[str, int] = None + max_image_size: Optional[dict[str, int]] = None @requires(backends=("torchvision",)) diff --git a/src/transformers/models/udop/processing_udop.py b/src/transformers/models/udop/processing_udop.py index 94b1565c9a22..206078cad899 100644 --- a/src/transformers/models/udop/processing_udop.py +++ b/src/transformers/models/udop/processing_udop.py @@ -31,7 +31,7 @@ class UdopTextKwargs(TextKwargs, total=False): word_labels: Optional[Union[list[int], list[list[int]]]] - boxes: Union[list[list[int]], list[list[list[int]]]] + boxes: Optional[Union[list[list[int]], list[list[list[int]]]]] class UdopProcessorKwargs(ProcessingKwargs, total=False): diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 038c9f5ecaa6..4879c2433222 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -14,6 +14,7 @@ """ Processing saving/loading class for common processors. 
""" +# from __future__ import annotations import bisect import copy @@ -42,9 +43,9 @@ device_validator, image_size_validator, padding_validator, + positive_any_number, + positive_int, resampling_validator, - strictly_positive_int, - strictly_positive_number, tensor_type_validator, truncation_validator, video_metadata_validator, @@ -55,7 +56,6 @@ if is_vision_available(): from .image_utils import PILImageResampling - from .tokenization_utils_base import ( PaddingStrategy, PreTokenizedInput, @@ -156,10 +156,10 @@ class TextKwargs(TypedDict, total=False): add_special_tokens: Optional[bool] padding: Annotated[Optional[Union[bool, str, PaddingStrategy]], padding_validator()] truncation: Annotated[Optional[Union[bool, str, TruncationStrategy]], truncation_validator()] - max_length: Annotated[Optional[int], strictly_positive_int()] - stride: Annotated[Optional[int], strictly_positive_int()] + max_length: Annotated[Optional[int], positive_int()] + stride: Annotated[Optional[int], positive_int()] is_split_into_words: Optional[bool] - pad_to_multiple_of: Annotated[Optional[int], strictly_positive_int()] + pad_to_multiple_of: Annotated[Optional[int], positive_int()] return_token_type_ids: Optional[bool] return_attention_mask: Optional[bool] return_overflowing_tokens: Optional[bool] @@ -186,6 +186,8 @@ class methods and docstrings. The size by which to make sure both the height and width can be divided. crop_size (`dict[str, int]`, *optional*): Desired output size when applying center-cropping. + do_convert_rgb (`bool`): + Whether to convert the video to RGB format. resample (`PILImageResampling`, *optional*): Resampling filter to use if resizing the image. do_rescale (`bool`, *optional*): @@ -212,9 +214,10 @@ class methods and docstrings. The device to use for processing (e.g. "cpu", "cuda"), only relevant for fast image processing. 
""" + do_convert_rgb: Optional[bool] do_resize: Optional[bool] size: Annotated[Optional[dict[str, int]], image_size_validator()] - size_divisor: Annotated[Optional[int], strictly_positive_int()] + size_divisor: Annotated[Optional[int], positive_int()] crop_size: Annotated[Optional[dict[str, int]], image_size_validator()] resample: Annotated[Optional[Union["PILImageResampling", int]], resampling_validator()] do_rescale: Optional[bool] @@ -283,7 +286,7 @@ class VideosKwargs(TypedDict, total=False): do_convert_rgb: Optional[bool] do_resize: Optional[bool] size: Annotated[Optional[dict[str, int]], image_size_validator()] - size_divisor: Annotated[Optional[int], strictly_positive_int()] + size_divisor: Annotated[Optional[int], positive_int()] default_to_square: Optional[bool] resample: Annotated[Optional[Union["PILImageResampling", int]], resampling_validator()] do_rescale: Optional[bool] @@ -301,8 +304,8 @@ class VideosKwargs(TypedDict, total=False): video_metadata: Annotated[ Optional[Union[VideoMetadata, dict, Iterable[VideoMetadata, dict]]], video_metadata_validator() ] - fps: Annotated[Optional[Union[int, float]], strictly_positive_number()] - num_frames: Annotated[Optional[int], strictly_positive_int()] + fps: Annotated[Optional[Union[int, float]], positive_any_number()] + num_frames: Annotated[Optional[int], positive_int()] return_metadata: Optional[bool] return_tensors: Annotated[Optional[Union[str, TensorType]], tensor_type_validator()] @@ -337,12 +340,12 @@ class AudioKwargs(TypedDict, total=False): Whether or not [`~ASTFeatureExtractor.__call__`] should return `attention_mask`. 
""" - sampling_rate: Annotated[Optional[int], strictly_positive_int()] + sampling_rate: Annotated[Optional[int], positive_int()] raw_speech: Optional[Union["np.ndarray", list[float], list["np.ndarray"], list[list[float]]]] padding: Annotated[Optional[Union[bool, str, PaddingStrategy]], padding_validator()] - max_length: Annotated[Optional[int], strictly_positive_int()] + max_length: Annotated[Optional[int], positive_int()] truncation: Annotated[Optional[Union[bool, str, TruncationStrategy]], truncation_validator()] - pad_to_multiple_of: Annotated[Optional[int], strictly_positive_int()] + pad_to_multiple_of: Annotated[Optional[int], positive_int()] return_attention_mask: Optional[bool] return_tensors: Annotated[Optional[Union[str, TensorType]], tensor_type_validator()] @@ -1335,9 +1338,14 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg for kwarg in output_kwargs.values(): kwarg.update(common_kwargs) - # Perform type validation on collected kwargs + # Finally perform type validation on collected kwargs + # NOTE: When we inherit from BaseTypedDict, the bases won't be in MRO of ModelTypedDict + # That causes errors if certain type annotations are not defined/imported in model processor + # file. 
So we will pass globalns of `processing_utils.py` manually to bypass it + base_globalns = getattr(sys.modules.get(ProcessingKwargs.__module__, None), "__dict__", {}) for key, typed_dict_obj in ModelProcessorKwargs.__annotations__.items(): - type_validator = TypedDictAdapter(typed_dict_obj) + child_localns = getattr(sys.modules.get(typed_dict_obj.__module__, None), "__dict__", {}) + type_validator = TypedDictAdapter(typed_dict_obj, globalns=base_globalns, localns=child_localns) type_validator.validate_fields(**output_kwargs[key]) return output_kwargs diff --git a/src/transformers/utils/type_validators.py b/src/transformers/utils/type_validators.py index cb846b8a6255..883b852d5fd7 100644 --- a/src/transformers/utils/type_validators.py +++ b/src/transformers/utils/type_validators.py @@ -1,6 +1,6 @@ from collections.abc import Iterable from dataclasses import MISSING, field, make_dataclass -from typing import Annotated, Optional, TypedDict, Union, get_args, get_origin, get_type_hints +from typing import Annotated, Any, Optional, TypedDict, Union, get_args, get_origin, get_type_hints from huggingface_hub.dataclasses import as_validated_field, strict @@ -21,7 +21,7 @@ def unpack_annotated_type(type): return type, field(default=MISSING) -# Minimalistic version on pydantic.TypeAdapter tailored for `TypedDict` +# Minimalistic version of pydantic.TypeAdapter tailored for `TypedDict` class TypedDictAdapter: """ A utility class used to convert a TypedDict object to dataclass and attach @@ -31,8 +31,15 @@ class TypedDictAdapter: type: The TypedDict object that needs to be validated. 
""" - def __init__(self, type: type[TypedDict]) -> None: + def __init__( + self, + type: type[TypedDict], + globalns: Optional[dict[str, Any]] = None, + localns: Optional[dict[str, Any]] = None, + ): self.type = type + self.globalns = globalns + self.localns = localns self.dataclass = self.create_dataclass() self.dataclass = strict(self.dataclass) @@ -68,22 +75,22 @@ class TokenizerKwargs(TypedDict, total=False): # The dataclass can also be used as a simple config class for easier kwarg management dataclass = dataclass_from_typed_dict(TokenizerKwargs) """ - hints = get_type_hints(self.type, include_extras=True) + hints = get_type_hints(self.type, globalns=self.globalns, localns=self.localns, include_extras=True) fields = [(k, *unpack_annotated_type(v)) for k, v in hints.items()] self.fields = fields return make_dataclass(self.type.__name__ + "Config", fields) @as_validated_field -def strictly_positive_number(value: Optional[Union[int, float]] = None): - if value is not None and (not isinstance(value, (int, float)) or not value > 0): - raise ValueError(f"Value must be strictly positive, got {value}") +def positive_any_number(value: Optional[Union[int, float]] = None): + if value is not None and (not isinstance(value, (int, float)) or not value >= 0): + raise ValueError(f"Value must be a positive integer or floating number, got {value}") @as_validated_field -def strictly_positive_int(value: Optional[int] = None): - if value is not None and (not isinstance(value, int) or not value > 0): - raise ValueError(f"Value must be strictly positive integer, got {value}") +def positive_int(value: Optional[int] = None): + if value is not None and (not isinstance(value, int) or not value >= 0): + raise ValueError(f"Value must be a positive integer, got {value}") @as_validated_field From 63532bf39b160bb8394ce73b205224e0d8e4164f Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 10 Sep 2025 19:24:10 +0200 Subject: [PATCH 04/28] fix --- src/transformers/utils/type_validators.py | 2 +- 
1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/utils/type_validators.py b/src/transformers/utils/type_validators.py index 883b852d5fd7..f35c3fe5c213 100644 --- a/src/transformers/utils/type_validators.py +++ b/src/transformers/utils/type_validators.py @@ -142,7 +142,7 @@ def device_validator(value: Optional[Union[str, int]] = None): @as_validated_field -def resampling_validator(value: Optional[Union[int, PILImageResampling]] = None): +def resampling_validator(value: Optional[Union[int, "PILImageResampling"]] = None): if value is None: pass elif isinstance(value, int) and value not in list(range(6)): From 1f62d6f70a8a997befa54656990b07515b4d2491 Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 11 Sep 2025 13:25:26 +0200 Subject: [PATCH 05/28] fix the rest of tests --- src/transformers/models/dia/processing_dia.py | 7 ++--- .../models/kosmos2/processing_kosmos2.py | 6 ++-- .../models/kosmos2_5/processing_kosmos2_5.py | 3 +- .../models/owlv2/processing_owlv2.py | 16 +++++++++-- .../models/owlvit/processing_owlvit.py | 2 +- src/transformers/models/sam/processing_sam.py | 2 +- .../models/sam_hq/processing_samhq.py | 2 +- src/transformers/utils/type_validators.py | 28 +++++++++++++++---- src/transformers/video_processing_utils.py | 6 ++++ tests/test_processing_common.py | 4 +-- 10 files changed, 57 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/dia/processing_dia.py b/src/transformers/models/dia/processing_dia.py index 402f5152a64b..e435ba23cc4a 100644 --- a/src/transformers/models/dia/processing_dia.py +++ b/src/transformers/models/dia/processing_dia.py @@ -46,6 +46,7 @@ class DiaProcessorKwargs(ProcessingKwargs, total=False): "padding": True, "padding_side": "right", "add_special_tokens": False, + "return_tensors": "pt", }, "audio_kwargs": { "eos_token_id": 1024, @@ -54,8 +55,8 @@ class DiaProcessorKwargs(ProcessingKwargs, total=False): "delay_pattern": [0, 8, 9, 10, 11, 12, 13, 14, 15], "generation": True, 
"sampling_rate": 44100, + "return_tensors": "pt", }, - "common_kwargs": {"return_tensors": "pt"}, } @@ -111,9 +112,7 @@ def __call__( text_kwargs = output_kwargs["text_kwargs"] audio_kwargs = output_kwargs["audio_kwargs"] - common_kwargs = output_kwargs["common_kwargs"] - - return_tensors = common_kwargs.pop("return_tensors", None) + return_tensors = output_kwargs["text_kwargs"].get("return_tensors", None) if return_tensors != "pt": raise ValueError(f"{self.__class__.__name__} only supports `return_tensors='pt'`.") diff --git a/src/transformers/models/kosmos2/processing_kosmos2.py b/src/transformers/models/kosmos2/processing_kosmos2.py index 152e73e04cdd..423a395d74a0 100644 --- a/src/transformers/models/kosmos2/processing_kosmos2.py +++ b/src/transformers/models/kosmos2/processing_kosmos2.py @@ -17,7 +17,7 @@ import copy import math import re -from typing import Any, Optional, Union +from typing import Optional, Union from ...image_processing_utils import BatchFeature from ...image_utils import ImageInput @@ -33,9 +33,11 @@ list[list[tuple[float, float, float]]], ] +NestedList = list[Union[Optional[int], "NestedList"]] + class Kosmos2ImagesKwargs(ImagesKwargs, total=False): - bboxes: Optional[list[Any]] + bboxes: Optional[NestedList] num_image_tokens: Optional[int] first_image_token_id: Optional[int] diff --git a/src/transformers/models/kosmos2_5/processing_kosmos2_5.py b/src/transformers/models/kosmos2_5/processing_kosmos2_5.py index 0e3c70c80234..1bc516038b9e 100644 --- a/src/transformers/models/kosmos2_5/processing_kosmos2_5.py +++ b/src/transformers/models/kosmos2_5/processing_kosmos2_5.py @@ -43,12 +43,13 @@ class Kosmos2_5ProcessorKwargs(ProcessingKwargs, total=False): "return_token_type_ids": False, "stride": 0, "truncation": True, + "return_tensors": "pt", }, "images_kwargs": { "max_patches": 4096, "num_image_tokens": 2048, + "return_tensors": "pt", }, - "common_kwargs": {"return_tensors": "pt"}, } diff --git 
a/src/transformers/models/owlv2/processing_owlv2.py b/src/transformers/models/owlv2/processing_owlv2.py index 160daa4f5ae2..57da5ce03370 100644 --- a/src/transformers/models/owlv2/processing_owlv2.py +++ b/src/transformers/models/owlv2/processing_owlv2.py @@ -30,7 +30,7 @@ Unpack, ) from ...tokenization_utils_base import PreTokenizedInput, TextInput -from ...utils import TensorType, is_torch_available +from ...utils import TensorType, is_flax_available, is_tf_available, is_torch_available if TYPE_CHECKING: @@ -125,7 +125,7 @@ def __call__( **kwargs, ) query_images = output_kwargs["images_kwargs"].pop("query_images", None) - return_tensors = output_kwargs["text_kwargs"].get("return_tensors", None) + return_tensors = output_kwargs["text_kwargs"]["return_tensors"] if text is None and query_images is None and images is None: raise ValueError( @@ -157,12 +157,24 @@ def __call__( input_ids = np.concatenate([encoding["input_ids"] for encoding in encodings], axis=0) attention_mask = np.concatenate([encoding["attention_mask"] for encoding in encodings], axis=0) + elif return_tensors == "jax" and is_flax_available(): + import jax.numpy as jnp + + input_ids = jnp.concatenate([encoding["input_ids"] for encoding in encodings], axis=0) + attention_mask = jnp.concatenate([encoding["attention_mask"] for encoding in encodings], axis=0) + elif return_tensors == "pt" and is_torch_available(): import torch input_ids = torch.cat([encoding["input_ids"] for encoding in encodings], dim=0) attention_mask = torch.cat([encoding["attention_mask"] for encoding in encodings], dim=0) + elif return_tensors == "tf" and is_tf_available(): + import tensorflow as tf + + input_ids = tf.stack([encoding["input_ids"] for encoding in encodings], axis=0) + attention_mask = tf.stack([encoding["attention_mask"] for encoding in encodings], axis=0) + else: raise ValueError("Target return tensor type could not be returned") diff --git a/src/transformers/models/owlvit/processing_owlvit.py 
b/src/transformers/models/owlvit/processing_owlvit.py index 0e0c59d555f2..c62d53c77f9e 100644 --- a/src/transformers/models/owlvit/processing_owlvit.py +++ b/src/transformers/models/owlvit/processing_owlvit.py @@ -135,7 +135,7 @@ def __call__( **kwargs, ) query_images = output_kwargs["images_kwargs"].pop("query_images", None) - return_tensors = output_kwargs["common_kwargs"]["return_tensors"] + return_tensors = output_kwargs["text_kwargs"]["return_tensors"] if text is None and query_images is None and images is None: raise ValueError( diff --git a/src/transformers/models/sam/processing_sam.py b/src/transformers/models/sam/processing_sam.py index 603adde95040..2552a4c66460 100644 --- a/src/transformers/models/sam/processing_sam.py +++ b/src/transformers/models/sam/processing_sam.py @@ -117,7 +117,7 @@ def __call__( input_points=input_points, input_labels=input_labels, input_boxes=input_boxes, - return_tensors=output_kwargs["common_kwargs"].get("return_tensors"), + return_tensors=output_kwargs["images_kwargs"].get("return_tensors"), point_pad_value=point_pad_value, ) diff --git a/src/transformers/models/sam_hq/processing_samhq.py b/src/transformers/models/sam_hq/processing_samhq.py index 49681c7c6a26..1799a07201e7 100644 --- a/src/transformers/models/sam_hq/processing_samhq.py +++ b/src/transformers/models/sam_hq/processing_samhq.py @@ -118,7 +118,7 @@ def __call__( input_points=input_points, input_labels=input_labels, input_boxes=input_boxes, - return_tensors=output_kwargs["common_kwargs"].get("return_tensors"), + return_tensors=output_kwargs["images_kwargs"].get("return_tensors"), point_pad_value=output_kwargs["images_kwargs"].get("point_pad_value"), ) diff --git a/src/transformers/utils/type_validators.py b/src/transformers/utils/type_validators.py index f35c3fe5c213..5fa0c719f82b 100644 --- a/src/transformers/utils/type_validators.py +++ b/src/transformers/utils/type_validators.py @@ -1,6 +1,6 @@ from collections.abc import Iterable from dataclasses import 
MISSING, field, make_dataclass -from typing import Annotated, Any, Optional, TypedDict, Union, get_args, get_origin, get_type_hints +from typing import Annotated, Any, ForwardRef, Optional, TypedDict, Union, get_args, get_origin from huggingface_hub.dataclasses import as_validated_field, strict @@ -21,6 +21,24 @@ def unpack_annotated_type(type): return type, field(default=MISSING) +def get_type_hints_from_typed_dict(obj: type[TypedDict]): + """ + Same as `typing.get_type_hints` but does not perform evaluation + on the ForwardRefs. Evaluating might fail if the package is not imported + or installed, therefore we will have our own "guarded" type validations. + All `ForwardRef` will be ignored by the hub validator + """ + raw_annots = obj.__dict__.get("__annotations__", {}) + type_hints = {} + for name, value in raw_annots.items(): + if value is None: + value = type(None) + if isinstance(value, str): + value = ForwardRef(value, is_argument=False) + type_hints[name] = value + return type_hints + + # Minimalistic version of pydantic.TypeAdapter tailored for `TypedDict` class TypedDictAdapter: """ @@ -75,7 +93,7 @@ class TokenizerKwargs(TypedDict, total=False): # The dataclass can also be used as a simple config class for easier kwarg management dataclass = dataclass_from_typed_dict(TokenizerKwargs) """ - hints = get_type_hints(self.type, globalns=self.globalns, localns=self.localns, include_extras=True) + hints = get_type_hints_from_typed_dict(self.type) fields = [(k, *unpack_annotated_type(v)) for k, v in hints.items()] self.fields = fields return make_dataclass(self.type.__name__ + "Config", fields) @@ -133,8 +151,8 @@ def device_validator(value: Optional[Union[str, int]] = None): raise ValueError( f"If device is an integer, the value must be a strictly positive integer but got device={value}" ) - elif isinstance(value, str) or value.split(":")[0] not in possible_names: - raise ValueError(f"If device is an integer, the value must be one of {possible_names} but got 
device={value}") + elif isinstance(value, str) and value.split(":")[0] not in possible_names: + raise ValueError(f"If device is a string, the value must be one of {possible_names} but got device={value}") elif not isinstance(value, (int, str)): raise ValueError( f"Device must be either an integer device ID or a string (e.g., 'cpu', 'cuda:0'), but got device={value}" ) @@ -149,7 +167,7 @@ def resampling_validator(value: Optional[Union[int, "PILImageResampling"]] = Non raise ValueError( f"The resampling should be one of {list(range(6))} when provided as integer, but got resampling={value}" ) - elif isinstance(value, (PILImageResampling, int)): + elif is_vision_available() and not isinstance(value, (PILImageResampling, int)): raise ValueError(f"The resampling should an integer or `PIL.Image.Resampling`, but got resampling={value}") diff --git a/src/transformers/video_processing_utils.py b/src/transformers/video_processing_utils.py index 562a5de65718..e403d3d90774 100644 --- a/src/transformers/video_processing_utils.py +++ b/src/transformers/video_processing_utils.py @@ -52,6 +52,7 @@ ) from .utils.hub import cached_files from .utils.import_utils import requires +from .utils.type_validators import TypedDictAdapter from .video_utils import ( VideoInput, VideoMetadata, @@ -364,6 +365,11 @@ def preprocess( captured_kwargs=kwargs.keys(), valid_processor_keys=list(self.valid_kwargs.__annotations__.keys()) + ["return_tensors"], ) + + # Perform type validation on received kwargs + type_validator = TypedDictAdapter(self.valid_kwargs) + type_validator.validate_fields(**kwargs) + + # Set default kwargs from self. This ensures that if a kwarg is not provided # by the user, it gets its default value from the instance, or is set to None. 
for kwarg_name in self.valid_kwargs.__annotations__: diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index 8eb30d787c01..862961f7b299 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -839,7 +839,7 @@ def test_structured_kwargs_nested_video(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "videos_kwargs": {"do_rescale": True, "rescale_factor": -1, "do_sample_frames": False}, + "videos_kwargs": {"do_rescale": True, "rescale_factor": -1.0, "do_sample_frames": False}, "text_kwargs": {"padding": "max_length", "max_length": 176}, } @@ -862,7 +862,7 @@ def test_structured_kwargs_nested_from_dict_video(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "videos_kwargs": {"do_rescale": True, "rescale_factor": -1, "do_sample_frames": False}, + "videos_kwargs": {"do_rescale": True, "rescale_factor": -1.0, "do_sample_frames": False}, "text_kwargs": {"padding": "max_length", "max_length": 176}, } From c203ffd81c1a67dd1015d9f86105a97519ea0380 Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 11 Sep 2025 13:54:37 +0200 Subject: [PATCH 06/28] common-kwargs --- src/transformers/models/dia/processing_dia.py | 5 +++-- src/transformers/models/glm4v/processing_glm4v.py | 2 +- .../models/qwen2_5_vl/modular_qwen2_5_vl.py | 2 +- .../models/qwen2_5_vl/processing_qwen2_5_vl.py | 2 +- src/transformers/processing_utils.py | 11 ++++------- 5 files changed, 10 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/dia/processing_dia.py b/src/transformers/models/dia/processing_dia.py index e435ba23cc4a..391914e198a4 100644 --- a/src/transformers/models/dia/processing_dia.py +++ b/src/transformers/models/dia/processing_dia.py @@ -46,7 +46,6 @@ class DiaProcessorKwargs(ProcessingKwargs, total=False): "padding": True, "padding_side": "right", "add_special_tokens": False, - "return_tensors": "pt", }, 
"audio_kwargs": { "eos_token_id": 1024, @@ -55,6 +54,8 @@ class DiaProcessorKwargs(ProcessingKwargs, total=False): "delay_pattern": [0, 8, 9, 10, 11, 12, 13, 14, 15], "generation": True, "sampling_rate": 44100, + }, + "common_kwargs": { "return_tensors": "pt", }, } @@ -112,7 +113,7 @@ def __call__( text_kwargs = output_kwargs["text_kwargs"] audio_kwargs = output_kwargs["audio_kwargs"] - return_tensors = output_kwargs["text_kwargs"].get("return_tensors", None) + return_tensors = output_kwargs["text_kwargs"]["return_tensors"] if return_tensors != "pt": raise ValueError(f"{self.__class__.__name__} only supports `return_tensors='pt'`.") diff --git a/src/transformers/models/glm4v/processing_glm4v.py b/src/transformers/models/glm4v/processing_glm4v.py index e0a005b1ad1d..8a6d0c25f2fb 100644 --- a/src/transformers/models/glm4v/processing_glm4v.py +++ b/src/transformers/models/glm4v/processing_glm4v.py @@ -34,7 +34,7 @@ class Glm4vVideosProcessorKwargs(VideosKwargs, total=False): - fps: Optional[Union[list[float], float]] + fps: Optional[Union[list[float | int] | float | int]] class Glm4vImagesKwargs(ImagesKwargs): diff --git a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py index 8ad5b7d9ec53..9ee6b57b1213 100644 --- a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py @@ -840,7 +840,7 @@ def prepare_inputs_for_generation( class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False): - fps: Optional[Union[list[float], float]] + fps: Optional[Union[list[float | int] | float | int]] class Qwen2_5_VLImagesKwargs(Qwen2VLImagesKwargs): diff --git a/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py index 7be98e7e5023..f7f58fef38f6 100644 --- a/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py 
@@ -36,7 +36,7 @@ class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False): - fps: Optional[Union[list[float], float]] + fps: Optional[Union[list[float | int] | float | int]] class Qwen2_5_VLImagesKwargs(ImagesKwargs): diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 4879c2433222..b6053337fcf4 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -1332,20 +1332,17 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg f"Keyword argument `{key}` is not a valid argument for this processor and will be ignored." ) - # BC for `common_kwargs` to update all modality-specific kwargs + # For `common_kwargs` just update all modality-specific kwargs with same key/values common_kwargs = kwargs.get("common_kwargs", {}) + ModelProcessorKwargs._defaults["common_kwargs"] + common_kwargs.update(ModelProcessorKwargs._defaults.get("common_kwargs", {})) if common_kwargs: for kwarg in output_kwargs.values(): kwarg.update(common_kwargs) # Finally perform type validation on collected kwargs - # NOTE: When we inherit from BaseTypedDict, the bases won't be in MRO of ModelTypedDict - # That causes errors if certain type annotations are not defined/imported in model processor - # file. 
So we will pass globalns of `processing_utils.py` manually to bypass it - base_globalns = getattr(sys.modules.get(ProcessingKwargs.__module__, None), "__dict__", {}) for key, typed_dict_obj in ModelProcessorKwargs.__annotations__.items(): - child_localns = getattr(sys.modules.get(typed_dict_obj.__module__, None), "__dict__", {}) - type_validator = TypedDictAdapter(typed_dict_obj, globalns=base_globalns, localns=child_localns) + type_validator = TypedDictAdapter(typed_dict_obj) type_validator.validate_fields(**output_kwargs[key]) return output_kwargs From 725a479e3267251720e55db37482c9d3236adb63 Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 11 Sep 2025 13:59:07 +0200 Subject: [PATCH 07/28] why the runner complains about typing with "|"? --- src/transformers/models/glm4v/processing_glm4v.py | 2 +- src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py | 2 +- src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/glm4v/processing_glm4v.py b/src/transformers/models/glm4v/processing_glm4v.py index 8a6d0c25f2fb..6a44e04f0d0e 100644 --- a/src/transformers/models/glm4v/processing_glm4v.py +++ b/src/transformers/models/glm4v/processing_glm4v.py @@ -34,7 +34,7 @@ class Glm4vVideosProcessorKwargs(VideosKwargs, total=False): - fps: Optional[Union[list[float | int] | float | int]] + fps: Optional[Union[list[Union[float, int]], float, int]] class Glm4vImagesKwargs(ImagesKwargs): diff --git a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py index 9ee6b57b1213..04cfe85a2026 100644 --- a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py @@ -840,7 +840,7 @@ def prepare_inputs_for_generation( class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False): - fps: Optional[Union[list[float | int] | float | int]] + fps: 
Optional[Union[list[Union[float, int]], float, int]] class Qwen2_5_VLImagesKwargs(Qwen2VLImagesKwargs): diff --git a/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py index f7f58fef38f6..de859970976e 100644 --- a/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py @@ -36,7 +36,7 @@ class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False): - fps: Optional[Union[list[float | int] | float | int]] + fps: Optional[Union[list[Union[float, int]], float, int]] class Qwen2_5_VLImagesKwargs(ImagesKwargs): From d8ca68348040c8c03cf0f15716e9b701f96949e3 Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 11 Sep 2025 14:00:26 +0200 Subject: [PATCH 08/28] revert --- src/transformers/models/kosmos2_5/processing_kosmos2_5.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/models/kosmos2_5/processing_kosmos2_5.py b/src/transformers/models/kosmos2_5/processing_kosmos2_5.py index 1bc516038b9e..0e3c70c80234 100644 --- a/src/transformers/models/kosmos2_5/processing_kosmos2_5.py +++ b/src/transformers/models/kosmos2_5/processing_kosmos2_5.py @@ -43,13 +43,12 @@ class Kosmos2_5ProcessorKwargs(ProcessingKwargs, total=False): "return_token_type_ids": False, "stride": 0, "truncation": True, - "return_tensors": "pt", }, "images_kwargs": { "max_patches": 4096, "num_image_tokens": 2048, - "return_tensors": "pt", }, + "common_kwargs": {"return_tensors": "pt"}, } From 8ff15f771e3c60f78f2dd4a0bb3daec15c4dad16 Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 11 Sep 2025 14:36:42 +0200 Subject: [PATCH 09/28] forgot to delete --- src/transformers/processing_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index b6053337fcf4..51c1a51212ca 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py 
@@ -1334,7 +1334,6 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg # For `common_kwargs` just update all modality-specific kwargs with same key/values common_kwargs = kwargs.get("common_kwargs", {}) - ModelProcessorKwargs._defaults["common_kwargs"] common_kwargs.update(ModelProcessorKwargs._defaults.get("common_kwargs", {})) if common_kwargs: for kwarg in output_kwargs.values(): From b0e8120ee30c8b314bbe8a645928527bd5b461ae Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 11 Sep 2025 16:04:13 +0200 Subject: [PATCH 10/28] update --- src/transformers/models/csm/processing_csm.py | 4 +- src/transformers/processing_utils.py | 7 +-- src/transformers/utils/type_validators.py | 47 ++++++++++++------- src/transformers/video_utils.py | 7 ++- 4 files changed, 39 insertions(+), 26 deletions(-) diff --git a/src/transformers/models/csm/processing_csm.py b/src/transformers/models/csm/processing_csm.py index 0f929f6a2a0c..d0cad3fdfb8e 100644 --- a/src/transformers/models/csm/processing_csm.py +++ b/src/transformers/models/csm/processing_csm.py @@ -249,9 +249,7 @@ def __call__( text_kwargs = output_kwargs["text_kwargs"] audio_kwargs = output_kwargs["audio_kwargs"] - common_kwargs = output_kwargs["common_kwargs"] - - return_tensors = common_kwargs.pop("return_tensors", None) + return_tensors = text_kwargs.get("return_tensors", None) if return_tensors != "pt": raise ValueError(f"{self.__class__.__name__} only supports `return_tensors='pt'`.") diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 51c1a51212ca..f80025ca2d73 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -24,7 +24,6 @@ import sys import typing import warnings -from collections.abc import Iterable from dataclasses import dataclass from pathlib import Path from typing import Annotated, Any, Literal, Optional, TypedDict, TypeVar, Union @@ -50,7 +49,7 @@ truncation_validator, 
video_metadata_validator, ) -from .video_utils import VideoInput, VideoMetadata +from .video_utils import VideoInput, VideoMetadataType if is_vision_available(): @@ -301,9 +300,7 @@ class VideosKwargs(TypedDict, total=False): input_data_format: Optional[Union[str, ChannelDimension]] device: Annotated[Optional[str], device_validator()] do_sample_frames: Optional[bool] - video_metadata: Annotated[ - Optional[Union[VideoMetadata, dict, Iterable[VideoMetadata, dict]]], video_metadata_validator() - ] + video_metadata: Annotated[Optional[VideoMetadataType], video_metadata_validator()] fps: Annotated[Optional[Union[int, float]], positive_any_number()] num_frames: Annotated[Optional[int], positive_int()] return_metadata: Optional[bool] diff --git a/src/transformers/utils/type_validators.py b/src/transformers/utils/type_validators.py index 5fa0c719f82b..54a014a99b20 100644 --- a/src/transformers/utils/type_validators.py +++ b/src/transformers/utils/type_validators.py @@ -1,11 +1,11 @@ -from collections.abc import Iterable +from collections.abc import Sequence from dataclasses import MISSING, field, make_dataclass from typing import Annotated, Any, ForwardRef, Optional, TypedDict, Union, get_args, get_origin from huggingface_hub.dataclasses import as_validated_field, strict from ..tokenization_utils_base import PaddingStrategy, TruncationStrategy -from ..video_utils import VideoMetadata +from ..video_utils import VideoMetadataType from .generic import TensorType from .import_utils import is_vision_available @@ -172,22 +172,35 @@ def resampling_validator(value: Optional[Union[int, "PILImageResampling"]] = Non @as_validated_field -def video_metadata_validator(value: Optional[Union[VideoMetadata, dict, Iterable[VideoMetadata, dict]]] = None): - possible_keys = ["total_num_frames", "fps", "width", "height", "duration", "video_backend", "frames_indices"] +def video_metadata_validator(value: Optional[VideoMetadataType] = None): if value is None: - pass - elif isinstance(value, 
Iterable) and not all(isinstance(item, (VideoMetadata, dict)) for item in value): - raise ValueError( - f"If `video_metadata` is a list, each item in the list should be either a dict or a `VideoMetadata` object but got video_metadata={value}" - ) - elif isinstance(value, dict) and not all(key in possible_keys for key in value.keys()): - raise ValueError( - f"If video_metadata is a dict, the keys should be one of {possible_keys} but got device={value.keys()}" - ) - elif not isinstance(value, (VideoMetadata, dict, Iterable)): - raise ValueError( - f"Video metadata must be either a dict, a VideoMetadata or a batched list of metadata, but got device={value}" - ) + return + + valid_keys = ["total_num_frames", "fps", "width", "height", "duration", "video_backend", "frames_indices"] + + def check_dict_keys(d: dict) -> bool: + return all(key in valid_keys for key in d.keys()) + + if isinstance(value, Sequence) and isinstance(value[0], Sequence) and isinstance(value[0][0], dict): + for sublist in value: + for item in sublist: + if not check_dict_keys(item): + raise ValueError( + f"Invalid keys found in video metadata. Valid keys: {valid_keys} got: {list(item.keys())}" + ) + + elif isinstance(value, Sequence) and isinstance(value[0], dict): + for item in value: + if not check_dict_keys(item): + raise ValueError( + f"Invalid keys found in video metadata. Valid keys: {valid_keys} got: {list(item.keys())}" + ) + + elif isinstance(value, dict): + if not check_dict_keys(value): + raise ValueError( + f"Invalid keys found in video metadata. 
Valid keys: {valid_keys}, got: {list(value.keys())}" + ) @as_validated_field diff --git a/src/transformers/video_utils.py b/src/transformers/video_utils.py index 1749b0b3b1c5..cfe89c94e65e 100644 --- a/src/transformers/video_utils.py +++ b/src/transformers/video_utils.py @@ -112,6 +112,11 @@ def update(self, dictionary): setattr(self, key, value) +VideoMetadataType = Union[ + VideoMetadata, dict, list[Union[dict, VideoMetadata]], list[list[Union[dict, VideoMetadata]]] +] + + def is_valid_video_frame(frame): return isinstance(frame, PIL.Image.Image) or ( (is_numpy_array(frame) or is_torch_tensor(frame)) and frame.ndim == 3 @@ -215,7 +220,7 @@ def make_batched_videos(videos) -> list[Union[np.ndarray, "torch.Tensor", "URL", return flat_videos_list -def make_batched_metadata(videos: VideoInput, video_metadata: Union[VideoMetadata, dict]): +def make_batched_metadata(videos: VideoInput, video_metadata: VideoMetadataType) -> list[VideoMetadata]: if video_metadata is None: # Create default metadata and fill attributes we can infer from given video video_metadata = [ From 9f761c6421f92986d5188807b0f4003602303d71 Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 11 Sep 2025 18:17:05 +0200 Subject: [PATCH 11/28] fix last issues --- .../models/qwen2_5_omni/processing_qwen2_5_omni.py | 8 ++++---- tests/test_video_processing_common.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py index 45d8cacddeb2..49bf64dcad2d 100644 --- a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py @@ -31,10 +31,10 @@ class Qwen2_5_OmniVideosKwargs(VideosKwargs): - fps: Optional[list[Union[int, float]]] = None - use_audio_in_video: Optional[bool] = None - seconds_per_chunk: Optional[float] = None - position_id_per_seconds: Optional[int] = None + fps: 
Optional[Union[list[Union[int, float]], int, float]] + use_audio_in_video: Optional[bool] + seconds_per_chunk: Optional[float] + position_id_per_seconds: Optional[int] min_pixels: Optional[int] max_pixels: Optional[int] patch_size: Optional[int] diff --git a/tests/test_video_processing_common.py b/tests/test_video_processing_common.py index 3d0477ee05d5..67a31cf8d20e 100644 --- a/tests/test_video_processing_common.py +++ b/tests/test_video_processing_common.py @@ -398,8 +398,8 @@ def test_call_numpy_4_channels(self): video_inputs[0], return_tensors="pt", input_data_format="channels_last", - image_mean=0, - image_std=1, + image_mean=0.0, + image_std=1.0, )[self.input_name] expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]]) if video_processor.do_convert_rgb: @@ -412,8 +412,8 @@ def test_call_numpy_4_channels(self): video_inputs, return_tensors="pt", input_data_format="channels_last", - image_mean=0, - image_std=1, + image_mean=0.0, + image_std=1.0, )[self.input_name] expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs) if video_processor.do_convert_rgb: From f935cffdc74557504abfe3d3b94b86c84f934bc5 Mon Sep 17 00:00:00 2001 From: raushan Date: Tue, 16 Sep 2025 17:36:24 +0200 Subject: [PATCH 12/28] add more detalis in docs --- src/transformers/utils/type_validators.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/transformers/utils/type_validators.py b/src/transformers/utils/type_validators.py index 54a014a99b20..ee6ff2be956f 100644 --- a/src/transformers/utils/type_validators.py +++ b/src/transformers/utils/type_validators.py @@ -1,6 +1,6 @@ from collections.abc import Sequence from dataclasses import MISSING, field, make_dataclass -from typing import Annotated, Any, ForwardRef, Optional, TypedDict, Union, get_args, get_origin +from typing import Annotated, ForwardRef, Optional, TypedDict, Union, get_args, get_origin from 
huggingface_hub.dataclasses import as_validated_field, strict @@ -45,6 +45,11 @@ class TypedDictAdapter: A utility class used to convert a TypedDict object to dataclass and attach a hub validator on top based on TypedDict annotations. + We don't want to replace `TypedDict` by dataclasses in the codebase because + with dataclasses we will lose typing hints that `Unpack[TypedDict]` gives. + So this utility is a sweet spot to keep the balance between DevX and strong + typing validation. + Args: type: The TypedDict object that needs to be validated. @@ -52,12 +57,8 @@ class TypedDictAdapter: def __init__( self, type: type[TypedDict], - globalns: Optional[dict[str, Any]] = None, - localns: Optional[dict[str, Any]] = None, ): self.type = type - self.globalns = globalns - self.localns = localns self.dataclass = self.create_dataclass() self.dataclass = strict(self.dataclass) From e6a77d8629458b30f957fe803b14f257014ada93 Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 24 Sep 2025 16:58:22 +0200 Subject: [PATCH 13/28] pin the latest hub release --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b4feedbc77a1..b293479cba28 100644 --- a/setup.py +++ b/setup.py @@ -115,7 +115,7 @@ "GitPython<3.1.19", "hf-doc-builder>=0.3.0", "hf_xet", - "huggingface-hub>=0.34.0,<1.0", + "huggingface-hub>=0.35.1,<1.0", "importlib_metadata", "ipadic>=1.0.0,<2.0", "jax>=0.4.1,<=0.4.13", From 5a4263030b8970ac5b0a3c260658cac0b3f1cc39 Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 24 Sep 2025 18:19:46 +0200 Subject: [PATCH 14/28] fix tests for new models --- src/transformers/dependency_versions_table.py | 2 +- src/transformers/models/lfm2_vl/processing_lfm2_vl.py | 8 ++++++-- .../models/qwen2_5_omni/processing_qwen2_5_omni.py | 2 +- .../models/qwen3_omni_moe/processing_qwen3_omni_moe.py | 4 ++-- tests/models/lfm2_vl/test_processing_lfm2_vl.py | 2 +- 5 files changed, 11 insertions(+), 7 deletions(-) diff --git 
a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index bd1a34ee747f..1f977cbe267e 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -23,7 +23,7 @@ "GitPython": "GitPython<3.1.19", "hf-doc-builder": "hf-doc-builder>=0.3.0", "hf_xet": "hf_xet", - "huggingface-hub": "huggingface-hub>=0.34.0,<1.0", + "huggingface-hub": "huggingface-hub>=0.35.1,<1.0", "importlib_metadata": "importlib_metadata", "ipadic": "ipadic>=1.0.0,<2.0", "jinja2": "jinja2>=3.1.0", diff --git a/src/transformers/models/lfm2_vl/processing_lfm2_vl.py b/src/transformers/models/lfm2_vl/processing_lfm2_vl.py index 12f289c266a1..32965e6fc7ac 100755 --- a/src/transformers/models/lfm2_vl/processing_lfm2_vl.py +++ b/src/transformers/models/lfm2_vl/processing_lfm2_vl.py @@ -21,6 +21,7 @@ ImagesKwargs, ProcessingKwargs, ProcessorMixin, + TextKwargs, Unpack, ) from ...tokenization_utils_base import BatchEncoding, TextInput @@ -46,8 +47,13 @@ class Lfm2VlImagesKwargs(ImagesKwargs, total=False): return_row_col_info: Optional[bool] +class Lfm2VlTextKwargs(TextKwargs, total=False): + use_image_special_tokens: Optional[bool] + + class Lfm2VlProcessorKwargs(ProcessingKwargs, total=False): images_kwargs: Lfm2VlImagesKwargs + text_kwargs: Lfm2VlTextKwargs _defaults = { "images_kwargs": { @@ -88,12 +94,10 @@ def __init__( image_processor, tokenizer, chat_template: Optional[str] = None, - use_image_special_tokens: Optional[bool] = True, **kwargs, ): self.image_token = tokenizer.image_token self.image_token_id = tokenizer.image_token_id - self.use_image_special_tokens = use_image_special_tokens self.image_start_token = tokenizer.image_start_token self.image_end_token = tokenizer.image_end_token self.image_thumbnail_token = tokenizer.image_thumbnail diff --git a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py index 
4e5cbf484c34..2ee7b7b43c2b 100644 --- a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py @@ -34,7 +34,7 @@ class Qwen2_5_OmniVideosKwargs(VideosKwargs): fps: Optional[Union[list[Union[int, float]], int, float]] use_audio_in_video: Optional[bool] seconds_per_chunk: Optional[float] - position_id_per_seconds: Optional[int] + position_id_per_seconds: Optional[Union[int, float]] min_pixels: Optional[int] max_pixels: Optional[int] patch_size: Optional[int] diff --git a/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py index 86041fc3de16..c77d9c75ae5f 100644 --- a/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py @@ -33,10 +33,10 @@ class Qwen3OmniMoeVideosKwargs(VideosKwargs): - fps: Optional[list[Union[int, float]]] + fps: Optional[Union[list[Union[int, float]], int, float]] use_audio_in_video: Optional[bool] seconds_per_chunk: Optional[float] - position_id_per_seconds: Optional[int] + position_id_per_seconds: Optional[Union[int, float]] min_pixels: Optional[int] max_pixels: Optional[int] patch_size: Optional[int] diff --git a/tests/models/lfm2_vl/test_processing_lfm2_vl.py b/tests/models/lfm2_vl/test_processing_lfm2_vl.py index f2c33e40e3f6..d1f7669bdddd 100755 --- a/tests/models/lfm2_vl/test_processing_lfm2_vl.py +++ b/tests/models/lfm2_vl/test_processing_lfm2_vl.py @@ -100,7 +100,7 @@ def prepare_processor_dict(): "{{'<|im_start|>assistant\n' }}" "{% endif %}" ) - return {"chat_template": chat_template, "use_image_special_tokens": True} + return {"chat_template": chat_template} # Override as Lfm2VL needs images/video to be an explicitly nested batch def prepare_image_inputs(self, batch_size=None): From fe4ba565c4e38dafd36d289204571a25cee382a2 Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 24 Sep 2025 
18:37:53 +0200 Subject: [PATCH 15/28] also fast image processor --- src/transformers/image_processing_utils_fast.py | 14 ++++++++++---- .../image_processing_deepseek_vl_fast.py | 2 +- .../image_processing_deepseek_vl_hybrid_fast.py | 10 +++++----- .../modular_deepseek_vl_hybrid.py | 10 +++++----- .../image_processing_efficientnet_fast.py | 4 ++-- .../models/eomt/image_processing_eomt_fast.py | 16 ++++++---------- .../models/janus/image_processing_janus_fast.py | 2 +- .../image_processing_llava_onevision_fast.py | 5 ++--- .../llava_onevision/modular_llava_onevision.py | 5 ++--- src/transformers/processing_utils.py | 16 ++++++++-------- 10 files changed, 42 insertions(+), 42 deletions(-) diff --git a/src/transformers/image_processing_utils_fast.py b/src/transformers/image_processing_utils_fast.py index 75210680b57a..0607d4fac550 100644 --- a/src/transformers/image_processing_utils_fast.py +++ b/src/transformers/image_processing_utils_fast.py @@ -51,6 +51,7 @@ logging, ) from .utils.import_utils import is_rocm_platform +from .utils.type_validators import TypedDictAdapter if is_vision_available(): @@ -169,21 +170,21 @@ class DefaultFastImageProcessorKwargs(TypedDict, total=False): do_resize: Optional[bool] size: Optional[dict[str, int]] default_to_square: Optional[bool] - resample: Optional[Union["PILImageResampling", "F.InterpolationMode"]] + resample: Optional[Union["PILImageResampling", "F.InterpolationMode", int]] do_center_crop: Optional[bool] crop_size: Optional[dict[str, int]] do_rescale: Optional[bool] rescale_factor: Optional[Union[int, float]] do_normalize: Optional[bool] - image_mean: Optional[Union[float, list[float]]] - image_std: Optional[Union[float, list[float]]] + image_mean: Optional[Union[float, list[float], tuple[float, float, float]]] + image_std: Optional[Union[float, list[float], tuple[float, float, float]]] do_pad: Optional[bool] pad_size: Optional[dict[str, int]] do_convert_rgb: Optional[bool] return_tensors: Optional[Union[str, TensorType]] 
data_format: Optional[ChannelDimension] input_data_format: Optional[Union[str, ChannelDimension]] - device: Optional["torch.device"] + device: Optional[Union[str, "torch.device"]] disable_grouping: Optional[bool] @@ -737,6 +738,11 @@ def __call__(self, images: ImageInput, *args, **kwargs: Unpack[DefaultFastImageP def preprocess(self, images: ImageInput, *args, **kwargs: Unpack[DefaultFastImageProcessorKwargs]) -> BatchFeature: # args are not validated, but their order in the `preprocess` and `_preprocess` signatures must be the same validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_kwargs_names) + + # Perform type validation on received kwargs + type_validator = TypedDictAdapter(self.valid_kwargs) + type_validator.validate_fields(**kwargs) + # Set default kwargs from self. This ensures that if a kwarg is not provided # by the user, it gets its default value from the instance, or is set to None. for kwarg_name in self._valid_kwargs_names: diff --git a/src/transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py b/src/transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py index 896e91f0692c..87ccf3b4a6fe 100644 --- a/src/transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +++ b/src/transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py @@ -42,7 +42,7 @@ class DeepseekVLFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): falls below this value after resizing. 
""" - min_size: int + min_size: Optional[int] @auto_docstring diff --git a/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py b/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py index db9c9ad987c1..82b6357f24f4 100644 --- a/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +++ b/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py @@ -67,11 +67,11 @@ class DeepseekVLHybridFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method. """ - min_size: int - high_res_size: dict - high_res_resample: "PILImageResampling" - high_res_image_mean: list[float] - high_res_image_std: list[float] + min_size: Optional[int] + high_res_size: Optional[dict[str, int]] + high_res_resample: Optional[Union["PILImageResampling", "F.InterpolationMode", int]] + high_res_image_mean: Optional[Union[float, list[float], tuple[float, float, float]]] + high_res_image_std: Optional[Union[float, list[float], tuple[float, float, float]]] @auto_docstring diff --git a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py index e9808b02ce34..c0a45431c5e3 100644 --- a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py @@ -752,11 +752,11 @@ class DeepseekVLHybridFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method. 
""" - min_size: int - high_res_size: dict - high_res_resample: "PILImageResampling" - high_res_image_mean: list[float] - high_res_image_std: list[float] + min_size: Optional[int] + high_res_size: Optional[dict[str, int]] + high_res_resample: Optional[Union["PILImageResampling", "F.InterpolationMode", int]] + high_res_image_mean: Optional[Union[float, list[float], tuple[float, float, float]]] + high_res_image_std: Optional[Union[float, list[float], tuple[float, float, float]]] class DeepseekVLHybridImageProcessorFast(DeepseekVLImageProcessorFast): diff --git a/src/transformers/models/efficientnet/image_processing_efficientnet_fast.py b/src/transformers/models/efficientnet/image_processing_efficientnet_fast.py index 3544d927c146..e6d056481870 100644 --- a/src/transformers/models/efficientnet/image_processing_efficientnet_fast.py +++ b/src/transformers/models/efficientnet/image_processing_efficientnet_fast.py @@ -45,8 +45,8 @@ class EfficientNetFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): Normalize the image again with the standard deviation only for image classification if set to True. """ - rescale_offset: bool - include_top: bool + rescale_offset: Optional[bool] + include_top: Optional[bool] @auto_docstring diff --git a/src/transformers/models/eomt/image_processing_eomt_fast.py b/src/transformers/models/eomt/image_processing_eomt_fast.py index 97a13a0745eb..3e44a87bd1bf 100644 --- a/src/transformers/models/eomt/image_processing_eomt_fast.py +++ b/src/transformers/models/eomt/image_processing_eomt_fast.py @@ -59,19 +59,15 @@ class EomtImageProcessorFastKwargs(DefaultFastImageProcessorKwargs): """ do_split_image (`bool`, *optional*, defaults to `False`): - Whether to split the input images into overlapping patches for semantic segmentation. If set to `True`, the - input images will be split into patches of size `size["shortest_edge"]` with an overlap between patches. - Otherwise, the input images will be padded to the target size. 
- do_pad (`bool`, *optional*, defaults to `False`): - Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest - number of patches in the batch. Padding will be applied to the bottom and right with zeros. + Whether to split the input images into overlapping patches for semantic segmentation. If set to `True`, the + input images will be split into patches of size `size["shortest_edge"]` with an overlap between patches. + Otherwise, the input images will be padded to the target size. ignore_index (`int`, *optional*): - Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels - denoted with 0 (background) will be replaced with `ignore_index`. + Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels + denoted with 0 (background) will be replaced with `ignore_index`. """ - do_split_image: bool - do_pad: bool + do_split_image: Optional[bool] ignore_index: Optional[int] = None diff --git a/src/transformers/models/janus/image_processing_janus_fast.py b/src/transformers/models/janus/image_processing_janus_fast.py index 9ed2732fb3d0..881f38d6681a 100644 --- a/src/transformers/models/janus/image_processing_janus_fast.py +++ b/src/transformers/models/janus/image_processing_janus_fast.py @@ -53,7 +53,7 @@ class JanusFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): falls below this value after resizing. 
""" - min_size: int + min_size: Optional[int] @auto_docstring diff --git a/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py b/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py index 4392d64e9ebf..6dc0752d9f94 100644 --- a/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +++ b/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py @@ -92,8 +92,7 @@ def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaOnevisionFastImag batch_num_images = [1] * len(images) else: batch_num_images = [1] - kwargs["batch_num_images"] = batch_num_images - return super().preprocess(images, **kwargs) + return super().preprocess(images, batch_num_images, **kwargs) def _resize_for_patching( self, @@ -218,6 +217,7 @@ def _pad_for_batching( def _preprocess( self, images: list["torch.Tensor"], + batch_num_images: list[int], do_resize: bool, size: SizeDict, image_grid_pinpoints: list[list[int]], @@ -230,7 +230,6 @@ def _preprocess( image_mean: Optional[Union[float, list[float]]], image_std: Optional[Union[float, list[float]]], do_pad: bool, - batch_num_images: list[int], disable_grouping: Optional[bool], return_tensors: Optional[Union[str, TensorType]], **kwargs, diff --git a/src/transformers/models/llava_onevision/modular_llava_onevision.py b/src/transformers/models/llava_onevision/modular_llava_onevision.py index ec2304e09dd1..c664764bbaf9 100644 --- a/src/transformers/models/llava_onevision/modular_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modular_llava_onevision.py @@ -143,12 +143,12 @@ def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaOnevisionFastImag batch_num_images = [1] * len(images) else: batch_num_images = [1] - kwargs["batch_num_images"] = batch_num_images - return super().preprocess(images, **kwargs) + return super().preprocess(images, batch_num_images, **kwargs) def _preprocess( self, images: list["torch.Tensor"], + 
batch_num_images: list[int], do_resize: bool, size: SizeDict, image_grid_pinpoints: list[list[int]], @@ -161,7 +161,6 @@ def _preprocess( image_mean: Optional[Union[float, list[float]]], image_std: Optional[Union[float, list[float]]], do_pad: bool, - batch_num_images: list[int], disable_grouping: Optional[bool], return_tensors: Optional[Union[str, TensorType]], **kwargs, diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 949069f14c5c..89dea69b000d 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -193,9 +193,9 @@ class methods and docstrings. Scale factor to use if rescaling the image. do_normalize (`bool`, *optional*): Whether to normalize the image. - image_mean (`float` or `list[float]`, *optional*): + image_mean (`float` or `list[float] or tuple[float, float, float]`, *optional*): Mean to use if normalizing the image. - image_std (`float` or `list[float]`, *optional*): + image_std (`float` or `list[float] or tuple[float, float, float]`, *optional*): Standard deviation to use if normalizing the image. do_pad (`bool`, *optional*): Whether to pad the image to the `(max_height, max_width)` of the images in the batch. @@ -219,8 +219,8 @@ class methods and docstrings. do_rescale: Optional[bool] rescale_factor: Optional[float] do_normalize: Optional[bool] - image_mean: Optional[Union[float, list[float]]] - image_std: Optional[Union[float, list[float]]] + image_mean: Optional[Union[float, list[float], tuple[float, float, float]]] + image_std: Optional[Union[float, list[float], tuple[float, float, float]]] do_pad: Optional[bool] pad_size: Annotated[Optional[dict[str, int]], image_size_validator()] do_center_crop: Optional[bool] @@ -251,9 +251,9 @@ class VideosKwargs(TypedDict, total=False): Scale factor to use if rescaling the video. do_normalize (`bool`, *optional*): Whether to normalize the video. 
- image_mean (`float` or `list[float]`, *optional*): + image_mean (`float` or `list[float] or tuple[float, float, float]`, *optional*): Mean to use if normalizing the video. - image_std (`float` or `list[float]`, *optional*): + image_std (`float` or `list[float] or tuple[float, float, float]`, *optional*): Standard deviation to use if normalizing the video. do_center_crop (`bool`, *optional*): Whether to center crop the video. @@ -283,8 +283,8 @@ class VideosKwargs(TypedDict, total=False): do_rescale: Optional[bool] rescale_factor: Optional[float] do_normalize: Optional[bool] - image_mean: Optional[Union[float, list[float]]] - image_std: Optional[Union[float, list[float]]] + image_mean: Optional[Union[float, list[float], tuple[float, float, float]]] + image_std: Optional[Union[float, list[float], tuple[float, float, float]]] do_center_crop: Optional[bool] crop_size: Annotated[Optional[dict[str, int]], image_size_validator()] data_format: Optional[ChannelDimension] From 6e8d77e49354095a85a66d3914629c0f20892415 Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 24 Sep 2025 18:49:32 +0200 Subject: [PATCH 16/28] fix copies --- .../models/llava_onevision/modular_llava_onevision.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/llava_onevision/modular_llava_onevision.py b/src/transformers/models/llava_onevision/modular_llava_onevision.py index c664764bbaf9..afa633d8f61a 100644 --- a/src/transformers/models/llava_onevision/modular_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modular_llava_onevision.py @@ -34,7 +34,12 @@ from ...cache_utils import Cache from ...image_processing_utils import BatchFeature -from ...image_processing_utils_fast import DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images +from ...image_processing_utils_fast import ( + BaseImageProcessorFast, + DefaultFastImageProcessorKwargs, + group_images_by_shape, + reorder_images, +) from ...image_utils import ( 
OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, @@ -74,7 +79,7 @@ class LlavaOnevisionFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): image_grid_pinpoints: Optional[list[list[int]]] -class LlavaOnevisionImageProcessorFast(LlavaNextImageProcessorFast): +class LlavaOnevisionImageProcessorFast(LlavaNextImageProcessorFast, BaseImageProcessorFast): resample = PILImageResampling.BICUBIC image_mean = OPENAI_CLIP_MEAN image_std = OPENAI_CLIP_STD @@ -143,7 +148,7 @@ def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaOnevisionFastImag batch_num_images = [1] * len(images) else: batch_num_images = [1] - return super().preprocess(images, batch_num_images, **kwargs) + return BaseImageProcessorFast.preprocess(images, batch_num_images, **kwargs) def _preprocess( self, From ba419921889e0a3221c3ac3f9754597602051125 Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 25 Sep 2025 10:53:16 +0200 Subject: [PATCH 17/28] image processing ast validated --- .../image_processing_perception_lm_fast.py | 9 +++++--- .../pixtral/image_processing_pixtral_fast.py | 4 ++-- .../test_image_processing_oneformer.py | 1 - .../test_image_processing_vitmatte.py | 22 ++++++++++++------- tests/test_image_processing_common.py | 8 +++---- 5 files changed, 26 insertions(+), 18 deletions(-) diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py index c26132a48439..8642fe2e5f94 100644 --- a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py +++ b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py @@ -51,9 +51,9 @@ class PerceptionLMFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): Maximum number of tiles an image can be split into based on its aspect ratio. 
""" - vision_input_type: str = "thumb+tile" - tile_size: int = 448 - max_num_tiles: int = 36 + vision_input_type: Optional[str] + tile_size: Optional[int] + max_num_tiles: Optional[int] @auto_docstring @@ -66,6 +66,9 @@ class PerceptionLMImageProcessorFast(BaseImageProcessorFast): do_rescale = True do_normalize = True do_convert_rgb = True + vision_input_type = "thumb+tile" + tile_size = 448 + max_num_tiles = 36 size = {"width": 448, "height": 448} # for backward compatibility in tests valid_kwargs = PerceptionLMFastImageProcessorKwargs diff --git a/src/transformers/models/pixtral/image_processing_pixtral_fast.py b/src/transformers/models/pixtral/image_processing_pixtral_fast.py index db3e75760318..473ebefd02f4 100644 --- a/src/transformers/models/pixtral/image_processing_pixtral_fast.py +++ b/src/transformers/models/pixtral/image_processing_pixtral_fast.py @@ -46,11 +46,11 @@ class PixtralFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): """ - patch_size (`dict[str, int]` *optional*, defaults to `{"height": 16, "width": 16}`): + patch_size (`Union[int, dict[str, int]]` *optional*, defaults to `{"height": 16, "width": 16}`): Size of the patches in the model, used to calculate the output image size. Can be overridden by `patch_size` in the `preprocess` method. 
""" - patch_size: Optional[dict[str, int]] + patch_size: Optional[Union[int, dict[str, int]]] @auto_docstring diff --git a/tests/models/oneformer/test_image_processing_oneformer.py b/tests/models/oneformer/test_image_processing_oneformer.py index 4fe89959bf0b..1343d069d819 100644 --- a/tests/models/oneformer/test_image_processing_oneformer.py +++ b/tests/models/oneformer/test_image_processing_oneformer.py @@ -224,7 +224,6 @@ def comm_get_image_processor_inputs( annotations, return_tensors="pt", instance_id_to_semantic_id=instance_id_to_semantic_id, - pad_and_return_pixel_mask=True, ) return inputs diff --git a/tests/models/vitmatte/test_image_processing_vitmatte.py b/tests/models/vitmatte/test_image_processing_vitmatte.py index a103c33a9cca..e57fce61747d 100644 --- a/tests/models/vitmatte/test_image_processing_vitmatte.py +++ b/tests/models/vitmatte/test_image_processing_vitmatte.py @@ -255,18 +255,24 @@ def test_image_processor_preprocess_arguments(self): # vitmatte require additional trimap input for image_processor # that is why we override original common test - for image_processing_class in self.image_processor_list: + for i, image_processing_class in enumerate(self.image_processor_list): image_processor = image_processing_class(**self.image_processor_dict) image = self.image_processor_tester.prepare_image_inputs()[0] trimap = np.random.randint(0, 3, size=image.size[::-1]) - with warnings.catch_warnings(record=True) as raised_warnings: - warnings.simplefilter("always") - image_processor(image, trimaps=trimap, extra_argument=True) - - messages = " ".join([str(w.message) for w in raised_warnings]) - self.assertGreaterEqual(len(raised_warnings), 1) - self.assertIn("extra_argument", messages) + # Type validation will fail for fast processors only (for now) + if i == 1: + with self.assertRaises(TypeError): + image_processor(image, trimaps=trimap, extra_argument=True) + else: + # Else we just consume extra kwargs and raise a warning + with 
warnings.catch_warnings(record=True) as raised_warnings: + warnings.simplefilter("always") + image_processor(image, trimaps=trimap, extra_argument=True) + + messages = " ".join([str(w.message) for w in raised_warnings]) + self.assertGreaterEqual(len(raised_warnings), 1) + self.assertIn("extra_argument", messages) @unittest.skip(reason="Many failing cases. This test needs a more deep investigation.") def test_fast_is_faster_than_slow(self): diff --git a/tests/test_image_processing_common.py b/tests/test_image_processing_common.py index ce0bd4181be5..4ab674051ec8 100644 --- a/tests/test_image_processing_common.py +++ b/tests/test_image_processing_common.py @@ -514,8 +514,8 @@ def test_call_numpy_4_channels(self): image_inputs[0], return_tensors="pt", input_data_format="channels_last", - image_mean=0, - image_std=1, + image_mean=[0.0, 0.0, 0.0, 0.0], + image_std=[1.0, 1.0, 1.0, 1.0], ).pixel_values expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) @@ -525,8 +525,8 @@ def test_call_numpy_4_channels(self): image_inputs, return_tensors="pt", input_data_format="channels_last", - image_mean=0, - image_std=1, + image_mean=[0.0, 0.0, 0.0, 0.0], + image_std=[1.0, 1.0, 1.0, 1.0], ).pixel_values expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) self.assertEqual( From 3233a703a3ce30fcab2ab0ad634575e793eacefe Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 25 Sep 2025 12:39:31 +0200 Subject: [PATCH 18/28] fix more tests --- .../image_processing_utils_fast.py | 20 +++++----- src/transformers/processing_utils.py | 38 +++++++++++++------ .../test_image_processing_cohere2_vision.py | 8 ++-- .../glm4v/test_image_processing_glm4v.py | 8 ++-- .../glm4v/test_video_processing_glm4v.py | 8 ++-- .../nougat/test_image_processing_nougat.py | 8 ++-- .../test_video_processing_qwen2_vl.py | 8 ++-- 
.../test_video_processing_qwen3_vl.py | 8 ++-- tests/models/tvp/test_image_processing_tvp.py | 12 +++++- .../test_image_processing_videomae.py | 12 +++++- .../test_image_processing_vitmatte.py | 4 +- .../vitpose/test_image_processing_vitpose.py | 8 ++-- .../vivit/test_image_processing_vivit.py | 12 +++++- 13 files changed, 97 insertions(+), 57 deletions(-) diff --git a/src/transformers/image_processing_utils_fast.py b/src/transformers/image_processing_utils_fast.py index 0607d4fac550..01aab0eee3e7 100644 --- a/src/transformers/image_processing_utils_fast.py +++ b/src/transformers/image_processing_utils_fast.py @@ -15,7 +15,7 @@ from collections.abc import Iterable from copy import deepcopy from functools import lru_cache, partial -from typing import Any, Optional, TypedDict, Union +from typing import Annotated, Any, Optional, TypedDict, Union import numpy as np @@ -51,7 +51,7 @@ logging, ) from .utils.import_utils import is_rocm_platform -from .utils.type_validators import TypedDictAdapter +from .utils.type_validators import TypedDictAdapter, device_validator, image_size_validator, tensor_type_validator if is_vision_available(): @@ -168,23 +168,23 @@ def divide_to_patches( class DefaultFastImageProcessorKwargs(TypedDict, total=False): do_resize: Optional[bool] - size: Optional[dict[str, int]] + size: Annotated[Optional[Union[int, list[int], tuple[int, ...], dict[str, int]]], image_size_validator()] default_to_square: Optional[bool] resample: Optional[Union["PILImageResampling", "F.InterpolationMode", int]] do_center_crop: Optional[bool] - crop_size: Optional[dict[str, int]] + crop_size: Annotated[Optional[Union[int, list[int], tuple[int, ...], dict[str, int]]], image_size_validator()] do_rescale: Optional[bool] rescale_factor: Optional[Union[int, float]] do_normalize: Optional[bool] - image_mean: Optional[Union[float, list[float], tuple[float, float, float]]] - image_std: Optional[Union[float, list[float], tuple[float, float, float]]] + image_mean: 
Optional[Union[float, list[float], tuple[float, ...]]] + image_std: Optional[Union[float, list[float], tuple[float, ...]]] do_pad: Optional[bool] - pad_size: Optional[dict[str, int]] + crop_size: Annotated[Optional[Union[int, list[int], tuple[int, ...], dict[str, int]]], image_size_validator()] do_convert_rgb: Optional[bool] - return_tensors: Optional[Union[str, TensorType]] - data_format: Optional[ChannelDimension] + return_tensors: Annotated[Optional[Union[str, TensorType]], tensor_type_validator()] + data_format: Optional[Union[str, ChannelDimension]] input_data_format: Optional[Union[str, ChannelDimension]] - device: Optional[Union[str, "torch.device"]] + device: Annotated[Optional[str], device_validator()] disable_grouping: Optional[bool] diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 89dea69b000d..f3d92420691f 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -147,6 +147,10 @@ class TextKwargs(TypedDict, total=False): The side on which padding will be applied. return_mm_token_type_ids (`bool`, *optional*): Whether to return multimodal token type ids indicating mm placeholder token positions. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. """ text_pair: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] @@ -209,20 +213,24 @@ class methods and docstrings. The channel dimension format for the input image. device (`str`, *optional*): The device to use for processing (e.g. "cpu", "cuda"), only relevant for fast image processing. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + - `'pt'`: Return PyTorch `torch.Tensor` objects. 
+ - `'np'`: Return NumPy `np.ndarray` objects. """ do_convert_rgb: Optional[bool] do_resize: Optional[bool] - size: Annotated[Optional[dict[str, int]], image_size_validator()] - crop_size: Annotated[Optional[dict[str, int]], image_size_validator()] + size: Annotated[Optional[Union[int, list[int], tuple[int, ...], dict[str, int]]], image_size_validator()] + crop_size: Annotated[Optional[Union[int, list[int], tuple[int, ...], dict[str, int]]], image_size_validator()] resample: Annotated[Optional[Union["PILImageResampling", int]], resampling_validator()] do_rescale: Optional[bool] rescale_factor: Optional[float] do_normalize: Optional[bool] - image_mean: Optional[Union[float, list[float], tuple[float, float, float]]] - image_std: Optional[Union[float, list[float], tuple[float, float, float]]] + image_mean: Optional[Union[float, list[float], tuple[float, ...]]] + image_std: Optional[Union[float, list[float], tuple[float, ...]]] do_pad: Optional[bool] - pad_size: Annotated[Optional[dict[str, int]], image_size_validator()] + pad_size: Annotated[Optional[Union[int, list[int], tuple[int, ...], dict[str, int]]], image_size_validator()] do_center_crop: Optional[bool] data_format: Optional[Union[str, ChannelDimension]] input_data_format: Optional[Union[str, ChannelDimension]] @@ -271,23 +279,27 @@ class VideosKwargs(TypedDict, total=False): The channel dimension format for the output video. input_data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format for the input video. - return_metadata (`ChannelDimension` or `str`, *optional*): + return_metadata (`bool`, *optional*): Whether to return video metadata or not. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. 
""" do_convert_rgb: Optional[bool] do_resize: Optional[bool] - size: Annotated[Optional[dict[str, int]], image_size_validator()] + size: Annotated[Optional[Union[int, list[int], tuple[int, ...], dict[str, int]]], image_size_validator()] default_to_square: Optional[bool] resample: Annotated[Optional[Union["PILImageResampling", int]], resampling_validator()] do_rescale: Optional[bool] rescale_factor: Optional[float] do_normalize: Optional[bool] - image_mean: Optional[Union[float, list[float], tuple[float, float, float]]] - image_std: Optional[Union[float, list[float], tuple[float, float, float]]] + image_mean: Optional[Union[float, list[float], tuple[float, ...]]] + image_std: Optional[Union[float, list[float], tuple[float, ...]]] do_center_crop: Optional[bool] - crop_size: Annotated[Optional[dict[str, int]], image_size_validator()] - data_format: Optional[ChannelDimension] + crop_size: Annotated[Optional[Union[int, list[int], tuple[int, ...], dict[str, int]]], image_size_validator()] + data_format: Optional[Union[str, ChannelDimension]] input_data_format: Optional[Union[str, ChannelDimension]] device: Annotated[Optional[str], device_validator()] do_sample_frames: Optional[bool] @@ -326,6 +338,10 @@ class AudioKwargs(TypedDict, total=False): If set, will pad the sequence to a multiple of the provided value. return_attention_mask (`bool`, *optional*): Whether or not [`~ASTFeatureExtractor.__call__`] should return `attention_mask`. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. 
""" sampling_rate: Annotated[Optional[int], positive_int()] diff --git a/tests/models/cohere2_vision/test_image_processing_cohere2_vision.py b/tests/models/cohere2_vision/test_image_processing_cohere2_vision.py index 7ab3bf70d57b..81a16ba39c14 100644 --- a/tests/models/cohere2_vision/test_image_processing_cohere2_vision.py +++ b/tests/models/cohere2_vision/test_image_processing_cohere2_vision.py @@ -176,8 +176,8 @@ def test_call_numpy_4_channels(self): image_inputs[0], return_tensors="pt", input_data_format="channels_last", - image_mean=0, - image_std=1, + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), ).pixel_values self.assertEqual(tuple(encoded_images.shape), (10, 4, 30, 30)) @@ -186,7 +186,7 @@ def test_call_numpy_4_channels(self): image_inputs, return_tensors="pt", input_data_format="channels_last", - image_mean=0, - image_std=1, + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), ).pixel_values self.assertEqual(tuple(encoded_images.shape), (70, 4, 30, 30)) diff --git a/tests/models/glm4v/test_image_processing_glm4v.py b/tests/models/glm4v/test_image_processing_glm4v.py index cb5af4b275d2..1226fe473db9 100644 --- a/tests/models/glm4v/test_image_processing_glm4v.py +++ b/tests/models/glm4v/test_image_processing_glm4v.py @@ -236,8 +236,8 @@ def test_call_numpy_4_channels(self): image_inputs[0], return_tensors="pt", input_data_format="channels_last", - image_mean=0, - image_std=1, + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), ).pixel_values expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) @@ -247,8 +247,8 @@ def test_call_numpy_4_channels(self): image_inputs, return_tensors="pt", input_data_format="channels_last", - image_mean=0, - image_std=1, + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), ).pixel_values expected_output_image_shape = 
self.image_processor_tester.expected_output_image_shape(image_inputs) self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) diff --git a/tests/models/glm4v/test_video_processing_glm4v.py b/tests/models/glm4v/test_video_processing_glm4v.py index 1dcd4bdecca6..8443c728f2f2 100644 --- a/tests/models/glm4v/test_video_processing_glm4v.py +++ b/tests/models/glm4v/test_video_processing_glm4v.py @@ -250,8 +250,8 @@ def test_call_numpy_4_channels(self): video_inputs[0], return_tensors="pt", input_data_format="channels_last", - image_mean=0, - image_std=1, + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), )[self.input_name] expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]]) self.assertEqual(list(encoded_videos.shape), expected_output_video_shape) @@ -261,8 +261,8 @@ def test_call_numpy_4_channels(self): video_inputs, return_tensors="pt", input_data_format="channels_last", - image_mean=0, - image_std=1, + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), )[self.input_name] expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs) self.assertEqual(list(encoded_videos.shape), expected_output_video_shape) diff --git a/tests/models/nougat/test_image_processing_nougat.py b/tests/models/nougat/test_image_processing_nougat.py index c014c21828f4..68a71a6dfb8c 100644 --- a/tests/models/nougat/test_image_processing_nougat.py +++ b/tests/models/nougat/test_image_processing_nougat.py @@ -282,8 +282,8 @@ def test_call_numpy_4_channels(self): image_inputs[0], return_tensors="pt", input_data_format="channels_last", - image_mean=0, - image_std=1, + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), ).pixel_values expected_output_image_shape = self.image_processor_tester.expected_output_image_shape( [image_inputs[0]] @@ -295,8 +295,8 @@ def test_call_numpy_4_channels(self): image_inputs, return_tensors="pt", 
input_data_format="channels_last", - image_mean=0, - image_std=1, + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), ).pixel_values expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) self.assertEqual( diff --git a/tests/models/qwen2_vl/test_video_processing_qwen2_vl.py b/tests/models/qwen2_vl/test_video_processing_qwen2_vl.py index 4d6026a06289..b80adebbd9ab 100644 --- a/tests/models/qwen2_vl/test_video_processing_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_video_processing_qwen2_vl.py @@ -265,8 +265,8 @@ def test_call_numpy_4_channels(self): video_inputs[0], return_tensors="pt", input_data_format="channels_last", - image_mean=0, - image_std=1, + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), )[self.input_name] expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]]) self.assertEqual(list(encoded_videos.shape), expected_output_video_shape) @@ -276,8 +276,8 @@ def test_call_numpy_4_channels(self): video_inputs, return_tensors="pt", input_data_format="channels_last", - image_mean=0, - image_std=1, + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), )[self.input_name] expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs) self.assertEqual(list(encoded_videos.shape), expected_output_video_shape) diff --git a/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py b/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py index 9230f0f9502e..60f4023938bb 100644 --- a/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py +++ b/tests/models/qwen3_vl/test_video_processing_qwen3_vl.py @@ -249,8 +249,8 @@ def test_call_numpy_4_channels(self): video_inputs[0], return_tensors="pt", input_data_format="channels_last", - image_mean=0, - image_std=1, + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), )[self.input_name] expected_output_video_shape = 
self.video_processor_tester.expected_output_video_shape([video_inputs[0]]) self.assertEqual(list(encoded_videos.shape), expected_output_video_shape) @@ -260,8 +260,8 @@ def test_call_numpy_4_channels(self): video_inputs, return_tensors="pt", input_data_format="channels_last", - image_mean=0, - image_std=1, + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), )[self.input_name] expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs) self.assertEqual(list(encoded_videos.shape), expected_output_video_shape) diff --git a/tests/models/tvp/test_image_processing_tvp.py b/tests/models/tvp/test_image_processing_tvp.py index c2c8b81dfc0a..390ed75a913b 100644 --- a/tests/models/tvp/test_image_processing_tvp.py +++ b/tests/models/tvp/test_image_processing_tvp.py @@ -274,7 +274,11 @@ def test_call_numpy_4_channels(self): # Test not batched input expected_height, expected_width = self.image_processor_tester.get_expected_values(video_inputs) encoded_videos = image_processing( - test_inputs[0], return_tensors="pt", image_mean=0, image_std=1, input_data_format="channels_first" + test_inputs[0], + return_tensors="pt", + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), + input_data_format="channels_first", ).pixel_values self.assertListEqual( list(encoded_videos.shape), @@ -292,7 +296,11 @@ def test_call_numpy_4_channels(self): video_inputs, batched=True ) encoded_videos = image_processing( - test_inputs, return_tensors="pt", image_mean=0, image_std=1, input_data_format="channels_first" + test_inputs, + return_tensors="pt", + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), + input_data_format="channels_first", ).pixel_values self.assertListEqual( list(encoded_videos.shape), diff --git a/tests/models/videomae/test_image_processing_videomae.py b/tests/models/videomae/test_image_processing_videomae.py index 2fe9303f3705..f8576a7bc8af 100644 --- 
a/tests/models/videomae/test_image_processing_videomae.py +++ b/tests/models/videomae/test_image_processing_videomae.py @@ -177,14 +177,22 @@ def test_call_numpy_4_channels(self): # Test not batched input encoded_videos = image_processing( - video_inputs[0], return_tensors="pt", image_mean=0, image_std=1, input_data_format="channels_first" + video_inputs[0], + return_tensors="pt", + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), + input_data_format="channels_first", ).pixel_values expected_output_video_shape = self.image_processor_tester.expected_output_image_shape([encoded_videos[0]]) self.assertEqual(tuple(encoded_videos.shape), (1, *expected_output_video_shape)) # Test batched encoded_videos = image_processing( - video_inputs, return_tensors="pt", image_mean=0, image_std=1, input_data_format="channels_first" + video_inputs, + return_tensors="pt", + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), + input_data_format="channels_first", ).pixel_values expected_output_video_shape = self.image_processor_tester.expected_output_image_shape(encoded_videos) self.assertEqual( diff --git a/tests/models/vitmatte/test_image_processing_vitmatte.py b/tests/models/vitmatte/test_image_processing_vitmatte.py index e57fce61747d..a17968a1d567 100644 --- a/tests/models/vitmatte/test_image_processing_vitmatte.py +++ b/tests/models/vitmatte/test_image_processing_vitmatte.py @@ -220,8 +220,8 @@ def test_call_numpy_4_channels(self): images=image, trimaps=trimap, input_data_format="channels_last", - image_mean=0, - image_std=1, + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), return_tensors="pt", ).pixel_values diff --git a/tests/models/vitpose/test_image_processing_vitpose.py b/tests/models/vitpose/test_image_processing_vitpose.py index 44d9ddf8eb59..c0ede8e22de0 100644 --- a/tests/models/vitpose/test_image_processing_vitpose.py +++ b/tests/models/vitpose/test_image_processing_vitpose.py @@ -205,8 +205,8 @@ def 
test_call_numpy_4_channels(self): boxes=boxes, return_tensors="pt", input_data_format="channels_last", - image_mean=0, - image_std=1, + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), ).pixel_values expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) self.assertEqual(tuple(encoded_images.shape), (len(boxes[0]), *expected_output_image_shape)) @@ -218,8 +218,8 @@ def test_call_numpy_4_channels(self): boxes=boxes, return_tensors="pt", input_data_format="channels_last", - image_mean=0, - image_std=1, + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), ).pixel_values expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) self.assertEqual( diff --git a/tests/models/vivit/test_image_processing_vivit.py b/tests/models/vivit/test_image_processing_vivit.py index bf61fc1082b2..323dbd3cc55f 100644 --- a/tests/models/vivit/test_image_processing_vivit.py +++ b/tests/models/vivit/test_image_processing_vivit.py @@ -191,14 +191,22 @@ def test_call_numpy_4_channels(self): # Test not batched input encoded_videos = image_processing( - video_inputs[0], return_tensors="pt", image_mean=0, image_std=1, input_data_format="channels_first" + video_inputs[0], + return_tensors="pt", + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), + input_data_format="channels_first", ).pixel_values expected_output_video_shape = self.image_processor_tester.expected_output_image_shape([encoded_videos[0]]) self.assertEqual(tuple(encoded_videos.shape), (1, *expected_output_video_shape)) # Test batched encoded_videos = image_processing( - video_inputs, return_tensors="pt", image_mean=0, image_std=1, input_data_format="channels_first" + video_inputs, + return_tensors="pt", + image_mean=(0.0, 0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0, 1.0), + input_data_format="channels_first", ).pixel_values expected_output_video_shape = 
self.image_processor_tester.expected_output_image_shape(encoded_videos) self.assertEqual( From 909b98ed81fcab8fba0fdc26da3a84330c4107e4 Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 25 Sep 2025 12:56:42 +0200 Subject: [PATCH 19/28] typo.and fix copies --- src/transformers/image_processing_utils_fast.py | 2 +- src/transformers/models/lfm2_vl/processing_lfm2_vl.py | 2 -- src/transformers/utils/type_validators.py | 4 ++-- tests/models/tvp/test_image_processing_tvp.py | 8 ++++---- 4 files changed, 7 insertions(+), 9 deletions(-) diff --git a/src/transformers/image_processing_utils_fast.py b/src/transformers/image_processing_utils_fast.py index 01aab0eee3e7..975ddb282061 100644 --- a/src/transformers/image_processing_utils_fast.py +++ b/src/transformers/image_processing_utils_fast.py @@ -179,7 +179,7 @@ class DefaultFastImageProcessorKwargs(TypedDict, total=False): image_mean: Optional[Union[float, list[float], tuple[float, ...]]] image_std: Optional[Union[float, list[float], tuple[float, ...]]] do_pad: Optional[bool] - crop_size: Annotated[Optional[Union[int, list[int], tuple[int, ...], dict[str, int]]], image_size_validator()] + pad_size: Annotated[Optional[Union[int, list[int], tuple[int, ...], dict[str, int]]], image_size_validator()] do_convert_rgb: Optional[bool] return_tensors: Annotated[Optional[Union[str, TensorType]], tensor_type_validator()] data_format: Optional[Union[str, ChannelDimension]] diff --git a/src/transformers/models/lfm2_vl/processing_lfm2_vl.py b/src/transformers/models/lfm2_vl/processing_lfm2_vl.py index 32965e6fc7ac..9495a1a3bda2 100755 --- a/src/transformers/models/lfm2_vl/processing_lfm2_vl.py +++ b/src/transformers/models/lfm2_vl/processing_lfm2_vl.py @@ -81,8 +81,6 @@ class Lfm2VlProcessor(ProcessorMixin): An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input. 
chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string. - use_image_special_tokens (`bool`, *optional*, defaults to `True`): - Whether to use image special tokens or not when processing. """ attributes = ["image_processor", "tokenizer"] diff --git a/src/transformers/utils/type_validators.py b/src/transformers/utils/type_validators.py index ee6ff2be956f..4b56f7842480 100644 --- a/src/transformers/utils/type_validators.py +++ b/src/transformers/utils/type_validators.py @@ -135,11 +135,11 @@ def truncation_validator(value: Optional[Union[bool, str, TruncationStrategy]] = @as_validated_field -def image_size_validator(value: Optional[dict[str, int]] = None): +def image_size_validator(value: Optional[Union[int, Sequence[int], dict[str, int]]] = None): possible_keys = ["height", "width", "longest_edge", "shortest_edge", "max_height", "max_width"] if value is None: pass - elif not isinstance(value, dict) or any(k not in possible_keys for k in value.keys()): + elif isinstance(value, dict) and any(k not in possible_keys for k in value.keys()): raise ValueError(f"Value for size must be a dict with keys {possible_keys} but got size={value}") diff --git a/tests/models/tvp/test_image_processing_tvp.py b/tests/models/tvp/test_image_processing_tvp.py index 390ed75a913b..6d454daf9e4b 100644 --- a/tests/models/tvp/test_image_processing_tvp.py +++ b/tests/models/tvp/test_image_processing_tvp.py @@ -276,8 +276,8 @@ def test_call_numpy_4_channels(self): encoded_videos = image_processing( test_inputs[0], return_tensors="pt", - image_mean=(0.0, 0.0, 0.0, 0.0), - image_std=(1.0, 1.0, 1.0, 1.0), + image_mean=(0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0), input_data_format="channels_first", ).pixel_values self.assertListEqual( @@ -298,8 +298,8 @@ def test_call_numpy_4_channels(self): encoded_videos = image_processing( test_inputs, return_tensors="pt", - image_mean=(0.0, 0.0, 0.0, 0.0), - image_std=(1.0, 1.0, 
1.0, 1.0), + image_mean=(0.0, 0.0, 0.0), + image_std=(1.0, 1.0, 1.0), input_data_format="channels_first", ).pixel_values self.assertListEqual( From 4410dd3aa632066b7a7677f7bd43b3984b2dbefd Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 25 Sep 2025 17:21:55 +0200 Subject: [PATCH 20/28] bump --- setup.py | 2 +- src/transformers/dependency_versions_table.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 3945537c49ff..a498aed58805 100644 --- a/setup.py +++ b/setup.py @@ -114,7 +114,7 @@ "GitPython<3.1.19", "hf-doc-builder>=0.3.0", "hf_xet", - "huggingface-hub==1.0.0.rc1", + "huggingface-hub==1.0.0.rc2", "importlib_metadata", "ipadic>=1.0.0,<2.0", "jinja2>=3.1.0", diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 80b107d93c4d..4f6c65966713 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -23,7 +23,7 @@ "GitPython": "GitPython<3.1.19", "hf-doc-builder": "hf-doc-builder>=0.3.0", "hf_xet": "hf_xet", - "huggingface-hub": "huggingface-hub==1.0.0.rc1", + "huggingface-hub": "huggingface-hub==1.0.0.rc2", "importlib_metadata": "importlib_metadata", "ipadic": "ipadic>=1.0.0,<2.0", "jinja2": "jinja2>=3.1.0", From 1daa883fe176da4a0c614452484e26a357cc5c76 Mon Sep 17 00:00:00 2001 From: raushan Date: Fri, 3 Oct 2025 15:38:08 +0200 Subject: [PATCH 21/28] style --- src/transformers/image_processing_utils_fast.py | 4 ++-- .../models/llava_onevision/modular_llava_onevision.py | 2 +- src/transformers/processing_utils.py | 2 -- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/transformers/image_processing_utils_fast.py b/src/transformers/image_processing_utils_fast.py index 157ad6b6acdd..09da3e263470 100644 --- a/src/transformers/image_processing_utils_fast.py +++ b/src/transformers/image_processing_utils_fast.py @@ -15,7 +15,7 @@ from collections.abc import Iterable from copy import deepcopy from 
functools import lru_cache, partial -from typing import Annotated, Any, Optional, TypedDict, Union +from typing import Any, Optional, Union import numpy as np @@ -50,7 +50,7 @@ logging, ) from .utils.import_utils import is_rocm_platform -from .utils.type_validators import TypedDictAdapter, device_validator, image_size_validator, tensor_type_validator +from .utils.type_validators import TypedDictAdapter if is_vision_available(): diff --git a/src/transformers/models/llava_onevision/modular_llava_onevision.py b/src/transformers/models/llava_onevision/modular_llava_onevision.py index f2d78dfaf81a..88d1c10ab122 100644 --- a/src/transformers/models/llava_onevision/modular_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modular_llava_onevision.py @@ -35,7 +35,7 @@ from ...cache_utils import Cache from ...image_processing_utils import BatchFeature -from ...image_processing_utils_fast import group_images_by_shape, reorder_images +from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images from ...image_utils import ( OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index c67d9a49b76d..8ad4d57d1d10 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -84,8 +84,6 @@ if is_torch_available(): - import torch - from .modeling_utils import PreTrainedAudioTokenizerBase From b8385a201276d2cb31d4c38f13efed01176f7903 Mon Sep 17 00:00:00 2001 From: raushan Date: Tue, 7 Oct 2025 19:19:17 +0200 Subject: [PATCH 22/28] fix some tests --- .../models/deepseek_vl/image_processing_deepseek_vl.py | 2 +- .../image_processing_deepseek_vl_hybrid.py | 10 +++++----- .../deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py | 10 +++++----- .../efficientnet/image_processing_efficientnet.py | 4 ++-- src/transformers/models/eomt/image_processing_eomt.py | 4 ++-- .../models/janus/image_processing_janus.py | 2 +- 
.../models/pixtral/image_processing_pixtral.py | 4 ++-- .../models/qwen2_5_omni/processing_qwen2_5_omni.py | 2 +- .../models/qwen3_omni_moe/processing_qwen3_omni_moe.py | 4 ++-- src/transformers/processing_utils.py | 10 +++++++++- 10 files changed, 30 insertions(+), 22 deletions(-) diff --git a/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py b/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py index c41ac586753e..88dcbff6c416 100644 --- a/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py +++ b/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py @@ -56,7 +56,7 @@ class DeepseekVLImageProcessorKwargs(ImagesKwargs): falls below this value after resizing. """ - min_size: int + min_size: Optional[int] class DeepseekVLImageProcessor(BaseImageProcessor): diff --git a/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py index 8b93f7fa6c94..091e02742d89 100644 --- a/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py @@ -69,11 +69,11 @@ class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs): number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method. 
""" - min_size: int - high_res_size: dict - high_res_resample: "PILImageResampling" - high_res_image_mean: list[float] - high_res_image_std: list[float] + min_size: Optional[int] + high_res_size: Optional[dict] + high_res_resample: Optional[Union["PILImageResampling", int]] + high_res_image_mean: Optional[Union[float, list[float], tuple[float, ...]]] + high_res_image_std: Optional[Union[float, list[float], tuple[float, ...]]] class DeepseekVLHybridImageProcessor(BaseImageProcessor): diff --git a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py index 4135623743ae..7a745a351c9e 100644 --- a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py @@ -448,11 +448,11 @@ class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs): number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method. 
""" - min_size: int - high_res_size: dict - high_res_resample: "PILImageResampling" - high_res_image_mean: list[float] - high_res_image_std: list[float] + min_size: Optional[int] + high_res_size: Optional[dict] + high_res_resample: Optional[Union["PILImageResampling", int]] + high_res_image_mean: Optional[Union[float, list[float], tuple[float, ...]]] + high_res_image_std: Optional[Union[float, list[float], tuple[float, ...]]] class DeepseekVLHybridImageProcessor(DeepseekVLImageProcessor): diff --git a/src/transformers/models/efficientnet/image_processing_efficientnet.py b/src/transformers/models/efficientnet/image_processing_efficientnet.py index f5a69eff70e4..4868f573a517 100644 --- a/src/transformers/models/efficientnet/image_processing_efficientnet.py +++ b/src/transformers/models/efficientnet/image_processing_efficientnet.py @@ -52,8 +52,8 @@ class EfficientNetImageProcessorKwargs(ImagesKwargs): Normalize the image again with the standard deviation only for image classification if set to True. """ - rescale_offset: bool - include_top: bool + rescale_offset: Optional[bool] + include_top: Optional[bool] class EfficientNetImageProcessor(BaseImageProcessor): diff --git a/src/transformers/models/eomt/image_processing_eomt.py b/src/transformers/models/eomt/image_processing_eomt.py index 3381e5bcac50..55e57e9b2343 100644 --- a/src/transformers/models/eomt/image_processing_eomt.py +++ b/src/transformers/models/eomt/image_processing_eomt.py @@ -66,8 +66,8 @@ class EomtImageProcessorKwargs(ImagesKwargs): denoted with 0 (background) will be replaced with `ignore_index`. 
""" - do_split_image: bool - ignore_index: Optional[int] = None + do_split_image: Optional[bool] + ignore_index: Optional[int] # Adapted from transformers.models.maskformer.image_processing_maskformer.convert_segmentation_map_to_binary_masks diff --git a/src/transformers/models/janus/image_processing_janus.py b/src/transformers/models/janus/image_processing_janus.py index 06ea0fe0e4d1..b342966ece55 100644 --- a/src/transformers/models/janus/image_processing_janus.py +++ b/src/transformers/models/janus/image_processing_janus.py @@ -58,7 +58,7 @@ class JanusImageProcessorKwargs(ImagesKwargs): falls below this value after resizing. """ - min_size: int + min_size: Optional[int] class JanusImageProcessor(BaseImageProcessor): diff --git a/src/transformers/models/pixtral/image_processing_pixtral.py b/src/transformers/models/pixtral/image_processing_pixtral.py index f5df895e66a4..864e76eb0e57 100644 --- a/src/transformers/models/pixtral/image_processing_pixtral.py +++ b/src/transformers/models/pixtral/image_processing_pixtral.py @@ -52,11 +52,11 @@ class PixtralImageProcessorKwargs(ImagesKwargs): """ - patch_size (`dict[str, int]` *optional*, defaults to `{"height": 16, "width": 16}`): + patch_size (`Union[dict[str, int], int]` *optional*, defaults to `{"height": 16, "width": 16}`): Size of the patches in the model, used to calculate the output image size. Can be overridden by `patch_size` in the `preprocess` method. """ - patch_size: Optional[dict[str, int]] + patch_size: Optional[Union[dict[str, int], int]] # Adapted from function in image_transforms.py to ensure any transparent pixels are converted to white. 
diff --git a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py index 95f687e1414a..fb0ec1c89420 100644 --- a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py @@ -42,7 +42,7 @@ class Qwen2_5_OmniVideosKwargs(VideosKwargs): max_frames: Optional[int] use_audio_in_video: Optional[bool] seconds_per_chunk: Optional[float] - position_id_per_seconds: Optional[int] + position_id_per_seconds: Optional[Union[int, float]] class Qwen2_5OmniProcessorKwargs(ProcessingKwargs, total=False): diff --git a/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py index a118f7d2260b..70988b3b77c5 100644 --- a/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py @@ -20,7 +20,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import re -from typing import Optional +from typing import Optional, Union import numpy as np @@ -44,7 +44,7 @@ class Qwen3OmniMoeVideosKwargs(VideosKwargs): max_frames: Optional[int] use_audio_in_video: Optional[bool] seconds_per_chunk: Optional[float] - position_id_per_seconds: Optional[int] + position_id_per_seconds: Optional[Union[int, float]] class Qwen3OmniMoeProcessorKwargs(ProcessingKwargs, total=False): diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 8ad4d57d1d10..17af2698ae56 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -1389,8 +1389,16 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg for kwarg in output_kwargs.values(): kwarg.update(common_kwargs) - # Finally perform type validation on collected kwargs for key, typed_dict_obj in ModelProcessorKwargs.__annotations__.items(): + if key in map_preprocessor_kwargs: + preprocessor = getattr(self, map_preprocessor_kwargs[key], None) + if preprocessor is None or getattr(preprocessor, "valid_kwargs", None) is None: + continue + preprocessor_typed_dict_obj = getattr(preprocessor, "valid_kwargs") + typed_dict_obj = TypedDict( + "merged_typed_dict", + {**preprocessor_typed_dict_obj.__annotations__, **typed_dict_obj.__annotations__}, + ) type_validator = TypedDictAdapter(typed_dict_obj) type_validator.validate_fields(**output_kwargs[key]) return output_kwargs From 69448bb1341f2895090670763383d33c4ba5d08d Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 8 Oct 2025 11:31:07 +0200 Subject: [PATCH 23/28] fix copies --- src/transformers/models/janus/modular_janus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/janus/modular_janus.py b/src/transformers/models/janus/modular_janus.py index a2f2541d84fa..307bbbd38890 100644 --- a/src/transformers/models/janus/modular_janus.py +++ b/src/transformers/models/janus/modular_janus.py @@ -1296,7 +1296,7 @@ 
class JanusImageProcessorKwargs(ImagesKwargs): falls below this value after resizing. """ - min_size: int + min_size: Optional[int] class JanusImageProcessor(BlipImageProcessor): From d25361509f358c01a008a492f52f2093f9cae31b Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 8 Oct 2025 13:42:33 +0200 Subject: [PATCH 24/28] pin rc4 and mark all TypedDict as non-total --- setup.py | 2 +- src/transformers/dependency_versions_table.py | 2 +- src/transformers/image_processing_utils_fast.py | 5 ++--- src/transformers/models/beit/image_processing_beit.py | 2 +- .../bridgetower/image_processing_bridgetower.py | 2 +- .../image_processing_cohere2_vision_fast.py | 2 +- .../models/cohere2_vision/modular_cohere2_vision.py | 2 +- .../image_processing_conditional_detr.py | 2 +- .../models/convnext/image_processing_convnext.py | 2 +- .../deepseek_vl/image_processing_deepseek_vl.py | 2 +- .../image_processing_deepseek_vl_hybrid.py | 2 +- .../deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py | 2 +- .../image_processing_deformable_detr.py | 2 +- src/transformers/models/detr/image_processing_detr.py | 2 +- .../models/donut/image_processing_donut.py | 2 +- src/transformers/models/dpt/image_processing_dpt.py | 2 +- .../efficientloftr/image_processing_efficientloftr.py | 2 +- .../efficientnet/image_processing_efficientnet.py | 2 +- src/transformers/models/emu3/image_processing_emu3.py | 2 +- src/transformers/models/eomt/image_processing_eomt.py | 2 +- .../models/flava/image_processing_flava.py | 2 +- .../models/gemma3/image_processing_gemma3.py | 2 +- .../models/glm4v/image_processing_glm4v.py | 2 +- .../models/glm4v/video_processing_glm4v.py | 2 +- .../models/got_ocr2/image_processing_got_ocr2.py | 2 +- .../grounding_dino/image_processing_grounding_dino.py | 2 +- .../models/idefics/image_processing_idefics.py | 2 +- .../models/idefics2/image_processing_idefics2.py | 2 +- .../models/idefics3/image_processing_idefics3.py | 2 +- .../models/imagegpt/image_processing_imagegpt.py | 2 +- 
.../video_processing_instructblipvideo.py | 2 +- .../models/internvl/video_processing_internvl.py | 2 +- .../models/janus/image_processing_janus.py | 2 +- src/transformers/models/janus/modular_janus.py | 2 +- .../models/kosmos2_5/image_processing_kosmos2_5.py | 2 +- .../models/layoutlmv2/image_processing_layoutlmv2.py | 2 +- .../models/layoutlmv3/image_processing_layoutlmv3.py | 2 +- .../models/lfm2_vl/image_processing_lfm2_vl_fast.py | 2 +- .../models/llama4/image_processing_llama4_fast.py | 2 +- .../models/llava_next/image_processing_llava_next.py | 2 +- .../image_processing_llava_onevision.py | 2 +- .../mask2former/image_processing_mask2former.py | 2 +- .../models/maskformer/image_processing_maskformer.py | 2 +- .../models/mllama/image_processing_mllama.py | 2 +- .../mobilenet_v2/image_processing_mobilenet_v2.py | 2 +- .../models/mobilevit/image_processing_mobilevit.py | 2 +- .../models/nougat/image_processing_nougat.py | 2 +- .../models/oneformer/image_processing_oneformer.py | 2 +- .../models/ovis2/image_processing_ovis2.py | 2 +- .../image_processing_perception_lm_fast.py | 2 +- .../image_processing_phi4_multimodal_fast.py | 2 +- .../models/pix2struct/image_processing_pix2struct.py | 2 +- .../models/pixtral/image_processing_pixtral.py | 2 +- .../models/poolformer/image_processing_poolformer.py | 2 +- .../image_processing_prompt_depth_anything.py | 2 +- .../models/qwen2_5_omni/processing_qwen2_5_omni.py | 2 +- .../models/qwen2_vl/image_processing_qwen2_vl.py | 2 +- .../models/qwen2_vl/video_processing_qwen2_vl.py | 2 +- .../qwen3_omni_moe/processing_qwen3_omni_moe.py | 2 +- .../models/qwen3_vl/video_processing_qwen3_vl.py | 2 +- .../models/rt_detr/image_processing_rt_detr.py | 2 +- src/transformers/models/sam/image_processing_sam.py | 2 +- src/transformers/models/sam/processing_sam.py | 2 +- .../models/sam2/image_processing_sam2_fast.py | 2 +- src/transformers/models/sam2/modular_sam2.py | 2 +- src/transformers/models/sam_hq/processing_samhq.py | 2 +- 
.../models/segformer/image_processing_segformer.py | 2 +- .../models/siglip2/image_processing_siglip2.py | 2 +- .../models/smolvlm/image_processing_smolvlm.py | 2 +- .../models/smolvlm/video_processing_smolvlm.py | 2 +- .../models/superpoint/image_processing_superpoint.py | 2 +- .../models/swin2sr/image_processing_swin2sr.py | 2 +- .../models/textnet/image_processing_textnet.py | 2 +- src/transformers/models/tvp/image_processing_tvp.py | 2 +- src/transformers/models/vilt/image_processing_vilt.py | 2 +- .../models/vitmatte/image_processing_vitmatte.py | 2 +- .../models/yolos/image_processing_yolos.py | 2 +- .../models/zoedepth/image_processing_zoedepth.py | 2 +- src/transformers/processing_utils.py | 6 +++--- src/transformers/utils/type_validators.py | 11 +---------- src/transformers/video_processing_utils.py | 5 ++--- 81 files changed, 85 insertions(+), 96 deletions(-) diff --git a/setup.py b/setup.py index 41743d8e2ef1..462a97c44930 100644 --- a/setup.py +++ b/setup.py @@ -114,7 +114,7 @@ "GitPython<3.1.19", "hf-doc-builder>=0.3.0", "hf_xet", - "huggingface-hub==1.0.0.rc2", + "huggingface-hub==1.0.0.rc4", "importlib_metadata", "ipadic>=1.0.0,<2.0", "jinja2>=3.1.0", diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index a6b6a9c445e6..1caefce16c3e 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -23,7 +23,7 @@ "GitPython": "GitPython<3.1.19", "hf-doc-builder": "hf-doc-builder>=0.3.0", "hf_xet": "hf_xet", - "huggingface-hub": "huggingface-hub==1.0.0.rc2", + "huggingface-hub": "huggingface-hub==1.0.0.rc4", "importlib_metadata": "importlib_metadata", "ipadic": "ipadic>=1.0.0,<2.0", "jinja2": "jinja2>=3.1.0", diff --git a/src/transformers/image_processing_utils_fast.py b/src/transformers/image_processing_utils_fast.py index 09da3e263470..8d4c79afbb1d 100644 --- a/src/transformers/image_processing_utils_fast.py +++ 
b/src/transformers/image_processing_utils_fast.py @@ -18,6 +18,7 @@ from typing import Any, Optional, Union import numpy as np +from huggingface_hub.dataclasses import validate_typed_dict from .image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict from .image_transforms import ( @@ -50,7 +51,6 @@ logging, ) from .utils.import_utils import is_rocm_platform -from .utils.type_validators import TypedDictAdapter if is_vision_available(): @@ -713,8 +713,7 @@ def preprocess(self, images: ImageInput, *args, **kwargs: Unpack[ImagesKwargs]) validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_kwargs_names) # Perform type validation on received kwargs - type_validator = TypedDictAdapter(self.valid_kwargs) - type_validator.validate_fields(**kwargs) + validate_typed_dict(self.valid_kwargs, kwargs) # Set default kwargs from self. This ensures that if a kwarg is not provided # by the user, it gets its default value from the instance, or is set to None. diff --git a/src/transformers/models/beit/image_processing_beit.py b/src/transformers/models/beit/image_processing_beit.py index f65709168379..9025e2868df8 100644 --- a/src/transformers/models/beit/image_processing_beit.py +++ b/src/transformers/models/beit/image_processing_beit.py @@ -55,7 +55,7 @@ logger = logging.get_logger(__name__) -class BeitImageProcessorKwargs(ImagesKwargs): +class BeitImageProcessorKwargs(ImagesKwargs, total=False): r""" do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): Whether or not to reduce all label values of segmentation maps by 1. 
Usually used for datasets where 0 diff --git a/src/transformers/models/bridgetower/image_processing_bridgetower.py b/src/transformers/models/bridgetower/image_processing_bridgetower.py index cad23d02893f..c44cf6a7ee3d 100644 --- a/src/transformers/models/bridgetower/image_processing_bridgetower.py +++ b/src/transformers/models/bridgetower/image_processing_bridgetower.py @@ -123,7 +123,7 @@ def get_resize_output_image_size( return new_height, new_width -class BridgeTowerImageProcessorKwargs(ImagesKwargs): +class BridgeTowerImageProcessorKwargs(ImagesKwargs, total=False): size_divisor: Optional[int] diff --git a/src/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py b/src/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py index 358d84ac6d7c..638b7549bfae 100644 --- a/src/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +++ b/src/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py @@ -33,7 +33,7 @@ from ...utils import TensorType, auto_docstring -class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs): +class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs, total=False): """ crop_to_patches (`bool`, *optional*, defaults to `False`): Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the diff --git a/src/transformers/models/cohere2_vision/modular_cohere2_vision.py b/src/transformers/models/cohere2_vision/modular_cohere2_vision.py index 997a6f2d638e..5df3e075ed20 100644 --- a/src/transformers/models/cohere2_vision/modular_cohere2_vision.py +++ b/src/transformers/models/cohere2_vision/modular_cohere2_vision.py @@ -303,7 +303,7 @@ def get_optimal_tiled_canvas( return best_grid -class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs): +class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs, total=False): """ crop_to_patches (`bool`, *optional*, defaults to `False`): Whether to crop the image to patches. 
Can be overridden by the `crop_to_patches` parameter in the diff --git a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py index 6df784585e9b..efc532d413c4 100644 --- a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py +++ b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py @@ -729,7 +729,7 @@ def compute_segments( return segmentation, segments -class ConditionalDetrImageProcessorKwargs(ImagesKwargs): +class ConditionalDetrImageProcessorKwargs(ImagesKwargs, total=False): r""" format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): Data format of the annotations. One of "coco_detection" or "coco_panoptic". diff --git a/src/transformers/models/convnext/image_processing_convnext.py b/src/transformers/models/convnext/image_processing_convnext.py index d2e180de2464..95e9cd91bd4a 100644 --- a/src/transformers/models/convnext/image_processing_convnext.py +++ b/src/transformers/models/convnext/image_processing_convnext.py @@ -50,7 +50,7 @@ logger = logging.get_logger(__name__) -class ConvNextImageProcessorKwargs(ImagesKwargs): +class ConvNextImageProcessorKwargs(ImagesKwargs, total=False): """ crop_pct (`float`, *optional*): Percentage of the image to crop. Only has an effect if size < 384. 
Can be diff --git a/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py b/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py index 88dcbff6c416..f3f4c1ac6e34 100644 --- a/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py +++ b/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py @@ -49,7 +49,7 @@ logger = logging.get_logger(__name__) -class DeepseekVLImageProcessorKwargs(ImagesKwargs): +class DeepseekVLImageProcessorKwargs(ImagesKwargs, total=False): r""" min_size (`int`, *optional*, defaults to 14): The minimum allowed size for the resized image. Ensures that neither the height nor width diff --git a/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py index 091e02742d89..7837cff2d33b 100644 --- a/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py @@ -50,7 +50,7 @@ logger = logging.get_logger(__name__) -class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs): +class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs, total=False): r""" min_size (`int`, *optional*, defaults to 14): The minimum allowed size for the resized image. 
Ensures that neither the height nor width diff --git a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py index 7a745a351c9e..343ba01ac874 100644 --- a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py @@ -429,7 +429,7 @@ def prepare_inputs_for_generation( return model_inputs -class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs): +class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs, total=False): r""" min_size (`int`, *optional*, defaults to 14): The minimum allowed size for the resized image. Ensures that neither the height nor width diff --git a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py index eabdb536ff70..9fd63b6340c6 100644 --- a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py +++ b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py @@ -82,7 +82,7 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -class DeformableDetrImageProcessorKwargs(ImagesKwargs): +class DeformableDetrImageProcessorKwargs(ImagesKwargs, total=False): r""" format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): Data format of the annotations. One of "coco_detection" or "coco_panoptic". 
diff --git a/src/transformers/models/detr/image_processing_detr.py b/src/transformers/models/detr/image_processing_detr.py index 02261fc2a129..1cd636a0cb72 100644 --- a/src/transformers/models/detr/image_processing_detr.py +++ b/src/transformers/models/detr/image_processing_detr.py @@ -84,7 +84,7 @@ SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) -class DetrImageProcessorKwargs(ImagesKwargs): +class DetrImageProcessorKwargs(ImagesKwargs, total=False): r""" format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): Data format of the annotations. One of "coco_detection" or "coco_panoptic". diff --git a/src/transformers/models/donut/image_processing_donut.py b/src/transformers/models/donut/image_processing_donut.py index 5af365099724..802b05c776fd 100644 --- a/src/transformers/models/donut/image_processing_donut.py +++ b/src/transformers/models/donut/image_processing_donut.py @@ -52,7 +52,7 @@ import PIL -class DonutImageProcessorKwargs(ImagesKwargs): +class DonutImageProcessorKwargs(ImagesKwargs, total=False): """ do_thumbnail (`bool`, *optional*, defaults to `self.do_thumbnail`): Whether to resize the image using thumbnail method. diff --git a/src/transformers/models/dpt/image_processing_dpt.py b/src/transformers/models/dpt/image_processing_dpt.py index 3ba5a6e30c21..71e930a7bfcf 100644 --- a/src/transformers/models/dpt/image_processing_dpt.py +++ b/src/transformers/models/dpt/image_processing_dpt.py @@ -64,7 +64,7 @@ logger = logging.get_logger(__name__) -class DPTImageProcessorKwargs(ImagesKwargs): +class DPTImageProcessorKwargs(ImagesKwargs, total=False): """ ensure_multiple_of (`int`, *optional*, defaults to 1): If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. 
Can be overridden diff --git a/src/transformers/models/efficientloftr/image_processing_efficientloftr.py b/src/transformers/models/efficientloftr/image_processing_efficientloftr.py index d1beabb6c2b9..a5c06be89e98 100644 --- a/src/transformers/models/efficientloftr/image_processing_efficientloftr.py +++ b/src/transformers/models/efficientloftr/image_processing_efficientloftr.py @@ -50,7 +50,7 @@ logger = logging.get_logger(__name__) -class EfficientLoFTRImageProcessorKwargs(ImagesKwargs): +class EfficientLoFTRImageProcessorKwargs(ImagesKwargs, total=False): r""" do_grayscale (`bool`, *optional*, defaults to `True`): Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method. diff --git a/src/transformers/models/efficientnet/image_processing_efficientnet.py b/src/transformers/models/efficientnet/image_processing_efficientnet.py index 4868f573a517..0a3a7542ff67 100644 --- a/src/transformers/models/efficientnet/image_processing_efficientnet.py +++ b/src/transformers/models/efficientnet/image_processing_efficientnet.py @@ -44,7 +44,7 @@ logger = logging.get_logger(__name__) -class EfficientNetImageProcessorKwargs(ImagesKwargs): +class EfficientNetImageProcessorKwargs(ImagesKwargs, total=False): """ rescale_offset (`bool`, *optional*, defaults to `self.rescale_offset`): Whether to rescale the image between [-max_range/2, scale_range/2] instead of [0, scale_range]. 
diff --git a/src/transformers/models/emu3/image_processing_emu3.py b/src/transformers/models/emu3/image_processing_emu3.py index fca5316a3fca..ec270fef2c87 100644 --- a/src/transformers/models/emu3/image_processing_emu3.py +++ b/src/transformers/models/emu3/image_processing_emu3.py @@ -47,7 +47,7 @@ logger = logging.get_logger(__name__) -class Emu3ImageProcessorKwargs(ImagesKwargs): +class Emu3ImageProcessorKwargs(ImagesKwargs, total=False): ratio: Optional[str] image_area: Optional[int] diff --git a/src/transformers/models/eomt/image_processing_eomt.py b/src/transformers/models/eomt/image_processing_eomt.py index 55e57e9b2343..b422ca3020d9 100644 --- a/src/transformers/models/eomt/image_processing_eomt.py +++ b/src/transformers/models/eomt/image_processing_eomt.py @@ -55,7 +55,7 @@ import torch.nn.functional as F -class EomtImageProcessorKwargs(ImagesKwargs): +class EomtImageProcessorKwargs(ImagesKwargs, total=False): """ do_split_image (`bool`, *optional*, defaults to `False`): Whether to split the input images into overlapping patches for semantic segmentation. If set to `True`, the diff --git a/src/transformers/models/flava/image_processing_flava.py b/src/transformers/models/flava/image_processing_flava.py index 3c19a2405169..2005011dcb2b 100644 --- a/src/transformers/models/flava/image_processing_flava.py +++ b/src/transformers/models/flava/image_processing_flava.py @@ -57,7 +57,7 @@ LOGIT_LAPLACE_EPS: float = 0.1 -class FlavaImageProcessorKwargs(ImagesKwargs): +class FlavaImageProcessorKwargs(ImagesKwargs, total=False): """ return_image_mask (`bool`, *optional*, defaults to `False`): Whether to return the image mask. Can be overridden by the `return_image_mask` parameter in `preprocess`. 
diff --git a/src/transformers/models/gemma3/image_processing_gemma3.py b/src/transformers/models/gemma3/image_processing_gemma3.py index 5206a13a04a3..4b32a1f31a05 100644 --- a/src/transformers/models/gemma3/image_processing_gemma3.py +++ b/src/transformers/models/gemma3/image_processing_gemma3.py @@ -51,7 +51,7 @@ import PIL -class Gemma3ImageProcessorKwargs(ImagesKwargs): +class Gemma3ImageProcessorKwargs(ImagesKwargs, total=False): """ do_pan_and_scan (`bool`, *optional*): Whether to apply `pan_and_scan` to images. diff --git a/src/transformers/models/glm4v/image_processing_glm4v.py b/src/transformers/models/glm4v/image_processing_glm4v.py index 13f4472e61f3..e58d295ca465 100644 --- a/src/transformers/models/glm4v/image_processing_glm4v.py +++ b/src/transformers/models/glm4v/image_processing_glm4v.py @@ -47,7 +47,7 @@ logger = logging.get_logger(__name__) -class Glm4vImageProcessorKwargs(ImagesKwargs): +class Glm4vImageProcessorKwargs(ImagesKwargs, total=False): """ patch_size (`int`, *optional*, defaults to 14): The spatial patch size of the vision encoder. 
diff --git a/src/transformers/models/glm4v/video_processing_glm4v.py b/src/transformers/models/glm4v/video_processing_glm4v.py index 8324ad482baa..95ea8160c606 100644 --- a/src/transformers/models/glm4v/video_processing_glm4v.py +++ b/src/transformers/models/glm4v/video_processing_glm4v.py @@ -36,7 +36,7 @@ from .image_processing_glm4v import smart_resize -class Glm4vVideoProcessorInitKwargs(VideosKwargs): +class Glm4vVideoProcessorInitKwargs(VideosKwargs, total=False): max_image_size: Optional[dict[str, int]] patch_size: Optional[int] temporal_patch_size: Optional[int] diff --git a/src/transformers/models/got_ocr2/image_processing_got_ocr2.py b/src/transformers/models/got_ocr2/image_processing_got_ocr2.py index 3424020c65b3..5882f22b1bd0 100644 --- a/src/transformers/models/got_ocr2/image_processing_got_ocr2.py +++ b/src/transformers/models/got_ocr2/image_processing_got_ocr2.py @@ -49,7 +49,7 @@ logger = logging.get_logger(__name__) -class GotOcr2ImageProcessorKwargs(ImagesKwargs): +class GotOcr2ImageProcessorKwargs(ImagesKwargs, total=False): """ crop_to_patches (`bool`, *optional*, defaults to `False`): Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py index c099d44e3d58..f556b2d295c3 100644 --- a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py @@ -93,7 +93,7 @@ class AnnotationFormat(ExplicitEnum): SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) -class GroundingDinoImageProcessorKwargs(ImagesKwargs): +class GroundingDinoImageProcessorKwargs(ImagesKwargs, total=False): r""" format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): Data format of the annotations. 
One of "coco_detection" or "coco_panoptic". diff --git a/src/transformers/models/idefics/image_processing_idefics.py b/src/transformers/models/idefics/image_processing_idefics.py index 7fda46e3a990..6a53e4d9b7d4 100644 --- a/src/transformers/models/idefics/image_processing_idefics.py +++ b/src/transformers/models/idefics/image_processing_idefics.py @@ -36,7 +36,7 @@ IDEFICS_STANDARD_STD = [0.26862954, 0.26130258, 0.27577711] -class IdeficsImageProcessorKwargs(ImagesKwargs): +class IdeficsImageProcessorKwargs(ImagesKwargs, total=False): """ transform (`Callable`, *optional*): A custom transform function that accepts a single image can be passed for training. For example, diff --git a/src/transformers/models/idefics2/image_processing_idefics2.py b/src/transformers/models/idefics2/image_processing_idefics2.py index b9b741a9704b..1baec594bc90 100644 --- a/src/transformers/models/idefics2/image_processing_idefics2.py +++ b/src/transformers/models/idefics2/image_processing_idefics2.py @@ -47,7 +47,7 @@ from PIL import Image -class Idefics2ImageProcessorKwargs(ImagesKwargs): +class Idefics2ImageProcessorKwargs(ImagesKwargs, total=False): """ do_image_splitting (`bool`, *optional*, defaults to `False`): Whether to split the image into a sequence 4 equal sub-images concatenated with the original image. diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py index f098a9f54dc1..d53a75596fea 100644 --- a/src/transformers/models/idefics3/image_processing_idefics3.py +++ b/src/transformers/models/idefics3/image_processing_idefics3.py @@ -48,7 +48,7 @@ from PIL import Image -class Idefics3ImageProcessorKwargs(ImagesKwargs): +class Idefics3ImageProcessorKwargs(ImagesKwargs, total=False): """ do_image_splitting (`bool`, *optional*, defaults to `True`): Whether to split the image into sub-images concatenated with the original image. 
They are split into patches diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt.py b/src/transformers/models/imagegpt/image_processing_imagegpt.py index 8f79cd58ec5f..f5a1682d9be3 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt.py @@ -45,7 +45,7 @@ logger = logging.get_logger(__name__) -class ImageGPTImageProcessorKwargs(ImagesKwargs): +class ImageGPTImageProcessorKwargs(ImagesKwargs, total=False): """ clusters (`np.ndarray` or `list[list[int]]` or `torch.Tensor`, *optional*): The color clusters to use, of shape `(n_clusters, 3)` when color quantizing. Can be overridden by `clusters` diff --git a/src/transformers/models/instructblipvideo/video_processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/video_processing_instructblipvideo.py index d2fe3cc7f343..d89d9069d495 100644 --- a/src/transformers/models/instructblipvideo/video_processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/video_processing_instructblipvideo.py @@ -30,7 +30,7 @@ from ...video_utils import group_videos_by_shape, reorder_videos -class InstructBlipVideoVideoProcessorInitKwargs(VideosKwargs): ... +class InstructBlipVideoVideoProcessorInitKwargs(VideosKwargs, total=False): ... 
class InstructBlipVideoVideoProcessor(BaseVideoProcessor): diff --git a/src/transformers/models/internvl/video_processing_internvl.py b/src/transformers/models/internvl/video_processing_internvl.py index 0e3f0469fe5e..703c40b94f0c 100644 --- a/src/transformers/models/internvl/video_processing_internvl.py +++ b/src/transformers/models/internvl/video_processing_internvl.py @@ -27,7 +27,7 @@ from ...video_utils import VideoMetadata, group_videos_by_shape, reorder_videos -class InternVLVideoProcessorInitKwargs(VideosKwargs): +class InternVLVideoProcessorInitKwargs(VideosKwargs, total=False): initial_shift: Optional[Union[bool, float, int]] diff --git a/src/transformers/models/janus/image_processing_janus.py b/src/transformers/models/janus/image_processing_janus.py index b342966ece55..6b83ecf8eb5c 100644 --- a/src/transformers/models/janus/image_processing_janus.py +++ b/src/transformers/models/janus/image_processing_janus.py @@ -51,7 +51,7 @@ logger = logging.get_logger(__name__) -class JanusImageProcessorKwargs(ImagesKwargs): +class JanusImageProcessorKwargs(ImagesKwargs, total=False): r""" min_size (`int`, *optional*, defaults to 14): The minimum allowed size for the resized image. Ensures that neither the height nor width diff --git a/src/transformers/models/janus/modular_janus.py b/src/transformers/models/janus/modular_janus.py index 307bbbd38890..36fe7abae438 100644 --- a/src/transformers/models/janus/modular_janus.py +++ b/src/transformers/models/janus/modular_janus.py @@ -1289,7 +1289,7 @@ def generate( return generated_tokens -class JanusImageProcessorKwargs(ImagesKwargs): +class JanusImageProcessorKwargs(ImagesKwargs, total=False): r""" min_size (`int`, *optional*, defaults to 14): The minimum allowed size for the resized image. 
Ensures that neither the height nor width diff --git a/src/transformers/models/kosmos2_5/image_processing_kosmos2_5.py b/src/transformers/models/kosmos2_5/image_processing_kosmos2_5.py index 5f337e4b04c9..ca80cb978ec9 100644 --- a/src/transformers/models/kosmos2_5/image_processing_kosmos2_5.py +++ b/src/transformers/models/kosmos2_5/image_processing_kosmos2_5.py @@ -46,7 +46,7 @@ DEFAULT_FONT_PATH = "ybelkada/fonts" -class Kosmos2_5ImageProcessorKwargs(ImagesKwargs): +class Kosmos2_5ImageProcessorKwargs(ImagesKwargs, total=False): r""" patch_size (`Dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`): The patch size to use for the image. According to Kosmos2_5 paper and code, the patch size is 16x16. diff --git a/src/transformers/models/layoutlmv2/image_processing_layoutlmv2.py b/src/transformers/models/layoutlmv2/image_processing_layoutlmv2.py index d5a7e95537c5..5e9289c2701b 100644 --- a/src/transformers/models/layoutlmv2/image_processing_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/image_processing_layoutlmv2.py @@ -52,7 +52,7 @@ logger = logging.get_logger(__name__) -class LayoutLMv2ImageProcessorKwargs(ImagesKwargs): +class LayoutLMv2ImageProcessorKwargs(ImagesKwargs, total=False): r""" apply_ocr (`bool`, *optional*, defaults to `True`): Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. 
Can be overridden by diff --git a/src/transformers/models/layoutlmv3/image_processing_layoutlmv3.py b/src/transformers/models/layoutlmv3/image_processing_layoutlmv3.py index b9273dc75cad..9ba4f5507fc1 100644 --- a/src/transformers/models/layoutlmv3/image_processing_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/image_processing_layoutlmv3.py @@ -56,7 +56,7 @@ logger = logging.get_logger(__name__) -class LayoutLMv3ImageProcessorKwargs(ImagesKwargs): +class LayoutLMv3ImageProcessorKwargs(ImagesKwargs, total=False): r""" apply_ocr (`bool`, *optional*, defaults to `True`): Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by diff --git a/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py b/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py index ad99504fcad6..3d16a4d0a273 100755 --- a/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +++ b/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py @@ -169,7 +169,7 @@ def pad_along_first_dim( return images, pixel_mask -class Lfm2VlImageProcessorKwargs(ImagesKwargs): +class Lfm2VlImageProcessorKwargs(ImagesKwargs, total=False): """ downsample_factor (`int`, *optional*, defaults to `2`): The downsampling factor for images used when resizing the image. diff --git a/src/transformers/models/llama4/image_processing_llama4_fast.py b/src/transformers/models/llama4/image_processing_llama4_fast.py index e2678f556d02..42b2221d6a18 100644 --- a/src/transformers/models/llama4/image_processing_llama4_fast.py +++ b/src/transformers/models/llama4/image_processing_llama4_fast.py @@ -308,7 +308,7 @@ def get_best_fit( return optimal_canvas -class Llama4ImageProcessorKwargs(ImagesKwargs): +class Llama4ImageProcessorKwargs(ImagesKwargs, total=False): r""" max_patches (`int`, *optional*, defaults to 16): The maximum number of patches to be extracted from the image. 
diff --git a/src/transformers/models/llava_next/image_processing_llava_next.py b/src/transformers/models/llava_next/image_processing_llava_next.py index 56ebc10f391d..26ca94dad6e9 100644 --- a/src/transformers/models/llava_next/image_processing_llava_next.py +++ b/src/transformers/models/llava_next/image_processing_llava_next.py @@ -59,7 +59,7 @@ from PIL import Image -class LlavaNextImageProcessorKwargs(ImagesKwargs): +class LlavaNextImageProcessorKwargs(ImagesKwargs, total=False): r""" image_grid_pinpoints (`list[list[int]]`, *optional*): A list of possible resolutions to use for processing high resolution images. The best resolution is selected diff --git a/src/transformers/models/llava_onevision/image_processing_llava_onevision.py b/src/transformers/models/llava_onevision/image_processing_llava_onevision.py index 119df9550a2a..3654edcdbf71 100644 --- a/src/transformers/models/llava_onevision/image_processing_llava_onevision.py +++ b/src/transformers/models/llava_onevision/image_processing_llava_onevision.py @@ -58,7 +58,7 @@ from PIL import Image -class LlavaOnevisionImageProcessorKwargs(ImagesKwargs): +class LlavaOnevisionImageProcessorKwargs(ImagesKwargs, total=False): r""" image_grid_pinpoints (`list[list[int]]`, *optional*): A list of possible resolutions to use for processing high resolution images. The best resolution is selected diff --git a/src/transformers/models/mask2former/image_processing_mask2former.py b/src/transformers/models/mask2former/image_processing_mask2former.py index ce79107f05b3..235dbe8039f1 100644 --- a/src/transformers/models/mask2former/image_processing_mask2former.py +++ b/src/transformers/models/mask2former/image_processing_mask2former.py @@ -61,7 +61,7 @@ from torch import nn -class Mask2FormerImageProcessorKwargs(ImagesKwargs): +class Mask2FormerImageProcessorKwargs(ImagesKwargs, total=False): r""" ignore_index (`int`, *optional*): Label to be assigned to background pixels in segmentation maps. 
If provided, segmentation map pixels diff --git a/src/transformers/models/maskformer/image_processing_maskformer.py b/src/transformers/models/maskformer/image_processing_maskformer.py index 60e703405605..4046fcafb07a 100644 --- a/src/transformers/models/maskformer/image_processing_maskformer.py +++ b/src/transformers/models/maskformer/image_processing_maskformer.py @@ -67,7 +67,7 @@ from torch import nn -class MaskFormerImageProcessorKwargs(ImagesKwargs): +class MaskFormerImageProcessorKwargs(ImagesKwargs, total=False): r""" ignore_index (`int`, *optional*): Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels diff --git a/src/transformers/models/mllama/image_processing_mllama.py b/src/transformers/models/mllama/image_processing_mllama.py index cd79f7de3121..50579703b905 100644 --- a/src/transformers/models/mllama/image_processing_mllama.py +++ b/src/transformers/models/mllama/image_processing_mllama.py @@ -50,7 +50,7 @@ logger = logging.get_logger(__name__) -class MllamaImageProcessorKwargs(ImagesKwargs): +class MllamaImageProcessorKwargs(ImagesKwargs, total=False): """ max_image_tiles (`int`, *optional*): The maximum number of tiles allowed. diff --git a/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py b/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py index e8dfe992544a..ad4c6937b76a 100644 --- a/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py +++ b/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py @@ -51,7 +51,7 @@ logger = logging.get_logger(__name__) -class MobileNetV2ImageProcessorKwargs(ImagesKwargs): +class MobileNetV2ImageProcessorKwargs(ImagesKwargs, total=False): """ do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): Whether or not to reduce all label values of segmentation maps by 1. 
Usually used for datasets where 0 diff --git a/src/transformers/models/mobilevit/image_processing_mobilevit.py b/src/transformers/models/mobilevit/image_processing_mobilevit.py index 576ef9f449dc..8a914608295f 100644 --- a/src/transformers/models/mobilevit/image_processing_mobilevit.py +++ b/src/transformers/models/mobilevit/image_processing_mobilevit.py @@ -53,7 +53,7 @@ logger = logging.get_logger(__name__) -class MobileVitImageProcessorKwargs(ImagesKwargs): +class MobileVitImageProcessorKwargs(ImagesKwargs, total=False): """ do_flip_channel_order (`bool`, *optional*, defaults to `self.do_flip_channel_order`): Whether to flip the color channels from RGB to BGR or vice versa. diff --git a/src/transformers/models/nougat/image_processing_nougat.py b/src/transformers/models/nougat/image_processing_nougat.py index 0a5c445645e0..35db47bdf0c9 100644 --- a/src/transformers/models/nougat/image_processing_nougat.py +++ b/src/transformers/models/nougat/image_processing_nougat.py @@ -52,7 +52,7 @@ import PIL -class NougatImageProcessorKwargs(ImagesKwargs): +class NougatImageProcessorKwargs(ImagesKwargs, total=False): r""" do_crop_margin (`bool`, *optional*, defaults to `True`): Whether to crop the image margins. diff --git a/src/transformers/models/oneformer/image_processing_oneformer.py b/src/transformers/models/oneformer/image_processing_oneformer.py index 86ce8abf084e..058c655559a8 100644 --- a/src/transformers/models/oneformer/image_processing_oneformer.py +++ b/src/transformers/models/oneformer/image_processing_oneformer.py @@ -64,7 +64,7 @@ from torch import nn -class OneFormerImageProcessorKwargs(ImagesKwargs): +class OneFormerImageProcessorKwargs(ImagesKwargs, total=False): r""" repo_path (`str`, *optional*, defaults to `shi-labs/oneformer_demo`): Path to a local directory or Hugging Face Hub repository containing model metadata. 
diff --git a/src/transformers/models/ovis2/image_processing_ovis2.py b/src/transformers/models/ovis2/image_processing_ovis2.py index 2bc883f95e73..ab4ae57f0cb8 100644 --- a/src/transformers/models/ovis2/image_processing_ovis2.py +++ b/src/transformers/models/ovis2/image_processing_ovis2.py @@ -44,7 +44,7 @@ logger = logging.get_logger(__name__) -class Ovis2ImageProcessorKwargs(ImagesKwargs): +class Ovis2ImageProcessorKwargs(ImagesKwargs, total=False): """ crop_to_patches (`bool`, *optional*, defaults to `False`): Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py index ec721499917c..8409be51e034 100644 --- a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py +++ b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py @@ -42,7 +42,7 @@ ) -class PerceptionLMImageProcessorKwargs(ImagesKwargs): +class PerceptionLMImageProcessorKwargs(ImagesKwargs, total=False): r""" vision_input_type (`str`, *optional*, defaults to `"thumb+tile"`): Vision processing strategy. `"thumb+tile"` uses both thumbnails and multiple tiles for diff --git a/src/transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py b/src/transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py index 33b10915354f..6dcbf61f38f6 100644 --- a/src/transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +++ b/src/transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py @@ -35,7 +35,7 @@ logger = logging.get_logger(__name__) -class Phi4MultimodalImageProcessorKwargs(ImagesKwargs): +class Phi4MultimodalImageProcessorKwargs(ImagesKwargs, total=False): r""" patch_size (`int`, *optional*): The size of the patch. 
diff --git a/src/transformers/models/pix2struct/image_processing_pix2struct.py b/src/transformers/models/pix2struct/image_processing_pix2struct.py index e0c630369029..66cc7cd0b04a 100644 --- a/src/transformers/models/pix2struct/image_processing_pix2struct.py +++ b/src/transformers/models/pix2struct/image_processing_pix2struct.py @@ -49,7 +49,7 @@ DEFAULT_FONT_PATH = "ybelkada/fonts" -class Pix2StructImageProcessorKwargs(ImagesKwargs): +class Pix2StructImageProcessorKwargs(ImagesKwargs, total=False): """ max_patches (`int`, *optional*): Maximum number of patches to extract. diff --git a/src/transformers/models/pixtral/image_processing_pixtral.py b/src/transformers/models/pixtral/image_processing_pixtral.py index 864e76eb0e57..387c82f4e0a0 100644 --- a/src/transformers/models/pixtral/image_processing_pixtral.py +++ b/src/transformers/models/pixtral/image_processing_pixtral.py @@ -50,7 +50,7 @@ import PIL -class PixtralImageProcessorKwargs(ImagesKwargs): +class PixtralImageProcessorKwargs(ImagesKwargs, total=False): """ patch_size (`Union[dict[str, int], int]` *optional*, defaults to `{"height": 16, "width": 16}`): Size of the patches in the model, used to calculate the output image size. Can be overridden by `patch_size` in the `preprocess` method. diff --git a/src/transformers/models/poolformer/image_processing_poolformer.py b/src/transformers/models/poolformer/image_processing_poolformer.py index 7d03f8281285..c7bb38c4340c 100644 --- a/src/transformers/models/poolformer/image_processing_poolformer.py +++ b/src/transformers/models/poolformer/image_processing_poolformer.py @@ -48,7 +48,7 @@ logger = logging.get_logger(__name__) -class PoolFormerImageProcessorKwargs(ImagesKwargs): +class PoolFormerImageProcessorKwargs(ImagesKwargs, total=False): r""" crop_pct (`float`, *optional*, defaults to `self.crop_pct`): Percentage of the image to crop. Only has an effect if `do_resize` is set to `True`. 
diff --git a/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py b/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py index b8220a30fa42..cb8c0e37af5f 100644 --- a/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py +++ b/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py @@ -54,7 +54,7 @@ logger = logging.get_logger(__name__) -class PromptDepthAnythingImageProcessorKwargs(ImagesKwargs): +class PromptDepthAnythingImageProcessorKwargs(ImagesKwargs, total=False): r""" keep_aspect_ratio (`bool`, *optional*): If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. diff --git a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py index fb0ec1c89420..f522f63beb33 100644 --- a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py @@ -32,7 +32,7 @@ # Redefine kwargs for videos because Qwen-Omni uses some kwargs for processing omni # and does not use them in video processor class -class Qwen2_5_OmniVideosKwargs(VideosKwargs): +class Qwen2_5_OmniVideosKwargs(VideosKwargs, total=False): min_pixels: Optional[int] max_pixels: Optional[int] patch_size: Optional[int] diff --git a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py index fe218bd05b9d..82b7bce43fe9 100644 --- a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py @@ -52,7 +52,7 @@ logger = logging.get_logger(__name__) -class Qwen2VLImageProcessorKwargs(ImagesKwargs): +class Qwen2VLImageProcessorKwargs(ImagesKwargs, total=False): r""" min_pixels (`int`, *optional*, defaults to `56 * 56`): The min pixels of the image to resize 
the image. diff --git a/src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py b/src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py index 84bcd827f02e..2fc8bf1ac5b3 100644 --- a/src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py @@ -41,7 +41,7 @@ from .image_processing_qwen2_vl import smart_resize -class Qwen2VLVideoProcessorInitKwargs(VideosKwargs): +class Qwen2VLVideoProcessorInitKwargs(VideosKwargs, total=False): min_pixels: Optional[int] max_pixels: Optional[int] patch_size: Optional[int] diff --git a/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py index 70988b3b77c5..ba54b3a0c1f6 100644 --- a/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py @@ -34,7 +34,7 @@ # Redefine kwargs for videos because Qwen-Omni uses some kwargs for processing omni # and does not use them in video processor class -class Qwen3OmniMoeVideosKwargs(VideosKwargs): +class Qwen3OmniMoeVideosKwargs(VideosKwargs, total=False): min_pixels: Optional[int] max_pixels: Optional[int] patch_size: Optional[int] diff --git a/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py b/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py index c4648788c9dc..3bb06d7f2b08 100644 --- a/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py @@ -64,7 +64,7 @@ def smart_resize( return h_bar, w_bar -class Qwen3VLVideoProcessorInitKwargs(VideosKwargs): +class Qwen3VLVideoProcessorInitKwargs(VideosKwargs, total=False): patch_size: Optional[int] temporal_patch_size: Optional[int] merge_size: Optional[int] diff --git a/src/transformers/models/rt_detr/image_processing_rt_detr.py b/src/transformers/models/rt_detr/image_processing_rt_detr.py index 
b3c77a8920cd..5e91e3c4fc01 100644 --- a/src/transformers/models/rt_detr/image_processing_rt_detr.py +++ b/src/transformers/models/rt_detr/image_processing_rt_detr.py @@ -68,7 +68,7 @@ SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION,) -class RTDetrImageProcessorKwargs(ImagesKwargs): +class RTDetrImageProcessorKwargs(ImagesKwargs, total=False): r""" format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): Data format of the annotations. One of "coco_detection" or "coco_panoptic". diff --git a/src/transformers/models/sam/image_processing_sam.py b/src/transformers/models/sam/image_processing_sam.py index e9da260a6e9c..3ba27f63d993 100644 --- a/src/transformers/models/sam/image_processing_sam.py +++ b/src/transformers/models/sam/image_processing_sam.py @@ -58,7 +58,7 @@ logger = logging.get_logger(__name__) -class SamImageProcessorKwargs(ImagesKwargs): +class SamImageProcessorKwargs(ImagesKwargs, total=False): r""" mask_size (`dict[str, int]`, *optional*): The size `{"longest_edge": int}` to resize the segmentation maps to. 
diff --git a/src/transformers/models/sam/processing_sam.py b/src/transformers/models/sam/processing_sam.py index bc82daf2034d..18d812fc6825 100644 --- a/src/transformers/models/sam/processing_sam.py +++ b/src/transformers/models/sam/processing_sam.py @@ -31,7 +31,7 @@ import torch -class SamImagesKwargs(ImagesKwargs): +class SamImagesKwargs(ImagesKwargs, total=False): segmentation_maps: Optional[ImageInput] input_points: Optional[list[list[float]]] input_labels: Optional[list[list[int]]] diff --git a/src/transformers/models/sam2/image_processing_sam2_fast.py b/src/transformers/models/sam2/image_processing_sam2_fast.py index c468f6400d54..b00fba952973 100644 --- a/src/transformers/models/sam2/image_processing_sam2_fast.py +++ b/src/transformers/models/sam2/image_processing_sam2_fast.py @@ -43,7 +43,7 @@ from ...utils import TensorType, auto_docstring -class Sam2FastImageProcessorKwargs(ImagesKwargs): +class Sam2FastImageProcessorKwargs(ImagesKwargs, total=False): r""" mask_size (`dict[str, int]`, *optional*): The size `{"height": int, "width": int}` to resize the segmentation maps to. diff --git a/src/transformers/models/sam2/modular_sam2.py b/src/transformers/models/sam2/modular_sam2.py index 8fcfe36a759e..eac3cc232c1a 100644 --- a/src/transformers/models/sam2/modular_sam2.py +++ b/src/transformers/models/sam2/modular_sam2.py @@ -70,7 +70,7 @@ logger = logging.get_logger(__name__) -class Sam2FastImageProcessorKwargs(ImagesKwargs): +class Sam2FastImageProcessorKwargs(ImagesKwargs, total=False): r""" mask_size (`dict[str, int]`, *optional*): The size `{"height": int, "width": int}` to resize the segmentation maps to. 
diff --git a/src/transformers/models/sam_hq/processing_samhq.py b/src/transformers/models/sam_hq/processing_samhq.py index f2852b8623c4..8e1945971ebc 100644 --- a/src/transformers/models/sam_hq/processing_samhq.py +++ b/src/transformers/models/sam_hq/processing_samhq.py @@ -31,7 +31,7 @@ import torch -class SamHQImagesKwargs(ImagesKwargs): +class SamHQImagesKwargs(ImagesKwargs, total=False): segmentation_maps: Optional[ImageInput] input_points: Optional[list[list[float]]] input_labels: Optional[list[list[int]]] diff --git a/src/transformers/models/segformer/image_processing_segformer.py b/src/transformers/models/segformer/image_processing_segformer.py index ce9ace8115a4..bedb4ff54651 100644 --- a/src/transformers/models/segformer/image_processing_segformer.py +++ b/src/transformers/models/segformer/image_processing_segformer.py @@ -55,7 +55,7 @@ logger = logging.get_logger(__name__) -class SegformerImageProcessorKwargs(ImagesKwargs): +class SegformerImageProcessorKwargs(ImagesKwargs, total=False): r""" do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 diff --git a/src/transformers/models/siglip2/image_processing_siglip2.py b/src/transformers/models/siglip2/image_processing_siglip2.py index caff1bce0bc9..d71b66464223 100644 --- a/src/transformers/models/siglip2/image_processing_siglip2.py +++ b/src/transformers/models/siglip2/image_processing_siglip2.py @@ -48,7 +48,7 @@ from PIL import Image -class Siglip2ImageProcessorKwargs(ImagesKwargs): +class Siglip2ImageProcessorKwargs(ImagesKwargs, total=False): """ patch_size (`int`, *optional*, defaults to 16): The size (resolution) of each patch the image will be split to. 
diff --git a/src/transformers/models/smolvlm/image_processing_smolvlm.py b/src/transformers/models/smolvlm/image_processing_smolvlm.py index e231c1ec6b07..c12c08182a94 100644 --- a/src/transformers/models/smolvlm/image_processing_smolvlm.py +++ b/src/transformers/models/smolvlm/image_processing_smolvlm.py @@ -53,7 +53,7 @@ logger = logging.get_logger(__name__) -class SmolVLMImageProcessorKwargs(ImagesKwargs): +class SmolVLMImageProcessorKwargs(ImagesKwargs, total=False): """ do_image_splitting (`bool`, *optional*, defaults to `True`): Whether to split the image into sub-images concatenated with the original image. They are split into patches diff --git a/src/transformers/models/smolvlm/video_processing_smolvlm.py b/src/transformers/models/smolvlm/video_processing_smolvlm.py index ce73dfb4a82e..d8cecb6c0c5c 100644 --- a/src/transformers/models/smolvlm/video_processing_smolvlm.py +++ b/src/transformers/models/smolvlm/video_processing_smolvlm.py @@ -90,7 +90,7 @@ def get_resize_output_image_size( return height, width -class SmolVLMVideoProcessorInitKwargs(VideosKwargs): +class SmolVLMVideoProcessorInitKwargs(VideosKwargs, total=False): max_image_size: Optional[dict[str, int]] diff --git a/src/transformers/models/superpoint/image_processing_superpoint.py b/src/transformers/models/superpoint/image_processing_superpoint.py index 633d9b0b16b9..9c810c450ad7 100644 --- a/src/transformers/models/superpoint/image_processing_superpoint.py +++ b/src/transformers/models/superpoint/image_processing_superpoint.py @@ -46,7 +46,7 @@ logger = logging.get_logger(__name__) -class SuperPointImageProcessorKwargs(ImagesKwargs): +class SuperPointImageProcessorKwargs(ImagesKwargs, total=False): r""" do_grayscale (`bool`, *optional*, defaults to `True`): Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method. 
diff --git a/src/transformers/models/swin2sr/image_processing_swin2sr.py b/src/transformers/models/swin2sr/image_processing_swin2sr.py index 018a1bf0f4df..d9de6c684959 100644 --- a/src/transformers/models/swin2sr/image_processing_swin2sr.py +++ b/src/transformers/models/swin2sr/image_processing_swin2sr.py @@ -38,7 +38,7 @@ logger = logging.get_logger(__name__) -class Swin2SRImageProcessorKwargs(ImagesKwargs): +class Swin2SRImageProcessorKwargs(ImagesKwargs, total=False): size_divisor: Optional[int] diff --git a/src/transformers/models/textnet/image_processing_textnet.py b/src/transformers/models/textnet/image_processing_textnet.py index 1a4d68522205..e5e127d987e0 100644 --- a/src/transformers/models/textnet/image_processing_textnet.py +++ b/src/transformers/models/textnet/image_processing_textnet.py @@ -49,7 +49,7 @@ import PIL -class TextNetImageProcessorKwargs(ImagesKwargs): +class TextNetImageProcessorKwargs(ImagesKwargs, total=False): size_divisor: Optional[int] diff --git a/src/transformers/models/tvp/image_processing_tvp.py b/src/transformers/models/tvp/image_processing_tvp.py index 67c1ffe4fae8..42834287e110 100644 --- a/src/transformers/models/tvp/image_processing_tvp.py +++ b/src/transformers/models/tvp/image_processing_tvp.py @@ -50,7 +50,7 @@ logger = logging.get_logger(__name__) -class TvpImageProcessorKwargs(ImagesKwargs): +class TvpImageProcessorKwargs(ImagesKwargs, total=False): r""" do_flip_channel_order (`bool`, *optional*): Whether to flip the channel order of the image from RGB to BGR. 
diff --git a/src/transformers/models/vilt/image_processing_vilt.py b/src/transformers/models/vilt/image_processing_vilt.py index bb29e1d1ee30..fadb5302d4ee 100644 --- a/src/transformers/models/vilt/image_processing_vilt.py +++ b/src/transformers/models/vilt/image_processing_vilt.py @@ -47,7 +47,7 @@ logger = logging.get_logger(__name__) -class ViltImageProcessorKwargs(ImagesKwargs): +class ViltImageProcessorKwargs(ImagesKwargs, total=False): size_divisor: Optional[int] diff --git a/src/transformers/models/vitmatte/image_processing_vitmatte.py b/src/transformers/models/vitmatte/image_processing_vitmatte.py index 95933c053ce5..eb994b641962 100644 --- a/src/transformers/models/vitmatte/image_processing_vitmatte.py +++ b/src/transformers/models/vitmatte/image_processing_vitmatte.py @@ -41,7 +41,7 @@ logger = logging.get_logger(__name__) -class VitMatteImageProcessorKwargs(ImagesKwargs): +class VitMatteImageProcessorKwargs(ImagesKwargs, total=False): size_divisor: Optional[int] diff --git a/src/transformers/models/yolos/image_processing_yolos.py b/src/transformers/models/yolos/image_processing_yolos.py index 21aac76adac8..19fda87897ae 100644 --- a/src/transformers/models/yolos/image_processing_yolos.py +++ b/src/transformers/models/yolos/image_processing_yolos.py @@ -81,7 +81,7 @@ SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) -class YolosImageProcessorKwargs(ImagesKwargs): +class YolosImageProcessorKwargs(ImagesKwargs, total=False): r""" format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): Data format of the annotations. One of "coco_detection" or "coco_panoptic". 
diff --git a/src/transformers/models/zoedepth/image_processing_zoedepth.py b/src/transformers/models/zoedepth/image_processing_zoedepth.py index e8ad44dd76c3..3fdf414bc20c 100644 --- a/src/transformers/models/zoedepth/image_processing_zoedepth.py +++ b/src/transformers/models/zoedepth/image_processing_zoedepth.py @@ -62,7 +62,7 @@ logger = logging.get_logger(__name__) -class ZoeDepthImageProcessorKwargs(ImagesKwargs): +class ZoeDepthImageProcessorKwargs(ImagesKwargs, total=False): """ keep_aspect_ratio (`bool`, *optional*, defaults to `True`): If `True`, the image is resized by choosing the smaller of the height and width scaling factors and using it diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 17af2698ae56..0d49db5fa229 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -30,6 +30,7 @@ import numpy as np import typing_extensions +from huggingface_hub.dataclasses import validate_typed_dict from huggingface_hub.errors import EntryNotFoundError from .audio_utils import AudioInput, load_audio @@ -38,7 +39,6 @@ from .image_utils import ChannelDimension, ImageInput, is_vision_available from .utils.chat_template_utils import render_jinja_template from .utils.type_validators import ( - TypedDictAdapter, device_validator, image_size_validator, padding_validator, @@ -1398,9 +1398,9 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg typed_dict_obj = TypedDict( "merged_typed_dict", {**preprocessor_typed_dict_obj.__annotations__, **typed_dict_obj.__annotations__}, + total=False, ) - type_validator = TypedDictAdapter(typed_dict_obj) - type_validator.validate_fields(**output_kwargs[key]) + validate_typed_dict(typed_dict_obj, output_kwargs[key]) return output_kwargs @classmethod diff --git a/src/transformers/utils/type_validators.py b/src/transformers/utils/type_validators.py index 4b56f7842480..57c839151a76 100644 --- 
a/src/transformers/utils/type_validators.py +++ b/src/transformers/utils/type_validators.py @@ -2,7 +2,7 @@ from dataclasses import MISSING, field, make_dataclass from typing import Annotated, ForwardRef, Optional, TypedDict, Union, get_args, get_origin -from huggingface_hub.dataclasses import as_validated_field, strict +from huggingface_hub.dataclasses import strict from ..tokenization_utils_base import PaddingStrategy, TruncationStrategy from ..video_utils import VideoMetadataType @@ -100,19 +100,16 @@ class TokenizerKwargs(TypedDict, total=False): return make_dataclass(self.type.__name__ + "Config", fields) -@as_validated_field def positive_any_number(value: Optional[Union[int, float]] = None): if value is not None and (not isinstance(value, (int, float)) or not value >= 0): raise ValueError(f"Value must be a positive integer or floating number, got {value}") -@as_validated_field def positive_int(value: Optional[int] = None): if value is not None and (not isinstance(value, int) or not value >= 0): raise ValueError(f"Value must be a positive integer, got {value}") -@as_validated_field def padding_validator(value: Optional[Union[bool, str, PaddingStrategy]] = None): possible_names = ["longest", "max_length", "do_not_pad"] if value is None: @@ -123,7 +120,6 @@ def padding_validator(value: Optional[Union[bool, str, PaddingStrategy]] = None) raise ValueError(f"If padding is a string, the value must be one of {possible_names}") -@as_validated_field def truncation_validator(value: Optional[Union[bool, str, TruncationStrategy]] = None): possible_names = ["only_first", "only_second", "longest_first", "do_not_truncate"] if value is None: @@ -134,7 +130,6 @@ def truncation_validator(value: Optional[Union[bool, str, TruncationStrategy]] = raise ValueError(f"If truncation is a string, value must be one of {possible_names}") -@as_validated_field def image_size_validator(value: Optional[Union[int, Sequence[int], dict[str, int]]] = None): possible_keys = ["height", "width", 
"longest_edge", "shortest_edge", "max_height", "max_width"] if value is None: @@ -143,7 +138,6 @@ def image_size_validator(value: Optional[Union[int, Sequence[int], dict[str, int raise ValueError(f"Value for size must be a dict with keys {possible_keys} but got size={value}") -@as_validated_field def device_validator(value: Optional[Union[str, int]] = None): possible_names = ["cpu", "cuda", "xla", "xpu", "mps", "meta"] if value is None: @@ -160,7 +154,6 @@ def device_validator(value: Optional[Union[str, int]] = None): ) -@as_validated_field def resampling_validator(value: Optional[Union[int, "PILImageResampling"]] = None): if value is None: pass @@ -172,7 +165,6 @@ def resampling_validator(value: Optional[Union[int, "PILImageResampling"]] = Non raise ValueError(f"The resampling should an integer or `PIL.Image.Resampling`, but got resampling={value}") -@as_validated_field def video_metadata_validator(value: Optional[VideoMetadataType] = None): if value is None: return @@ -204,7 +196,6 @@ def check_dict_keys(d: dict) -> bool: ) -@as_validated_field def tensor_type_validator(value: Optional[Union[str, TensorType]] = None): possible_names = ["pt", "np", "mlx"] if value is None: diff --git a/src/transformers/video_processing_utils.py b/src/transformers/video_processing_utils.py index e5f842611a7b..4283c163c574 100644 --- a/src/transformers/video_processing_utils.py +++ b/src/transformers/video_processing_utils.py @@ -21,6 +21,7 @@ from typing import Any, Callable, Optional, Union import numpy as np +from huggingface_hub.dataclasses import validate_typed_dict from .dynamic_module_utils import custom_object_save from .image_processing_utils import ( @@ -51,7 +52,6 @@ ) from .utils.hub import cached_file from .utils.import_utils import requires -from .utils.type_validators import TypedDictAdapter from .video_utils import ( VideoInput, VideoMetadata, @@ -361,8 +361,7 @@ def preprocess( ) # Perform type validation on received kwargs - type_validator = 
TypedDictAdapter(self.valid_kwargs) - type_validator.validate_fields(**kwargs) + validate_typed_dict(self.valid_kwargs, kwargs) # Set default kwargs from self. This ensures that if a kwarg is not provided # by the user, it gets its default value from the instance, or is set to None. From 7a4e79f59d91b9aa95d692a5d491cfc601042ad4 Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 8 Oct 2025 13:57:20 +0200 Subject: [PATCH 25/28] delete typed dict adaptor --- src/transformers/utils/type_validators.py | 91 +---------------------- 1 file changed, 1 insertion(+), 90 deletions(-) diff --git a/src/transformers/utils/type_validators.py b/src/transformers/utils/type_validators.py index 57c839151a76..6e6ccdc4c8e9 100644 --- a/src/transformers/utils/type_validators.py +++ b/src/transformers/utils/type_validators.py @@ -1,8 +1,5 @@ from collections.abc import Sequence -from dataclasses import MISSING, field, make_dataclass -from typing import Annotated, ForwardRef, Optional, TypedDict, Union, get_args, get_origin - -from huggingface_hub.dataclasses import strict +from typing import Optional, Union from ..tokenization_utils_base import PaddingStrategy, TruncationStrategy from ..video_utils import VideoMetadataType @@ -14,92 +11,6 @@ from ..image_utils import PILImageResampling -def unpack_annotated_type(type): - if get_origin(type) is Annotated: - base, *meta = get_args(type) - return base, meta[0] - return type, field(default=MISSING) - - -def get_type_hints_from_typed_dict(obj: type[TypedDict]): - """ - Same as `typing.get_type_hints` but does not perform evaluation - on the ForwardRefs. Evaluating might fails if the package is not imported - or installed, therefore we will have our own "guarded" type validations. 
- All `ForwardRef` will be ignored by the hub validator - """ - raw_annots = obj.__dict__.get("__annotations__", {}) - type_hints = {} - for name, value in raw_annots.items(): - if value is None: - value = type(None) - if isinstance(value, str): - value = ForwardRef(value, is_argument=False) - type_hints[name] = value - return type_hints - - -# Minimalistic version of pydantic.TypeAdapter tailored for `TypedDict` -class TypedDictAdapter: - """ - A utility class used to convert a TypedDict object to dataclass and attach - a hub validator on top based on TypedDict annotations. - - We don't want to replace `TypedDict` by dataclasses in the codebase because - with dataclasses we will lose typing hints that `Unpack[TypedDict]` gives. - So this utility is a sweet spot to keep the balance between DevX and strong - typing`validation. - - Args: - type: The TypedDict object that needs to be validated. - """ - - def __init__( - self, - type: type[TypedDict], - ): - self.type = type - self.dataclass = self.create_dataclass() - self.dataclass = strict(self.dataclass) - - def validate_fields(self, **kwargs): - # If not all kwargs are set, dataclass raises an error in python <= 3.9 - # In newer python we can bypass by creating a dataclass with `kw_only=True` - for field in self.fields: - if field[0] not in kwargs: - kwargs[field[0]] = None - self.dataclass(**kwargs) - - def create_dataclass(self): - """ - Creates a dataclass object dynamically from `TypedDict`, so that - we can use strict type validation from typing hints with `TypedDict`. 
- - Example: - - @as_validated_field - def padding_validator(value: Union[bool, str, PaddingStrategy] = None): - if value is None: - return - if not isinstance(value, (bool, str, PaddingStrategy)): - raise ValueError(f"Value must be one of '[bool, string, PaddingStrategy]'") - if isinstance(value, str) and value not in ["longest", "max_length", "do_not_pad"]: - raise ValueError(f'Value for padding must be one of `["longest", "max_length", "do_not_pad"]`') - - class TokenizerKwargs(TypedDict, total=False): - text: str - padding: Annotated[Union[bool, str, PaddingStrategy], padding_validator()] - - # Now we can create a dataclass and warp it with hub validators for type constraints - # The dataclass can also be used as a simple config class for easier kwarg management - dataclass = dataclass_from_typed_dict(TokenizerKwargs) - """ - hints = get_type_hints_from_typed_dict(self.type) - fields = [(k, *unpack_annotated_type(v)) for k, v in hints.items()] - self.fields = fields - return make_dataclass(self.type.__name__ + "Config", fields) - - def positive_any_number(value: Optional[Union[int, float]] = None): if value is not None and (not isinstance(value, (int, float)) or not value >= 0): raise ValueError(f"Value must be a positive integer or floating number, got {value}") From 0395b54b2ed57df75b96888364b93a583ac92fe2 Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 8 Oct 2025 14:04:26 +0200 Subject: [PATCH 26/28] address comments --- src/transformers/models/kosmos2/processing_kosmos2.py | 2 +- tests/models/vitmatte/test_image_processing_vitmatte.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/kosmos2/processing_kosmos2.py b/src/transformers/models/kosmos2/processing_kosmos2.py index c839bbd6cfb4..2bc653ab3276 100644 --- a/src/transformers/models/kosmos2/processing_kosmos2.py +++ b/src/transformers/models/kosmos2/processing_kosmos2.py @@ -37,7 +37,7 @@ class Kosmos2ImagesKwargs(ImagesKwargs, total=False): - bboxes: 
Optional[NestedList] + bboxes: Optional[NestedList] # NOTE: hub validators can't accept `Sequence` num_image_tokens: Optional[int] first_image_token_id: Optional[int] diff --git a/tests/models/vitmatte/test_image_processing_vitmatte.py b/tests/models/vitmatte/test_image_processing_vitmatte.py index a17968a1d567..b100fb3c30b6 100644 --- a/tests/models/vitmatte/test_image_processing_vitmatte.py +++ b/tests/models/vitmatte/test_image_processing_vitmatte.py @@ -261,7 +261,7 @@ def test_image_processor_preprocess_arguments(self): trimap = np.random.randint(0, 3, size=image.size[::-1]) # Type validation will fail for fast processors only (for now) - if i == 1: + if image_processing_class.__name__.endswith("Fast"): with self.assertRaises(TypeError): image_processor(image, trimaps=trimap, extra_argument=True) else: From 34c9ec71945965a6e18279c2669f8681daed2945 Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 8 Oct 2025 15:44:59 +0200 Subject: [PATCH 27/28] delete optionals --- src/transformers/models/aria/modular_aria.py | 6 +-- .../models/beit/image_processing_beit.py | 2 +- .../image_processing_bridgetower.py | 2 +- .../cohere2_vision/modular_cohere2_vision.py | 6 +-- .../image_processing_conditional_detr.py | 6 +-- .../convnext/image_processing_convnext.py | 2 +- .../image_processing_deepseek_vl.py | 2 +- .../modular_deepseek_vl_hybrid.py | 10 ++--- .../image_processing_deformable_detr.py | 6 +-- .../models/detr/image_processing_detr.py | 6 +-- .../models/donut/image_processing_donut.py | 4 +- .../models/dpt/image_processing_dpt.py | 8 ++-- .../image_processing_efficientloftr.py | 2 +- .../image_processing_efficientnet.py | 4 +- .../models/emu3/image_processing_emu3.py | 4 +- .../models/eomt/image_processing_eomt.py | 2 +- .../models/flava/image_processing_flava.py | 38 +++++++++---------- .../models/gemma3/image_processing_gemma3.py | 8 ++-- .../models/glm4v/image_processing_glm4v.py | 6 +-- .../models/glm4v/video_processing_glm4v.py | 10 ++--- 
.../got_ocr2/image_processing_got_ocr2.py | 6 +-- .../models/got_ocr2/processing_got_ocr2.py | 10 ++--- .../image_processing_grounding_dino.py | 6 +-- .../idefics/image_processing_idefics.py | 2 +- .../idefics2/image_processing_idefics2.py | 2 +- .../idefics3/image_processing_idefics3.py | 6 +-- .../imagegpt/image_processing_imagegpt.py | 2 +- .../video_processing_instructblipvideo.py | 8 ---- .../internvl/video_processing_internvl.py | 2 +- .../models/janus/modular_janus.py | 2 +- .../models/kosmos2/processing_kosmos2.py | 4 +- .../kosmos2_5/image_processing_kosmos2_5.py | 4 +- .../layoutlmv2/image_processing_layoutlmv2.py | 2 +- .../layoutlmv3/image_processing_layoutlmv3.py | 2 +- .../lfm2_vl/image_processing_lfm2_vl_fast.py | 26 ++++++------- .../models/lfm2_vl/processing_lfm2_vl.py | 19 ---------- .../llama4/image_processing_llama4_fast.py | 4 +- .../llava_next/image_processing_llava_next.py | 2 +- .../image_processing_llava_onevision.py | 2 +- .../image_processing_mask2former.py | 4 +- .../maskformer/image_processing_maskformer.py | 4 +- .../models/mllama/image_processing_mllama.py | 2 +- .../image_processing_mobilenet_v2.py | 2 +- .../mobilevit/image_processing_mobilevit.py | 4 +- .../models/nougat/image_processing_nougat.py | 6 +-- .../oneformer/image_processing_oneformer.py | 2 +- .../models/ovis2/image_processing_ovis2.py | 8 ++-- .../image_processing_perception_lm_fast.py | 4 +- .../image_processing_phi4_multimodal_fast.py | 4 +- .../pix2struct/image_processing_pix2struct.py | 2 +- .../pixtral/image_processing_pixtral.py | 2 +- .../poolformer/image_processing_poolformer.py | 2 +- .../image_processing_prompt_depth_anything.py | 8 ++-- .../qwen2_5_omni/processing_qwen2_5_omni.py | 20 +++++----- .../qwen2_vl/image_processing_qwen2_vl.py | 10 ++--- .../qwen2_vl/video_processing_qwen2_vl.py | 14 +++---- .../processing_qwen3_omni_moe.py | 22 +++++------ .../qwen3_vl/video_processing_qwen3_vl.py | 10 ++--- .../rt_detr/image_processing_rt_detr.py | 6 +-- 
.../models/sam/image_processing_sam.py | 4 +- src/transformers/models/sam/processing_sam.py | 6 +-- .../models/sam2/image_processing_sam2_fast.py | 2 +- src/transformers/models/sam2/modular_sam2.py | 2 +- .../models/sam_hq/processing_samhq.py | 6 +-- .../segformer/image_processing_segformer.py | 2 +- .../siglip2/image_processing_siglip2.py | 4 +- .../smolvlm/image_processing_smolvlm.py | 6 +-- .../smolvlm/video_processing_smolvlm.py | 2 +- .../superpoint/image_processing_superpoint.py | 2 +- .../swin2sr/image_processing_swin2sr.py | 2 +- .../textnet/image_processing_textnet.py | 2 +- .../models/tvp/image_processing_tvp.py | 2 +- .../models/vilt/image_processing_vilt.py | 2 +- .../vitmatte/image_processing_vitmatte.py | 2 +- .../models/yolos/image_processing_yolos.py | 6 +-- .../zoedepth/image_processing_zoedepth.py | 4 +- src/transformers/processing_utils.py | 1 - 77 files changed, 205 insertions(+), 233 deletions(-) diff --git a/src/transformers/models/aria/modular_aria.py b/src/transformers/models/aria/modular_aria.py index a4dda1eee8f4..46e35911c1f1 100644 --- a/src/transformers/models/aria/modular_aria.py +++ b/src/transformers/models/aria/modular_aria.py @@ -905,9 +905,9 @@ def get_number_of_image_patches(self, height: int, width: int, images_kwargs=Non class AriaImagesKwargs(ImagesKwargs, total=False): - split_image: Optional[bool] - max_image_size: Optional[int] - min_image_size: Optional[int] + split_image: bool + max_image_size: int + min_image_size: int class AriaProcessorKwargs(ProcessingKwargs, total=False): diff --git a/src/transformers/models/beit/image_processing_beit.py b/src/transformers/models/beit/image_processing_beit.py index 9025e2868df8..884619f12b13 100644 --- a/src/transformers/models/beit/image_processing_beit.py +++ b/src/transformers/models/beit/image_processing_beit.py @@ -63,7 +63,7 @@ class BeitImageProcessorKwargs(ImagesKwargs, total=False): ADE20k). The background label will be replaced by 255. 
""" - do_reduce_labels: Optional[bool] + do_reduce_labels: bool @requires(backends=("vision",)) diff --git a/src/transformers/models/bridgetower/image_processing_bridgetower.py b/src/transformers/models/bridgetower/image_processing_bridgetower.py index c44cf6a7ee3d..73bfc7407666 100644 --- a/src/transformers/models/bridgetower/image_processing_bridgetower.py +++ b/src/transformers/models/bridgetower/image_processing_bridgetower.py @@ -124,7 +124,7 @@ def get_resize_output_image_size( class BridgeTowerImageProcessorKwargs(ImagesKwargs, total=False): - size_divisor: Optional[int] + size_divisor: int class BridgeTowerImageProcessor(BaseImageProcessor): diff --git a/src/transformers/models/cohere2_vision/modular_cohere2_vision.py b/src/transformers/models/cohere2_vision/modular_cohere2_vision.py index 5df3e075ed20..b801c24575ca 100644 --- a/src/transformers/models/cohere2_vision/modular_cohere2_vision.py +++ b/src/transformers/models/cohere2_vision/modular_cohere2_vision.py @@ -316,9 +316,9 @@ class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs, total=False): set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method. """ - crop_to_patches: Optional[bool] - min_patches: Optional[int] - max_patches: Optional[int] + crop_to_patches: bool + min_patches: int + max_patches: int @auto_docstring diff --git a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py index efc532d413c4..3f639e0c1ae3 100644 --- a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py +++ b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py @@ -745,9 +745,9 @@ class ConditionalDetrImageProcessorKwargs(ImagesKwargs, total=False): Path to the directory containing the segmentation masks. 
""" - format: Optional[Union[str, AnnotationFormat]] - do_convert_annotations: Optional[bool] - return_segmentation_masks: Optional[bool] + format: Union[str, AnnotationFormat] + do_convert_annotations: bool + return_segmentation_masks: bool annotations: Optional[Union[AnnotationType, list[AnnotationType]]] masks_path: Optional[Union[str, pathlib.Path]] diff --git a/src/transformers/models/convnext/image_processing_convnext.py b/src/transformers/models/convnext/image_processing_convnext.py index 95e9cd91bd4a..c4e279346f3c 100644 --- a/src/transformers/models/convnext/image_processing_convnext.py +++ b/src/transformers/models/convnext/image_processing_convnext.py @@ -57,7 +57,7 @@ class ConvNextImageProcessorKwargs(ImagesKwargs, total=False): overridden by `crop_pct` in the`preprocess` method. """ - crop_pct: Optional[float] + crop_pct: float @requires(backends=("vision",)) diff --git a/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py b/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py index f3f4c1ac6e34..763182de4039 100644 --- a/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py +++ b/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py @@ -56,7 +56,7 @@ class DeepseekVLImageProcessorKwargs(ImagesKwargs, total=False): falls below this value after resizing. """ - min_size: Optional[int] + min_size: int class DeepseekVLImageProcessor(BaseImageProcessor): diff --git a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py index 343ba01ac874..43af7d43dfb3 100644 --- a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py @@ -448,11 +448,11 @@ class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs, total=False): number of channels in the image. 
Can be overridden by the `high_res_image_std` parameter in the `preprocess` method. """ - min_size: Optional[int] - high_res_size: Optional[dict] - high_res_resample: Optional[Union["PILImageResampling", int]] - high_res_image_mean: Optional[Union[float, list[float], tuple[float, ...]]] - high_res_image_std: Optional[Union[float, list[float], tuple[float, ...]]] + min_size: int + high_res_size: dict + high_res_resample: Union["PILImageResampling", int] + high_res_image_mean: Union[float, list[float], tuple[float, ...]] + high_res_image_std: Union[float, list[float], tuple[float, ...]] class DeepseekVLHybridImageProcessor(DeepseekVLImageProcessor): diff --git a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py index 9fd63b6340c6..83587f45c295 100644 --- a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py +++ b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py @@ -98,9 +98,9 @@ class DeformableDetrImageProcessorKwargs(ImagesKwargs, total=False): Path to the directory containing the segmentation masks. """ - format: Optional[Union[str, AnnotationFormat]] - do_convert_annotations: Optional[bool] - return_segmentation_masks: Optional[bool] + format: Union[str, AnnotationFormat] + do_convert_annotations: bool + return_segmentation_masks: bool annotations: Optional[Union[AnnotationType, list[AnnotationType]]] masks_path: Optional[Union[str, pathlib.Path]] diff --git a/src/transformers/models/detr/image_processing_detr.py b/src/transformers/models/detr/image_processing_detr.py index 1cd636a0cb72..2f149b662ec2 100644 --- a/src/transformers/models/detr/image_processing_detr.py +++ b/src/transformers/models/detr/image_processing_detr.py @@ -100,9 +100,9 @@ class DetrImageProcessorKwargs(ImagesKwargs, total=False): Path to the directory containing the segmentation masks. 
""" - format: Optional[Union[str, AnnotationFormat]] - do_convert_annotations: Optional[bool] - return_segmentation_masks: Optional[bool] + format: Union[str, AnnotationFormat] + do_convert_annotations: bool + return_segmentation_masks: bool annotations: Optional[Union[AnnotationType, list[AnnotationType]]] masks_path: Optional[Union[str, pathlib.Path]] diff --git a/src/transformers/models/donut/image_processing_donut.py b/src/transformers/models/donut/image_processing_donut.py index 802b05c776fd..0f74ac62ec92 100644 --- a/src/transformers/models/donut/image_processing_donut.py +++ b/src/transformers/models/donut/image_processing_donut.py @@ -60,8 +60,8 @@ class DonutImageProcessorKwargs(ImagesKwargs, total=False): Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees. """ - do_thumbnail: Optional[bool] - do_align_long_axis: Optional[bool] + do_thumbnail: bool + do_align_long_axis: bool @requires(backends=("vision",)) diff --git a/src/transformers/models/dpt/image_processing_dpt.py b/src/transformers/models/dpt/image_processing_dpt.py index 71e930a7bfcf..6246b1f3f7c0 100644 --- a/src/transformers/models/dpt/image_processing_dpt.py +++ b/src/transformers/models/dpt/image_processing_dpt.py @@ -78,10 +78,10 @@ class DPTImageProcessorKwargs(ImagesKwargs, total=False): ADE20k). The background label will be replaced by 255. 
""" - ensure_multiple_of: Optional[int] - size_divisor: Optional[int] - keep_aspect_ratio: Optional[bool] - do_reduce_labels: Optional[bool] + ensure_multiple_of: int + size_divisor: int + keep_aspect_ratio: bool + do_reduce_labels: bool def get_resize_output_image_size( diff --git a/src/transformers/models/efficientloftr/image_processing_efficientloftr.py b/src/transformers/models/efficientloftr/image_processing_efficientloftr.py index a5c06be89e98..acf9105fe77a 100644 --- a/src/transformers/models/efficientloftr/image_processing_efficientloftr.py +++ b/src/transformers/models/efficientloftr/image_processing_efficientloftr.py @@ -56,7 +56,7 @@ class EfficientLoFTRImageProcessorKwargs(ImagesKwargs, total=False): Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method. """ - do_grayscale: Optional[bool] = True + do_grayscale: bool # Copied from transformers.models.superpoint.image_processing_superpoint.is_grayscale diff --git a/src/transformers/models/efficientnet/image_processing_efficientnet.py b/src/transformers/models/efficientnet/image_processing_efficientnet.py index 0a3a7542ff67..2a5b5c93749b 100644 --- a/src/transformers/models/efficientnet/image_processing_efficientnet.py +++ b/src/transformers/models/efficientnet/image_processing_efficientnet.py @@ -52,8 +52,8 @@ class EfficientNetImageProcessorKwargs(ImagesKwargs, total=False): Normalize the image again with the standard deviation only for image classification if set to True. 
""" - rescale_offset: Optional[bool] - include_top: Optional[bool] + rescale_offset: bool + include_top: bool class EfficientNetImageProcessor(BaseImageProcessor): diff --git a/src/transformers/models/emu3/image_processing_emu3.py b/src/transformers/models/emu3/image_processing_emu3.py index ec270fef2c87..0c550937581f 100644 --- a/src/transformers/models/emu3/image_processing_emu3.py +++ b/src/transformers/models/emu3/image_processing_emu3.py @@ -48,8 +48,8 @@ class Emu3ImageProcessorKwargs(ImagesKwargs, total=False): - ratio: Optional[str] - image_area: Optional[int] + ratio: str + image_area: int def smart_resize( diff --git a/src/transformers/models/eomt/image_processing_eomt.py b/src/transformers/models/eomt/image_processing_eomt.py index b422ca3020d9..3459911cde1f 100644 --- a/src/transformers/models/eomt/image_processing_eomt.py +++ b/src/transformers/models/eomt/image_processing_eomt.py @@ -66,7 +66,7 @@ class EomtImageProcessorKwargs(ImagesKwargs, total=False): denoted with 0 (background) will be replaced with `ignore_index`. 
""" - do_split_image: Optional[bool] + do_split_image: bool ignore_index: Optional[int] diff --git a/src/transformers/models/flava/image_processing_flava.py b/src/transformers/models/flava/image_processing_flava.py index 2005011dcb2b..b62717ae2cd6 100644 --- a/src/transformers/models/flava/image_processing_flava.py +++ b/src/transformers/models/flava/image_processing_flava.py @@ -118,26 +118,26 @@ class FlavaImageProcessorKwargs(ImagesKwargs, total=False): """ # Mask related params - return_image_mask: Optional[bool] - input_size_patches: Optional[int] - total_mask_patches: Optional[int] - mask_group_min_patches: Optional[int] - mask_group_max_patches: Optional[int] - mask_group_min_aspect_ratio: Optional[float] - mask_group_max_aspect_ratio: Optional[float] + return_image_mask: bool + input_size_patches: int + total_mask_patches: int + mask_group_min_patches: int + mask_group_max_patches: int + mask_group_min_aspect_ratio: float + mask_group_max_aspect_ratio: float # Codebook related params - return_codebook_pixels: Optional[bool] - codebook_do_resize: Optional[bool] - codebook_size: Optional[bool] - codebook_resample: Optional[int] - codebook_do_center_crop: Optional[bool] - codebook_crop_size: Optional[int] - codebook_do_rescale: Optional[bool] - codebook_rescale_factor: Optional[Union[int, float]] - codebook_do_map_pixels: Optional[bool] - codebook_do_normalize: Optional[bool] - codebook_image_mean: Optional[Union[float, Iterable[float]]] - codebook_image_std: Optional[Union[float, Iterable[float]]] + return_codebook_pixels: bool + codebook_do_resize: bool + codebook_size: bool + codebook_resample: int + codebook_do_center_crop: bool + codebook_crop_size: int + codebook_do_rescale: bool + codebook_rescale_factor: Union[int, float] + codebook_do_map_pixels: bool + codebook_do_normalize: bool + codebook_image_mean: Union[float, Iterable[float]] + codebook_image_std: Union[float, Iterable[float]] # Inspired from 
https://github.com/microsoft/unilm/blob/master/beit/masking_generator.py diff --git a/src/transformers/models/gemma3/image_processing_gemma3.py b/src/transformers/models/gemma3/image_processing_gemma3.py index 4b32a1f31a05..d4bd7a00000e 100644 --- a/src/transformers/models/gemma3/image_processing_gemma3.py +++ b/src/transformers/models/gemma3/image_processing_gemma3.py @@ -63,10 +63,10 @@ class Gemma3ImageProcessorKwargs(ImagesKwargs, total=False): Minimum aspect ratio to activate pan and scan. """ - do_pan_and_scan: Optional[bool] - pan_and_scan_min_crop_size: Optional[int] - pan_and_scan_max_num_crops: Optional[int] - pan_and_scan_min_ratio_to_activate: Optional[float] + do_pan_and_scan: bool + pan_and_scan_min_crop_size: int + pan_and_scan_max_num_crops: int + pan_and_scan_min_ratio_to_activate: float class Gemma3ImageProcessor(BaseImageProcessor): diff --git a/src/transformers/models/glm4v/image_processing_glm4v.py b/src/transformers/models/glm4v/image_processing_glm4v.py index e58d295ca465..9a4348010750 100644 --- a/src/transformers/models/glm4v/image_processing_glm4v.py +++ b/src/transformers/models/glm4v/image_processing_glm4v.py @@ -57,9 +57,9 @@ class Glm4vImageProcessorKwargs(ImagesKwargs, total=False): The merge size of the vision encoder to llm encoder. 
""" - patch_size: Optional[int] - temporal_patch_size: Optional[int] - merge_size: Optional[int] + patch_size: int + temporal_patch_size: int + merge_size: int def smart_resize( diff --git a/src/transformers/models/glm4v/video_processing_glm4v.py b/src/transformers/models/glm4v/video_processing_glm4v.py index 95ea8160c606..f27adfc7e25e 100644 --- a/src/transformers/models/glm4v/video_processing_glm4v.py +++ b/src/transformers/models/glm4v/video_processing_glm4v.py @@ -37,11 +37,11 @@ class Glm4vVideoProcessorInitKwargs(VideosKwargs, total=False): - max_image_size: Optional[dict[str, int]] - patch_size: Optional[int] - temporal_patch_size: Optional[int] - merge_size: Optional[int] - max_duration: Optional[int] + max_image_size: dict[str, int] + patch_size: int + temporal_patch_size: int + merge_size: int + max_duration: int @add_start_docstrings( diff --git a/src/transformers/models/got_ocr2/image_processing_got_ocr2.py b/src/transformers/models/got_ocr2/image_processing_got_ocr2.py index 5882f22b1bd0..3fd5f7d512c1 100644 --- a/src/transformers/models/got_ocr2/image_processing_got_ocr2.py +++ b/src/transformers/models/got_ocr2/image_processing_got_ocr2.py @@ -62,9 +62,9 @@ class GotOcr2ImageProcessorKwargs(ImagesKwargs, total=False): set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method. 
""" - crop_to_patches: Optional[bool] - min_patches: Optional[int] - max_patches: Optional[int] + crop_to_patches: bool + min_patches: int + max_patches: int # Similar to image_processing_mllama.get_all_supported_aspect_ratios diff --git a/src/transformers/models/got_ocr2/processing_got_ocr2.py b/src/transformers/models/got_ocr2/processing_got_ocr2.py index 447122e18c22..1843b7f28830 100644 --- a/src/transformers/models/got_ocr2/processing_got_ocr2.py +++ b/src/transformers/models/got_ocr2/processing_got_ocr2.py @@ -36,13 +36,13 @@ class GotOcr2TextKwargs(TextKwargs, total=False): class GotOcr2ImagesKwargs(ImagesKwargs, total=False): - crop_to_patches: Optional[bool] - min_patches: Optional[int] - max_patches: Optional[int] + crop_to_patches: bool + min_patches: int + max_patches: int box: Optional[Union[list, tuple[float, float], tuple[float, float, float, float]]] color: Optional[str] - num_image_tokens: Optional[int] - multi_page: Optional[bool] + num_image_tokens: int + multi_page: bool class GotOcr2ProcessorKwargs(ProcessingKwargs, total=False): diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py index f556b2d295c3..eb21ea3b376e 100644 --- a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py @@ -109,9 +109,9 @@ class GroundingDinoImageProcessorKwargs(ImagesKwargs, total=False): Path to the directory containing the segmentation masks. 
""" - format: Optional[Union[str, AnnotationFormat]] - do_convert_annotations: Optional[bool] - return_segmentation_masks: Optional[bool] + format: Union[str, AnnotationFormat] + do_convert_annotations: bool + return_segmentation_masks: bool annotations: Optional[Union[AnnotationType, list[AnnotationType]]] masks_path: Optional[Union[str, pathlib.Path]] diff --git a/src/transformers/models/idefics/image_processing_idefics.py b/src/transformers/models/idefics/image_processing_idefics.py index 6a53e4d9b7d4..870c741b826d 100644 --- a/src/transformers/models/idefics/image_processing_idefics.py +++ b/src/transformers/models/idefics/image_processing_idefics.py @@ -47,7 +47,7 @@ class IdeficsImageProcessorKwargs(ImagesKwargs, total=False): """ transform: Optional[Callable] - image_size: Optional[dict[str, int]] + image_size: dict[str, int] def convert_to_rgb(image): diff --git a/src/transformers/models/idefics2/image_processing_idefics2.py b/src/transformers/models/idefics2/image_processing_idefics2.py index 1baec594bc90..e068ac42f403 100644 --- a/src/transformers/models/idefics2/image_processing_idefics2.py +++ b/src/transformers/models/idefics2/image_processing_idefics2.py @@ -53,7 +53,7 @@ class Idefics2ImageProcessorKwargs(ImagesKwargs, total=False): Whether to split the image into a sequence 4 equal sub-images concatenated with the original image. """ - do_image_splitting: Optional[bool] + do_image_splitting: bool def get_resize_output_image_size(image, size, input_data_format) -> tuple[int, int]: diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py index d53a75596fea..65e17ef4b776 100644 --- a/src/transformers/models/idefics3/image_processing_idefics3.py +++ b/src/transformers/models/idefics3/image_processing_idefics3.py @@ -59,9 +59,9 @@ class Idefics3ImageProcessorKwargs(ImagesKwargs, total=False): Whether to return the row and column information of the images. 
""" - do_image_splitting: Optional[bool] - max_image_size: Optional[dict[str, int]] - return_row_col_info: Optional[bool] + do_image_splitting: bool + max_image_size: dict[str, int] + return_row_col_info: bool def _resize_output_size_rescale_to_max_len( diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt.py b/src/transformers/models/imagegpt/image_processing_imagegpt.py index f5a1682d9be3..ab7057f7d407 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt.py @@ -56,7 +56,7 @@ class ImageGPTImageProcessorKwargs(ImagesKwargs, total=False): """ clusters: Optional[Union[np.ndarray, list[list[int]], "torch.Tensor"]] - do_color_quantize: Optional[bool] + do_color_quantize: bool def squared_euclidean_distance(a, b): diff --git a/src/transformers/models/instructblipvideo/video_processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/video_processing_instructblipvideo.py index d89d9069d495..f2c49925ef19 100644 --- a/src/transformers/models/instructblipvideo/video_processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/video_processing_instructblipvideo.py @@ -24,15 +24,11 @@ from ...image_processing_utils import BatchFeature from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling, SizeDict -from ...processing_utils import Unpack, VideosKwargs from ...utils import TensorType from ...video_processing_utils import BaseVideoProcessor from ...video_utils import group_videos_by_shape, reorder_videos -class InstructBlipVideoVideoProcessorInitKwargs(VideosKwargs, total=False): ... 
- - class InstructBlipVideoVideoProcessor(BaseVideoProcessor): resample = PILImageResampling.BICUBIC image_mean = OPENAI_CLIP_MEAN @@ -44,12 +40,8 @@ class InstructBlipVideoVideoProcessor(BaseVideoProcessor): do_normalize = True do_convert_rgb = True do_sample_frames = False # Set to False for BC, recommended to set `True` in new models - valid_kwargs = InstructBlipVideoVideoProcessorInitKwargs model_input_names = ["pixel_values"] - def __init__(self, **kwargs: Unpack[InstructBlipVideoVideoProcessorInitKwargs]): - super().__init__(**kwargs) - def _preprocess( self, videos: list["torch.Tensor"], diff --git a/src/transformers/models/internvl/video_processing_internvl.py b/src/transformers/models/internvl/video_processing_internvl.py index 703c40b94f0c..a544bb08815a 100644 --- a/src/transformers/models/internvl/video_processing_internvl.py +++ b/src/transformers/models/internvl/video_processing_internvl.py @@ -28,7 +28,7 @@ class InternVLVideoProcessorInitKwargs(VideosKwargs, total=False): - initial_shift: Optional[Union[bool, float, int]] + initial_shift: Union[bool, float, int] class InternVLVideoProcessor(BaseVideoProcessor): diff --git a/src/transformers/models/janus/modular_janus.py b/src/transformers/models/janus/modular_janus.py index 36fe7abae438..6a1742b44362 100644 --- a/src/transformers/models/janus/modular_janus.py +++ b/src/transformers/models/janus/modular_janus.py @@ -1296,7 +1296,7 @@ class JanusImageProcessorKwargs(ImagesKwargs, total=False): falls below this value after resizing. 
""" - min_size: Optional[int] + min_size: int class JanusImageProcessor(BlipImageProcessor): diff --git a/src/transformers/models/kosmos2/processing_kosmos2.py b/src/transformers/models/kosmos2/processing_kosmos2.py index 2bc653ab3276..f9fb98df6ac2 100644 --- a/src/transformers/models/kosmos2/processing_kosmos2.py +++ b/src/transformers/models/kosmos2/processing_kosmos2.py @@ -38,12 +38,12 @@ class Kosmos2ImagesKwargs(ImagesKwargs, total=False): bboxes: Optional[NestedList] # NOTE: hub validators can't accept `Sequence` - num_image_tokens: Optional[int] + num_image_tokens: int first_image_token_id: Optional[int] class Kosmos2TextKwargs(TextKwargs, total=False): - add_eos_token: Optional[bool] + add_eos_token: bool class Kosmos2ProcessorKwargs(ProcessingKwargs, total=False): diff --git a/src/transformers/models/kosmos2_5/image_processing_kosmos2_5.py b/src/transformers/models/kosmos2_5/image_processing_kosmos2_5.py index ca80cb978ec9..fed17e08e1a7 100644 --- a/src/transformers/models/kosmos2_5/image_processing_kosmos2_5.py +++ b/src/transformers/models/kosmos2_5/image_processing_kosmos2_5.py @@ -55,8 +55,8 @@ class Kosmos2_5ImageProcessorKwargs(ImagesKwargs, total=False): [KOSMOS 2.5 paper](https://huggingface.co/papers/2309.11419). """ - patch_size: Optional[dict[str, int]] - max_patches: Optional[int] + patch_size: dict[str, int] + max_patches: int # Copied from transformers.models.pix2struct.image_processing_pix2struct.torch_extract_patches diff --git a/src/transformers/models/layoutlmv2/image_processing_layoutlmv2.py b/src/transformers/models/layoutlmv2/image_processing_layoutlmv2.py index 5e9289c2701b..6f53698f30b2 100644 --- a/src/transformers/models/layoutlmv2/image_processing_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/image_processing_layoutlmv2.py @@ -66,7 +66,7 @@ class LayoutLMv2ImageProcessorKwargs(ImagesKwargs, total=False): `preprocess` method. 
""" - apply_ocr: Optional[bool] + apply_ocr: bool ocr_lang: Optional[str] tesseract_config: Optional[str] diff --git a/src/transformers/models/layoutlmv3/image_processing_layoutlmv3.py b/src/transformers/models/layoutlmv3/image_processing_layoutlmv3.py index 9ba4f5507fc1..44d4b33e11d9 100644 --- a/src/transformers/models/layoutlmv3/image_processing_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/image_processing_layoutlmv3.py @@ -70,7 +70,7 @@ class LayoutLMv3ImageProcessorKwargs(ImagesKwargs, total=False): `preprocess` method. """ - apply_ocr: Optional[bool] + apply_ocr: bool ocr_lang: Optional[str] tesseract_config: Optional[str] diff --git a/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py b/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py index 3d16a4d0a273..85d8fcd11b92 100755 --- a/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +++ b/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py @@ -14,7 +14,7 @@ # limitations under the License. import math from functools import lru_cache -from typing import Optional, Union +from typing import Union import torch from torchvision.transforms.v2 import functional as F @@ -175,18 +175,18 @@ class Lfm2VlImageProcessorKwargs(ImagesKwargs, total=False): The downsampling factor for images used when resizing the image. 
""" - downsample_factor: Optional[int] - do_image_splitting: Optional[bool] - min_tiles: Optional[int] - max_tiles: Optional[int] - use_thumbnail: Optional[bool] - min_image_tokens: Optional[int] - max_image_tokens: Optional[int] - encoder_patch_size: Optional[int] - tile_size: Optional[int] - max_pixels_tolerance: Optional[float] - do_pad: Optional[bool] - return_row_col_info: Optional[bool] + downsample_factor: int + do_image_splitting: bool + min_tiles: int + max_tiles: int + use_thumbnail: bool + min_image_tokens: int + max_image_tokens: int + encoder_patch_size: int + tile_size: int + max_pixels_tolerance: float + do_pad: bool + return_row_col_info: bool @auto_docstring diff --git a/src/transformers/models/lfm2_vl/processing_lfm2_vl.py b/src/transformers/models/lfm2_vl/processing_lfm2_vl.py index b00ee3d05d04..311dfdc3b123 100755 --- a/src/transformers/models/lfm2_vl/processing_lfm2_vl.py +++ b/src/transformers/models/lfm2_vl/processing_lfm2_vl.py @@ -18,7 +18,6 @@ from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput, make_nested_list_of_images from ...processing_utils import ( - ImagesKwargs, ProcessingKwargs, ProcessorMixin, TextKwargs, @@ -31,30 +30,12 @@ logger = logging.get_logger(__name__) -class Lfm2VlImagesKwargs(ImagesKwargs, total=False): - downsample_factor: Optional[int] - do_image_splitting: Optional[bool] - min_tiles: Optional[int] - max_tiles: Optional[int] - use_thumbnail: Optional[bool] - min_image_tokens: Optional[int] - max_image_tokens: Optional[int] - encoder_patch_size: Optional[int] - tile_size: Optional[int] - max_pixels_tolerance: Optional[float] - patch_size: Optional[int] - do_pad: Optional[bool] - return_row_col_info: Optional[bool] - - class Lfm2VlTextKwargs(TextKwargs, total=False): use_image_special_tokens: Optional[bool] class Lfm2VlProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: Lfm2VlImagesKwargs text_kwargs: Lfm2VlTextKwargs - _defaults = { "images_kwargs": { 
"return_row_col_info": True, diff --git a/src/transformers/models/llama4/image_processing_llama4_fast.py b/src/transformers/models/llama4/image_processing_llama4_fast.py index 42b2221d6a18..ccbb60585b0b 100644 --- a/src/transformers/models/llama4/image_processing_llama4_fast.py +++ b/src/transformers/models/llama4/image_processing_llama4_fast.py @@ -320,8 +320,8 @@ class Llama4ImageProcessorKwargs(ImagesKwargs, total=False): but never upsample, unless the image is smaller than the patch size. """ - max_patches: Optional[int] - resize_to_max_canvas: Optional[bool] + max_patches: int + resize_to_max_canvas: bool @auto_docstring diff --git a/src/transformers/models/llava_next/image_processing_llava_next.py b/src/transformers/models/llava_next/image_processing_llava_next.py index 26ca94dad6e9..c4bc1ed07287 100644 --- a/src/transformers/models/llava_next/image_processing_llava_next.py +++ b/src/transformers/models/llava_next/image_processing_llava_next.py @@ -67,7 +67,7 @@ class LlavaNextImageProcessorKwargs(ImagesKwargs, total=False): method. """ - image_grid_pinpoints: Optional[list[list[int]]] + image_grid_pinpoints: list[list[int]] def divide_to_patches(image: np.ndarray, patch_size: int, input_data_format) -> list[np.ndarray]: diff --git a/src/transformers/models/llava_onevision/image_processing_llava_onevision.py b/src/transformers/models/llava_onevision/image_processing_llava_onevision.py index 3654edcdbf71..4b0f399e4959 100644 --- a/src/transformers/models/llava_onevision/image_processing_llava_onevision.py +++ b/src/transformers/models/llava_onevision/image_processing_llava_onevision.py @@ -66,7 +66,7 @@ class LlavaOnevisionImageProcessorKwargs(ImagesKwargs, total=False): method. 
""" - image_grid_pinpoints: Optional[list[list[int]]] + image_grid_pinpoints: list[list[int]] # Copied from transformers.models.llava_next.image_processing_llava_next.divide_to_patches diff --git a/src/transformers/models/mask2former/image_processing_mask2former.py b/src/transformers/models/mask2former/image_processing_mask2former.py index 235dbe8039f1..79b449eae416 100644 --- a/src/transformers/models/mask2former/image_processing_mask2former.py +++ b/src/transformers/models/mask2former/image_processing_mask2former.py @@ -74,9 +74,9 @@ class Mask2FormerImageProcessorKwargs(ImagesKwargs, total=False): The number of labels in the segmentation map. """ - size_divisor: Optional[int] + size_divisor: int ignore_index: Optional[int] - do_reduce_labels: Optional[bool] + do_reduce_labels: bool num_labels: Optional[int] diff --git a/src/transformers/models/maskformer/image_processing_maskformer.py b/src/transformers/models/maskformer/image_processing_maskformer.py index 4046fcafb07a..7d83809ced66 100644 --- a/src/transformers/models/maskformer/image_processing_maskformer.py +++ b/src/transformers/models/maskformer/image_processing_maskformer.py @@ -80,9 +80,9 @@ class MaskFormerImageProcessorKwargs(ImagesKwargs, total=False): The number of labels in the segmentation map. """ - size_divisor: Optional[int] + size_divisor: int ignore_index: Optional[int] - do_reduce_labels: Optional[bool] + do_reduce_labels: bool num_labels: Optional[int] diff --git a/src/transformers/models/mllama/image_processing_mllama.py b/src/transformers/models/mllama/image_processing_mllama.py index 50579703b905..1a1d76774868 100644 --- a/src/transformers/models/mllama/image_processing_mllama.py +++ b/src/transformers/models/mllama/image_processing_mllama.py @@ -56,7 +56,7 @@ class MllamaImageProcessorKwargs(ImagesKwargs, total=False): The maximum number of tiles allowed. 
""" - max_image_tiles: Optional[int] + max_image_tiles: int @lru_cache(maxsize=10) diff --git a/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py b/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py index ad4c6937b76a..876d9c6be444 100644 --- a/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py +++ b/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py @@ -59,7 +59,7 @@ class MobileNetV2ImageProcessorKwargs(ImagesKwargs, total=False): ADE20k). The background label will be replaced by 255. """ - do_reduce_labels: Optional[bool] + do_reduce_labels: bool @requires(backends=("vision",)) diff --git a/src/transformers/models/mobilevit/image_processing_mobilevit.py b/src/transformers/models/mobilevit/image_processing_mobilevit.py index 8a914608295f..0a9b6bc64423 100644 --- a/src/transformers/models/mobilevit/image_processing_mobilevit.py +++ b/src/transformers/models/mobilevit/image_processing_mobilevit.py @@ -63,8 +63,8 @@ class MobileVitImageProcessorKwargs(ImagesKwargs, total=False): ADE20k). The background label will be replaced by 255. """ - do_flip_channel_order: Optional[bool] - do_reduce_labels: Optional[bool] + do_flip_channel_order: bool + do_reduce_labels: bool @requires(backends=("vision",)) diff --git a/src/transformers/models/nougat/image_processing_nougat.py b/src/transformers/models/nougat/image_processing_nougat.py index 35db47bdf0c9..a9178ab43e07 100644 --- a/src/transformers/models/nougat/image_processing_nougat.py +++ b/src/transformers/models/nougat/image_processing_nougat.py @@ -62,9 +62,9 @@ class NougatImageProcessorKwargs(ImagesKwargs, total=False): Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees. 
""" - do_crop_margin: Optional[bool] - do_thumbnail: Optional[bool] - do_align_long_axis: Optional[bool] + do_crop_margin: bool + do_thumbnail: bool + do_align_long_axis: bool class NougatImageProcessor(BaseImageProcessor): diff --git a/src/transformers/models/oneformer/image_processing_oneformer.py b/src/transformers/models/oneformer/image_processing_oneformer.py index 058c655559a8..00d4989fdf28 100644 --- a/src/transformers/models/oneformer/image_processing_oneformer.py +++ b/src/transformers/models/oneformer/image_processing_oneformer.py @@ -85,7 +85,7 @@ class OneFormerImageProcessorKwargs(ImagesKwargs, total=False): num_text: Optional[int] num_labels: Optional[int] ignore_index: Optional[int] - do_reduce_labels: Optional[bool] + do_reduce_labels: bool # Copied from transformers.models.detr.image_processing_detr.max_across_indices diff --git a/src/transformers/models/ovis2/image_processing_ovis2.py b/src/transformers/models/ovis2/image_processing_ovis2.py index ab4ae57f0cb8..4598e9f3f521 100644 --- a/src/transformers/models/ovis2/image_processing_ovis2.py +++ b/src/transformers/models/ovis2/image_processing_ovis2.py @@ -61,10 +61,10 @@ class Ovis2ImageProcessorKwargs(ImagesKwargs, total=False): `preprocess` method. 
""" - crop_to_patches: Optional[bool] - min_patches: Optional[int] - max_patches: Optional[int] - use_covering_area_grid: Optional[bool] + crop_to_patches: bool + min_patches: int + max_patches: int + use_covering_area_grid: bool # Similar to image_processing_mllama.get_all_supported_aspect_ratios diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py index 8409be51e034..03ff515e63af 100644 --- a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py +++ b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py @@ -54,8 +54,8 @@ class PerceptionLMImageProcessorKwargs(ImagesKwargs, total=False): """ vision_input_type: Optional[str] - tile_size: Optional[int] - max_num_tiles: Optional[int] + tile_size: int + max_num_tiles: int @auto_docstring diff --git a/src/transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py b/src/transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py index 6dcbf61f38f6..98f160a1fd5e 100644 --- a/src/transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +++ b/src/transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py @@ -43,8 +43,8 @@ class Phi4MultimodalImageProcessorKwargs(ImagesKwargs, total=False): The maximum number of crops per image. """ - patch_size: Optional[int] - dynamic_hd: Optional[int] + patch_size: int + dynamic_hd: int @auto_docstring diff --git a/src/transformers/models/pix2struct/image_processing_pix2struct.py b/src/transformers/models/pix2struct/image_processing_pix2struct.py index 66cc7cd0b04a..3ec36ebda440 100644 --- a/src/transformers/models/pix2struct/image_processing_pix2struct.py +++ b/src/transformers/models/pix2struct/image_processing_pix2struct.py @@ -57,7 +57,7 @@ class Pix2StructImageProcessorKwargs(ImagesKwargs, total=False): Text to render as a header. 
Only has an effect if `image_processor.is_vqa` is `True`. """ - max_patches: Optional[int] + max_patches: int header_text: Optional[Union[list[str], str]] diff --git a/src/transformers/models/pixtral/image_processing_pixtral.py b/src/transformers/models/pixtral/image_processing_pixtral.py index 387c82f4e0a0..3cbfaeb41922 100644 --- a/src/transformers/models/pixtral/image_processing_pixtral.py +++ b/src/transformers/models/pixtral/image_processing_pixtral.py @@ -56,7 +56,7 @@ class PixtralImageProcessorKwargs(ImagesKwargs, total=False): Size of the patches in the model, used to calculate the output image size. Can be overridden by `patch_size` in the `preprocess` method. """ - patch_size: Optional[Union[dict[str, int], int]] + patch_size: Union[dict[str, int], int] # Adapted from function in image_transforms.py to ensure any transparent pixels are converted to white. diff --git a/src/transformers/models/poolformer/image_processing_poolformer.py b/src/transformers/models/poolformer/image_processing_poolformer.py index c7bb38c4340c..8d466739638d 100644 --- a/src/transformers/models/poolformer/image_processing_poolformer.py +++ b/src/transformers/models/poolformer/image_processing_poolformer.py @@ -54,7 +54,7 @@ class PoolFormerImageProcessorKwargs(ImagesKwargs, total=False): Percentage of the image to crop. Only has an effect if `do_resize` is set to `True`. 
""" - crop_pct: Optional[float] + crop_pct: float class PoolFormerImageProcessor(BaseImageProcessor): diff --git a/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py b/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py index cb8c0e37af5f..b62ba7994f0a 100644 --- a/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py +++ b/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py @@ -64,10 +64,10 @@ class PromptDepthAnythingImageProcessorKwargs(ImagesKwargs, total=False): Scale factor to convert the prompt depth to meters. """ - keep_aspect_ratio: Optional[bool] - ensure_multiple_of: Optional[int] - size_divisor: Optional[int] - prompt_scale_to_meter: Optional[float] + keep_aspect_ratio: bool + ensure_multiple_of: int + size_divisor: int + prompt_scale_to_meter: float def _constrain_to_multiple_of(val, multiple, min_val=0, max_val=None): diff --git a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py index f522f63beb33..ea60155999e6 100644 --- a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py @@ -33,16 +33,16 @@ # Redefine kwargs for videos because Qwen-Omni uses some kwargs for processing omni # and does not use them in video processor class class Qwen2_5_OmniVideosKwargs(VideosKwargs, total=False): - min_pixels: Optional[int] - max_pixels: Optional[int] - patch_size: Optional[int] - temporal_patch_size: Optional[int] - merge_size: Optional[int] - min_frames: Optional[int] - max_frames: Optional[int] - use_audio_in_video: Optional[bool] - seconds_per_chunk: Optional[float] - position_id_per_seconds: Optional[Union[int, float]] + min_pixels: int + max_pixels: int + patch_size: int + temporal_patch_size: int + merge_size: int + min_frames: int + max_frames: int + 
use_audio_in_video: bool + seconds_per_chunk: float + position_id_per_seconds: Union[int, float] class Qwen2_5OmniProcessorKwargs(ProcessingKwargs, total=False): diff --git a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py index 82b7bce43fe9..e5a1e0a7551e 100644 --- a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py @@ -66,11 +66,11 @@ class Qwen2VLImageProcessorKwargs(ImagesKwargs, total=False): The merge size of the vision encoder to llm encoder. """ - min_pixels: Optional[int] - max_pixels: Optional[int] - patch_size: Optional[int] - temporal_patch_size: Optional[int] - merge_size: Optional[int] + min_pixels: int + max_pixels: int + patch_size: int + temporal_patch_size: int + merge_size: int def smart_resize( diff --git a/src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py b/src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py index 2fc8bf1ac5b3..11b5ff80dade 100644 --- a/src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/video_processing_qwen2_vl.py @@ -42,13 +42,13 @@ class Qwen2VLVideoProcessorInitKwargs(VideosKwargs, total=False): - min_pixels: Optional[int] - max_pixels: Optional[int] - patch_size: Optional[int] - temporal_patch_size: Optional[int] - merge_size: Optional[int] - min_frames: Optional[int] - max_frames: Optional[int] + min_pixels: int + max_pixels: int + patch_size: int + temporal_patch_size: int + merge_size: int + min_frames: int + max_frames: int @add_start_docstrings( diff --git a/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py index ba54b3a0c1f6..df5629931fa3 100644 --- a/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py @@ -20,7 +20,7 
@@ # See the License for the specific language governing permissions and # limitations under the License. import re -from typing import Optional, Union +from typing import Union import numpy as np @@ -35,16 +35,16 @@ # Redefine kwargs for videos because Qwen-Omni uses some kwargs for processing omni # and does not use them in video processor class class Qwen3OmniMoeVideosKwargs(VideosKwargs, total=False): - min_pixels: Optional[int] - max_pixels: Optional[int] - patch_size: Optional[int] - temporal_patch_size: Optional[int] - merge_size: Optional[int] - min_frames: Optional[int] - max_frames: Optional[int] - use_audio_in_video: Optional[bool] - seconds_per_chunk: Optional[float] - position_id_per_seconds: Optional[Union[int, float]] + min_pixels: int + max_pixels: int + patch_size: int + temporal_patch_size: int + merge_size: int + min_frames: int + max_frames: int + use_audio_in_video: bool + seconds_per_chunk: float + position_id_per_seconds: Union[int, float] class Qwen3OmniMoeProcessorKwargs(ProcessingKwargs, total=False): diff --git a/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py b/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py index 3bb06d7f2b08..e74f55b642dd 100644 --- a/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py @@ -65,11 +65,11 @@ def smart_resize( class Qwen3VLVideoProcessorInitKwargs(VideosKwargs, total=False): - patch_size: Optional[int] - temporal_patch_size: Optional[int] - merge_size: Optional[int] - min_frames: Optional[int] - max_frames: Optional[int] + patch_size: int + temporal_patch_size: int + merge_size: int + min_frames: int + max_frames: int @add_start_docstrings( diff --git a/src/transformers/models/rt_detr/image_processing_rt_detr.py b/src/transformers/models/rt_detr/image_processing_rt_detr.py index 5e91e3c4fc01..b366ca62fabf 100644 --- a/src/transformers/models/rt_detr/image_processing_rt_detr.py +++ 
b/src/transformers/models/rt_detr/image_processing_rt_detr.py @@ -84,9 +84,9 @@ class RTDetrImageProcessorKwargs(ImagesKwargs, total=False): Path to the directory containing the segmentation masks. """ - format: Optional[Union[str, AnnotationFormat]] - do_convert_annotations: Optional[bool] - return_segmentation_masks: Optional[bool] + format: Union[str, AnnotationFormat] + do_convert_annotations: bool + return_segmentation_masks: bool annotations: Optional[Union[AnnotationType, list[AnnotationType]]] masks_path: Optional[Union[str, pathlib.Path]] diff --git a/src/transformers/models/sam/image_processing_sam.py b/src/transformers/models/sam/image_processing_sam.py index 3ba27f63d993..eb2615b3e963 100644 --- a/src/transformers/models/sam/image_processing_sam.py +++ b/src/transformers/models/sam/image_processing_sam.py @@ -67,8 +67,8 @@ class SamImageProcessorKwargs(ImagesKwargs, total=False): map size provided for preprocessing. """ - mask_size: Optional[dict[str, int]] - mask_pad_size: Optional[dict[str, int]] + mask_size: dict[str, int] + mask_pad_size: dict[str, int] class SamImageProcessor(BaseImageProcessor): diff --git a/src/transformers/models/sam/processing_sam.py b/src/transformers/models/sam/processing_sam.py index 18d812fc6825..d6cdd2ab2653 100644 --- a/src/transformers/models/sam/processing_sam.py +++ b/src/transformers/models/sam/processing_sam.py @@ -36,9 +36,9 @@ class SamImagesKwargs(ImagesKwargs, total=False): input_points: Optional[list[list[float]]] input_labels: Optional[list[list[int]]] input_boxes: Optional[list[list[list[float]]]] - point_pad_value: Optional[int] - mask_size: Optional[dict[str, int]] - mask_pad_size: Optional[dict[str, int]] + point_pad_value: int + mask_size: dict[str, int] + mask_pad_size: dict[str, int] class SamProcessorKwargs(ProcessingKwargs, total=False): diff --git a/src/transformers/models/sam2/image_processing_sam2_fast.py b/src/transformers/models/sam2/image_processing_sam2_fast.py index b00fba952973..014354d8c642 
100644 --- a/src/transformers/models/sam2/image_processing_sam2_fast.py +++ b/src/transformers/models/sam2/image_processing_sam2_fast.py @@ -49,7 +49,7 @@ class Sam2FastImageProcessorKwargs(ImagesKwargs, total=False): The size `{"height": int, "width": int}` to resize the segmentation maps to. """ - mask_size: Optional[dict[str, int]] + mask_size: dict[str, int] def _compute_stability_score(masks: "torch.Tensor", mask_threshold: float, stability_score_offset: int): diff --git a/src/transformers/models/sam2/modular_sam2.py b/src/transformers/models/sam2/modular_sam2.py index eac3cc232c1a..d451fc946e6d 100644 --- a/src/transformers/models/sam2/modular_sam2.py +++ b/src/transformers/models/sam2/modular_sam2.py @@ -76,7 +76,7 @@ class Sam2FastImageProcessorKwargs(ImagesKwargs, total=False): The size `{"height": int, "width": int}` to resize the segmentation maps to. """ - mask_size: Optional[dict[str, int]] + mask_size: dict[str, int] @auto_docstring diff --git a/src/transformers/models/sam_hq/processing_samhq.py b/src/transformers/models/sam_hq/processing_samhq.py index 8e1945971ebc..2c3b36d98c0a 100644 --- a/src/transformers/models/sam_hq/processing_samhq.py +++ b/src/transformers/models/sam_hq/processing_samhq.py @@ -36,9 +36,9 @@ class SamHQImagesKwargs(ImagesKwargs, total=False): input_points: Optional[list[list[float]]] input_labels: Optional[list[list[int]]] input_boxes: Optional[list[list[list[float]]]] - point_pad_value: Optional[int] - mask_size: Optional[dict[str, int]] - mask_pad_size: Optional[dict[str, int]] + point_pad_value: int + mask_size: dict[str, int] + mask_pad_size: dict[str, int] class SamHQProcessorKwargs(ProcessingKwargs, total=False): diff --git a/src/transformers/models/segformer/image_processing_segformer.py b/src/transformers/models/segformer/image_processing_segformer.py index bedb4ff54651..ede9d589294b 100644 --- a/src/transformers/models/segformer/image_processing_segformer.py +++ 
b/src/transformers/models/segformer/image_processing_segformer.py @@ -63,7 +63,7 @@ class SegformerImageProcessorKwargs(ImagesKwargs, total=False): ADE20k). The background label will be replaced by 255. """ - do_reduce_labels: Optional[bool] + do_reduce_labels: bool @requires(backends=("vision",)) diff --git a/src/transformers/models/siglip2/image_processing_siglip2.py b/src/transformers/models/siglip2/image_processing_siglip2.py index d71b66464223..85063fc9078a 100644 --- a/src/transformers/models/siglip2/image_processing_siglip2.py +++ b/src/transformers/models/siglip2/image_processing_siglip2.py @@ -57,8 +57,8 @@ class Siglip2ImageProcessorKwargs(ImagesKwargs, total=False): and then padded in "patch" dimension to match this number exactly. """ - patch_size: Optional[int] - max_num_patches: Optional[int] + patch_size: int + max_num_patches: int @lru_cache(maxsize=256) diff --git a/src/transformers/models/smolvlm/image_processing_smolvlm.py b/src/transformers/models/smolvlm/image_processing_smolvlm.py index c12c08182a94..a946cc0c191b 100644 --- a/src/transformers/models/smolvlm/image_processing_smolvlm.py +++ b/src/transformers/models/smolvlm/image_processing_smolvlm.py @@ -64,9 +64,9 @@ class SmolVLMImageProcessorKwargs(ImagesKwargs, total=False): Whether to return the row and column information of the images. 
""" - do_image_splitting: Optional[bool] - max_image_size: Optional[dict[str, int]] - return_row_col_info: Optional[bool] + do_image_splitting: bool + max_image_size: dict[str, int] + return_row_col_info: bool MAX_IMAGE_SIZE = 4096 # 4k resolution as absolute maximum diff --git a/src/transformers/models/smolvlm/video_processing_smolvlm.py b/src/transformers/models/smolvlm/video_processing_smolvlm.py index d8cecb6c0c5c..09751486f0ae 100644 --- a/src/transformers/models/smolvlm/video_processing_smolvlm.py +++ b/src/transformers/models/smolvlm/video_processing_smolvlm.py @@ -91,7 +91,7 @@ def get_resize_output_image_size( class SmolVLMVideoProcessorInitKwargs(VideosKwargs, total=False): - max_image_size: Optional[dict[str, int]] + max_image_size: dict[str, int] class SmolVLMVideoProcessor(BaseVideoProcessor): diff --git a/src/transformers/models/superpoint/image_processing_superpoint.py b/src/transformers/models/superpoint/image_processing_superpoint.py index 9c810c450ad7..57b1a9dc6cb1 100644 --- a/src/transformers/models/superpoint/image_processing_superpoint.py +++ b/src/transformers/models/superpoint/image_processing_superpoint.py @@ -52,7 +52,7 @@ class SuperPointImageProcessorKwargs(ImagesKwargs, total=False): Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method. 
""" - do_grayscale: Optional[bool] = True + do_grayscale: bool def is_grayscale( diff --git a/src/transformers/models/swin2sr/image_processing_swin2sr.py b/src/transformers/models/swin2sr/image_processing_swin2sr.py index d9de6c684959..0ba052e92e05 100644 --- a/src/transformers/models/swin2sr/image_processing_swin2sr.py +++ b/src/transformers/models/swin2sr/image_processing_swin2sr.py @@ -39,7 +39,7 @@ class Swin2SRImageProcessorKwargs(ImagesKwargs, total=False): - size_divisor: Optional[int] + size_divisor: int class Swin2SRImageProcessor(BaseImageProcessor): diff --git a/src/transformers/models/textnet/image_processing_textnet.py b/src/transformers/models/textnet/image_processing_textnet.py index e5e127d987e0..bd7aa6f5086e 100644 --- a/src/transformers/models/textnet/image_processing_textnet.py +++ b/src/transformers/models/textnet/image_processing_textnet.py @@ -50,7 +50,7 @@ class TextNetImageProcessorKwargs(ImagesKwargs, total=False): - size_divisor: Optional[int] + size_divisor: int class TextNetImageProcessor(BaseImageProcessor): diff --git a/src/transformers/models/tvp/image_processing_tvp.py b/src/transformers/models/tvp/image_processing_tvp.py index 42834287e110..d1ae5c374b4b 100644 --- a/src/transformers/models/tvp/image_processing_tvp.py +++ b/src/transformers/models/tvp/image_processing_tvp.py @@ -60,7 +60,7 @@ class TvpImageProcessorKwargs(ImagesKwargs, total=False): Padding mode to use — `'constant'`, `'edge'`, `'reflect'`, or `'symmetric'`. 
""" - do_flip_channel_order: Optional[bool] + do_flip_channel_order: bool constant_values: Optional[Union[float, list[float]]] pad_mode: Optional[str] diff --git a/src/transformers/models/vilt/image_processing_vilt.py b/src/transformers/models/vilt/image_processing_vilt.py index fadb5302d4ee..5c1b2acf6e4b 100644 --- a/src/transformers/models/vilt/image_processing_vilt.py +++ b/src/transformers/models/vilt/image_processing_vilt.py @@ -48,7 +48,7 @@ class ViltImageProcessorKwargs(ImagesKwargs, total=False): - size_divisor: Optional[int] + size_divisor: int def max_across_indices(values: Iterable[Any]) -> list[Any]: diff --git a/src/transformers/models/vitmatte/image_processing_vitmatte.py b/src/transformers/models/vitmatte/image_processing_vitmatte.py index eb994b641962..ea54ba603435 100644 --- a/src/transformers/models/vitmatte/image_processing_vitmatte.py +++ b/src/transformers/models/vitmatte/image_processing_vitmatte.py @@ -42,7 +42,7 @@ class VitMatteImageProcessorKwargs(ImagesKwargs, total=False): - size_divisor: Optional[int] + size_divisor: int class VitMatteImageProcessor(BaseImageProcessor): diff --git a/src/transformers/models/yolos/image_processing_yolos.py b/src/transformers/models/yolos/image_processing_yolos.py index 19fda87897ae..b594c296707b 100644 --- a/src/transformers/models/yolos/image_processing_yolos.py +++ b/src/transformers/models/yolos/image_processing_yolos.py @@ -97,9 +97,9 @@ class YolosImageProcessorKwargs(ImagesKwargs, total=False): Path to the directory containing the segmentation masks. 
""" - format: Optional[Union[str, AnnotationFormat]] - do_convert_annotations: Optional[bool] - return_segmentation_masks: Optional[bool] + format: Union[str, AnnotationFormat] + do_convert_annotations: bool + return_segmentation_masks: bool annotations: Optional[Union[AnnotationType, list[AnnotationType]]] masks_path: Optional[Union[str, pathlib.Path]] diff --git a/src/transformers/models/zoedepth/image_processing_zoedepth.py b/src/transformers/models/zoedepth/image_processing_zoedepth.py index 3fdf414bc20c..d94a2ee088eb 100644 --- a/src/transformers/models/zoedepth/image_processing_zoedepth.py +++ b/src/transformers/models/zoedepth/image_processing_zoedepth.py @@ -77,8 +77,8 @@ class ZoeDepthImageProcessorKwargs(ImagesKwargs, total=False): Can be overridden by `ensure_multiple_of` in `preprocess`. """ - keep_aspect_ratio: Optional[bool] - ensure_multiple_of: Optional[int] + keep_aspect_ratio: bool + ensure_multiple_of: int def get_resize_output_image_size( diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index ebd5ab276c9d..55844c8d9cce 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -14,7 +14,6 @@ """ Processing saving/loading class for common processors. 
""" -# from __future__ import annotations import bisect import copy From 774c2603e748f57e5708126f36dd136300fb39be Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 8 Oct 2025 16:01:13 +0200 Subject: [PATCH 28/28] frigit to fix copies --- src/transformers/models/aria/processing_aria.py | 6 +++--- .../image_processing_cohere2_vision_fast.py | 6 +++--- .../image_processing_deepseek_vl_hybrid.py | 10 +++++----- .../models/janus/image_processing_janus.py | 2 +- src/transformers/models/sam_hq/processing_samhq.py | 2 +- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/aria/processing_aria.py b/src/transformers/models/aria/processing_aria.py index ee8a8fd3f9ef..d0841c96aee2 100644 --- a/src/transformers/models/aria/processing_aria.py +++ b/src/transformers/models/aria/processing_aria.py @@ -31,9 +31,9 @@ class AriaImagesKwargs(ImagesKwargs, total=False): - split_image: Optional[bool] - max_image_size: Optional[int] - min_image_size: Optional[int] + split_image: bool + max_image_size: int + min_image_size: int class AriaProcessorKwargs(ProcessingKwargs, total=False): diff --git a/src/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py b/src/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py index 638b7549bfae..afdd683e2312 100644 --- a/src/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +++ b/src/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py @@ -46,9 +46,9 @@ class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs, total=False): set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method. 
""" - crop_to_patches: Optional[bool] - min_patches: Optional[int] - max_patches: Optional[int] + crop_to_patches: bool + min_patches: int + max_patches: int @lru_cache(maxsize=10) diff --git a/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py index 7837cff2d33b..c91aab91fca5 100644 --- a/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py @@ -69,11 +69,11 @@ class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs, total=False): number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method. """ - min_size: Optional[int] - high_res_size: Optional[dict] - high_res_resample: Optional[Union["PILImageResampling", int]] - high_res_image_mean: Optional[Union[float, list[float], tuple[float, ...]]] - high_res_image_std: Optional[Union[float, list[float], tuple[float, ...]]] + min_size: int + high_res_size: dict + high_res_resample: Union["PILImageResampling", int] + high_res_image_mean: Union[float, list[float], tuple[float, ...]] + high_res_image_std: Union[float, list[float], tuple[float, ...]] class DeepseekVLHybridImageProcessor(BaseImageProcessor): diff --git a/src/transformers/models/janus/image_processing_janus.py b/src/transformers/models/janus/image_processing_janus.py index 6b83ecf8eb5c..c47461174516 100644 --- a/src/transformers/models/janus/image_processing_janus.py +++ b/src/transformers/models/janus/image_processing_janus.py @@ -58,7 +58,7 @@ class JanusImageProcessorKwargs(ImagesKwargs, total=False): falls below this value after resizing. 
""" - min_size: Optional[int] + min_size: int class JanusImageProcessor(BaseImageProcessor): diff --git a/src/transformers/models/sam_hq/processing_samhq.py b/src/transformers/models/sam_hq/processing_samhq.py index 2c3b36d98c0a..d0b11ab06146 100644 --- a/src/transformers/models/sam_hq/processing_samhq.py +++ b/src/transformers/models/sam_hq/processing_samhq.py @@ -36,7 +36,7 @@ class SamHQImagesKwargs(ImagesKwargs, total=False): input_points: Optional[list[list[float]]] input_labels: Optional[list[list[int]]] input_boxes: Optional[list[list[list[float]]]] - point_pad_value: int + point_pad_value: Optional[int] mask_size: dict[str, int] mask_pad_size: dict[str, int]