Skip to content
Merged
Show file tree
Hide file tree
Changes from 27 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
762b651
initial design draft
zucchini-nlp Sep 10, 2025
02e22c6
delete
zucchini-nlp Sep 10, 2025
e744875
fix a few tests
zucchini-nlp Sep 10, 2025
63532bf
fix
zucchini-nlp Sep 10, 2025
1f62d6f
fix the rest of tests
zucchini-nlp Sep 11, 2025
c203ffd
common-kwargs
zucchini-nlp Sep 11, 2025
725a479
why the runner complains about typing with "|"?
zucchini-nlp Sep 11, 2025
d8ca683
revert
zucchini-nlp Sep 11, 2025
8ff15f7
forgot to delete
zucchini-nlp Sep 11, 2025
b0e8120
update
zucchini-nlp Sep 11, 2025
9f761c6
fix last issues
zucchini-nlp Sep 11, 2025
f935cff
add more details in docs
zucchini-nlp Sep 16, 2025
e6a77d8
pin the latest hub release
zucchini-nlp Sep 24, 2025
01841b3
merge main
zucchini-nlp Sep 24, 2025
5a42630
fix tests for new models
zucchini-nlp Sep 24, 2025
fe4ba56
also fast image processor
zucchini-nlp Sep 24, 2025
6e8d77e
fix copies
zucchini-nlp Sep 24, 2025
ba41992
image processing ast validated
zucchini-nlp Sep 25, 2025
601985c
Merge remote-tracking branch 'upstream/main' into validate-processor-…
zucchini-nlp Sep 25, 2025
3233a70
fix more tests
zucchini-nlp Sep 25, 2025
909b98e
typo and fix copies
zucchini-nlp Sep 25, 2025
9b0bc0c
Merge branch 'main' into validate-processor-kwargs
zucchini-nlp Sep 25, 2025
4410dd3
bump
zucchini-nlp Sep 25, 2025
121931c
merge main
zucchini-nlp Oct 3, 2025
1daa883
style
zucchini-nlp Oct 3, 2025
bd902fb
Merge remote-tracking branch 'upstream/main' into validate-processor-…
zucchini-nlp Oct 7, 2025
b8385a2
fix some tests
zucchini-nlp Oct 7, 2025
69448bb
fix copies
zucchini-nlp Oct 8, 2025
d253615
pin rc4 and mark all TypedDict as non-total
zucchini-nlp Oct 8, 2025
0c52d03
Merge branch 'main' into validate-processor-kwargs
zucchini-nlp Oct 8, 2025
7a4e79f
delete typed dict adaptor
zucchini-nlp Oct 8, 2025
0395b54
address comments
zucchini-nlp Oct 8, 2025
34c9ec7
delete optionals
zucchini-nlp Oct 8, 2025
774c260
forgot to fix copies
zucchini-nlp Oct 8, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions src/transformers/image_processing_utils_fast.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
logging,
)
from .utils.import_utils import is_rocm_platform
from .utils.type_validators import TypedDictAdapter


if is_vision_available():
Expand Down Expand Up @@ -710,6 +711,11 @@ def _validate_preprocess_kwargs(
def preprocess(self, images: ImageInput, *args, **kwargs: Unpack[ImagesKwargs]) -> BatchFeature:
# args are not validated, but their order in the `preprocess` and `_preprocess` signatures must be the same
validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_kwargs_names)

# Perform type validation on received kwargs
type_validator = TypedDictAdapter(self.valid_kwargs)
type_validator.validate_fields(**kwargs)

# Set default kwargs from self. This ensures that if a kwarg is not provided
# by the user, it gets its default value from the instance, or is set to None.
for kwarg_name in self._valid_kwargs_names:
Expand Down
10 changes: 9 additions & 1 deletion src/transformers/models/aria/modular_aria.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
)
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_utils import PreTrainedModel
from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils import PreTokenizedInput, TextInput
from ...utils import TensorType, TransformersKwargs, auto_docstring, can_return_tuple, logging
from ..auto import CONFIG_MAPPING, AutoConfig, AutoTokenizer
Expand Down Expand Up @@ -904,7 +904,15 @@ def get_number_of_image_patches(self, height: int, width: int, images_kwargs=Non
return num_patches


class AriaImagesKwargs(ImagesKwargs, total=False):
split_image: Optional[bool]
max_image_size: Optional[int]
min_image_size: Optional[int]
Comment thread
zucchini-nlp marked this conversation as resolved.
Outdated


Comment on lines +907 to +912

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

was missing, Aria has model-specific kwargs

class AriaProcessorKwargs(ProcessingKwargs, total=False):
images_kwargs: AriaImagesKwargs

_defaults = {
"text_kwargs": {
"padding": False,
Expand Down
10 changes: 9 additions & 1 deletion src/transformers/models/aria/processing_aria.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,21 @@

from ...image_processing_utils import BatchFeature
from ...image_utils import ImageInput
from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils import PreTokenizedInput, TextInput
from ...utils import TensorType
from ..auto import AutoTokenizer


class AriaImagesKwargs(ImagesKwargs, total=False):
split_image: Optional[bool]
max_image_size: Optional[int]
min_image_size: Optional[int]


class AriaProcessorKwargs(ProcessingKwargs, total=False):
images_kwargs: AriaImagesKwargs

_defaults = {
"text_kwargs": {
"padding": False,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ class DeepseekVLImageProcessorKwargs(ImagesKwargs):
falls below this value after resizing.
"""

min_size: int
min_size: Optional[int]


class DeepseekVLImageProcessor(BaseImageProcessor):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,11 +69,11 @@ class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs):
number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method.
"""

min_size: int
high_res_size: dict
high_res_resample: "PILImageResampling"
high_res_image_mean: list[float]
high_res_image_std: list[float]
min_size: Optional[int]
high_res_size: Optional[dict]
high_res_resample: Optional[Union["PILImageResampling", int]]
high_res_image_mean: Optional[Union[float, list[float], tuple[float, ...]]]
high_res_image_std: Optional[Union[float, list[float], tuple[float, ...]]]


class DeepseekVLHybridImageProcessor(BaseImageProcessor):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -448,11 +448,11 @@ class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs):
number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method.
"""

min_size: int
high_res_size: dict
high_res_resample: "PILImageResampling"
high_res_image_mean: list[float]
high_res_image_std: list[float]
min_size: Optional[int]
high_res_size: Optional[dict]
high_res_resample: Optional[Union["PILImageResampling", int]]
high_res_image_mean: Optional[Union[float, list[float], tuple[float, ...]]]
high_res_image_std: Optional[Union[float, list[float], tuple[float, ...]]]


class DeepseekVLHybridImageProcessor(DeepseekVLImageProcessor):
Expand Down
4 changes: 3 additions & 1 deletion src/transformers/models/dia/processing_dia.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,9 @@ class DiaProcessorKwargs(ProcessingKwargs, total=False):
"generation": True,
"sampling_rate": 44100,
},
"common_kwargs": {"return_tensors": "pt"},
"common_kwargs": {
"return_tensors": "pt",
},
}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ class EfficientNetImageProcessorKwargs(ImagesKwargs):
Normalize the image again with the standard deviation only for image classification if set to True.
"""

rescale_offset: bool
include_top: bool
rescale_offset: Optional[bool]
include_top: Optional[bool]


class EfficientNetImageProcessor(BaseImageProcessor):
Expand Down
4 changes: 2 additions & 2 deletions src/transformers/models/eomt/image_processing_eomt.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@ class EomtImageProcessorKwargs(ImagesKwargs):
denoted with 0 (background) will be replaced with `ignore_index`.
"""

do_split_image: bool
ignore_index: Optional[int] = None
do_split_image: Optional[bool]
ignore_index: Optional[int]


# Adapted from transformers.models.maskformer.image_processing_maskformer.convert_segmentation_map_to_binary_masks
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@


class InternVLVideoProcessorInitKwargs(VideosKwargs):
initial_shift: Union[bool, float, int]
initial_shift: Optional[Union[bool, float, int]]


class InternVLVideoProcessor(BaseVideoProcessor):
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/janus/image_processing_janus.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ class JanusImageProcessorKwargs(ImagesKwargs):
falls below this value after resizing.
"""

min_size: int
min_size: Optional[int]


class JanusImageProcessor(BaseImageProcessor):
Expand Down
4 changes: 3 additions & 1 deletion src/transformers/models/kosmos2/processing_kosmos2.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,11 @@
list[list[tuple[float, float, float]]],
]

NestedList = list[Union[Optional[int], "NestedList"]]


class Kosmos2ImagesKwargs(ImagesKwargs, total=False):
bboxes: Optional[list[float]]
bboxes: Optional[NestedList]
Comment thread
zucchini-nlp marked this conversation as resolved.
Outdated
num_image_tokens: Optional[int]
first_image_token_id: Optional[int]

Expand Down
10 changes: 6 additions & 4 deletions src/transformers/models/lfm2_vl/processing_lfm2_vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
ImagesKwargs,
ProcessingKwargs,
ProcessorMixin,
TextKwargs,
Unpack,
)
from ...tokenization_utils_base import BatchEncoding, TextInput
Expand All @@ -46,8 +47,13 @@ class Lfm2VlImagesKwargs(ImagesKwargs, total=False):
return_row_col_info: Optional[bool]


class Lfm2VlTextKwargs(TextKwargs, total=False):
use_image_special_tokens: Optional[bool]


class Lfm2VlProcessorKwargs(ProcessingKwargs, total=False):
images_kwargs: Lfm2VlImagesKwargs
text_kwargs: Lfm2VlTextKwargs

_defaults = {
"images_kwargs": {
Expand Down Expand Up @@ -75,8 +81,6 @@ class Lfm2VlProcessor(ProcessorMixin):
An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input.
chat_template (`str`, *optional*):
A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string.
use_image_special_tokens (`bool`, *optional*, defaults to `True`):
Whether to use image special tokens or not when processing.
"""

attributes = ["image_processor", "tokenizer"]
Expand All @@ -88,12 +92,10 @@ def __init__(
image_processor,
tokenizer,
chat_template: Optional[str] = None,
use_image_special_tokens: Optional[bool] = True,
**kwargs,
):
self.image_token = tokenizer.image_token
self.image_token_id = tokenizer.image_token_id
self.use_image_special_tokens = use_image_special_tokens
self.image_start_token = tokenizer.image_start_token
self.image_end_token = tokenizer.image_end_token
self.image_thumbnail_token = tokenizer.image_thumbnail
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,7 @@ def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaOnevisionImagePro
batch_num_images = [1] * len(images)
else:
batch_num_images = [1]
kwargs["batch_num_images"] = batch_num_images
return super().preprocess(images, **kwargs)
return super().preprocess(images, batch_num_images, **kwargs)

def _resize_for_patching(
self,
Expand Down Expand Up @@ -202,6 +201,7 @@ def _pad_for_batching(
def _preprocess(
self,
images: list["torch.Tensor"],
batch_num_images: list[int],
do_resize: bool,
size: SizeDict,
image_grid_pinpoints: list[list[int]],
Expand All @@ -214,7 +214,6 @@ def _preprocess(
image_mean: Optional[Union[float, list[float]]],
image_std: Optional[Union[float, list[float]]],
do_pad: bool,
batch_num_images: list[int],
disable_grouping: Optional[bool],
return_tensors: Optional[Union[str, TensorType]],
**kwargs,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@

from ...cache_utils import Cache
from ...image_processing_utils import BatchFeature
from ...image_processing_utils_fast import group_images_by_shape, reorder_images
from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images
from ...image_utils import (
OPENAI_CLIP_MEAN,
OPENAI_CLIP_STD,
Expand Down Expand Up @@ -128,12 +128,12 @@ def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaOnevisionImagePro
batch_num_images = [1] * len(images)
else:
batch_num_images = [1]
kwargs["batch_num_images"] = batch_num_images
return super().preprocess(images, **kwargs)
return BaseImageProcessorFast.preprocess(images, batch_num_images, **kwargs)

def _preprocess(
self,
images: list["torch.Tensor"],
batch_num_images: list[int],
do_resize: bool,
size: SizeDict,
image_grid_pinpoints: list[list[int]],
Expand All @@ -146,7 +146,6 @@ def _preprocess(
image_mean: Optional[Union[float, list[float]]],
image_std: Optional[Union[float, list[float]]],
do_pad: bool,
batch_num_images: list[int],
disable_grouping: Optional[bool],
return_tensors: Optional[Union[str, TensorType]],
**kwargs,
Expand Down
4 changes: 1 addition & 3 deletions src/transformers/models/mllama/processing_mllama.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,9 +258,7 @@ def __call__(
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
**kwargs,
)

return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
images_kwargs = output_kwargs["images_kwargs"]

data = {}
if text is not None:
Expand Down Expand Up @@ -306,7 +304,7 @@ def __call__(
)

if images is not None:
image_features = self.image_processor(images, **images_kwargs)
image_features = self.image_processor(images, **output_kwargs["images_kwargs"])
num_tiles = image_features.pop("num_tiles")
data.update(image_features)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ class PerceptionLMImageProcessorFast(BaseImageProcessorFast):
do_rescale = True
do_normalize = True
do_convert_rgb = True
vision_input_type = "thumb+tail"
vision_input_type = "thumb+tile"
tile_size = 448
max_num_tiles = 36
size = {"width": 448, "height": 448} # for backward compatibility in tests
Expand Down
4 changes: 2 additions & 2 deletions src/transformers/models/pixtral/image_processing_pixtral.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,11 @@

class PixtralImageProcessorKwargs(ImagesKwargs):
"""
patch_size (`dict[str, int]` *optional*, defaults to `{"height": 16, "width": 16}`):
patch_size (`Union[dict[str, int], int]` *optional*, defaults to `{"height": 16, "width": 16}`):
Size of the patches in the model, used to calculate the output image size. Can be overridden by `patch_size` in the `preprocess` method.
"""

patch_size: Optional[dict[str, int]]
patch_size: Optional[Union[dict[str, int], int]]


# Adapted from function in image_transforms.py to ensure any transparent pixels are converted to white.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ class Qwen2_5_OmniVideosKwargs(VideosKwargs):
max_frames: Optional[int]
use_audio_in_video: Optional[bool]
seconds_per_chunk: Optional[float]
position_id_per_seconds: Optional[int]
position_id_per_seconds: Optional[Union[int, float]]


class Qwen2_5OmniProcessorKwargs(ProcessingKwargs, total=False):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from typing import Optional
from typing import Optional, Union

import numpy as np

Expand All @@ -44,7 +44,7 @@ class Qwen3OmniMoeVideosKwargs(VideosKwargs):
max_frames: Optional[int]
use_audio_in_video: Optional[bool]
seconds_per_chunk: Optional[float]
position_id_per_seconds: Optional[int]
position_id_per_seconds: Optional[Union[int, float]]


class Qwen3OmniMoeProcessorKwargs(ProcessingKwargs, total=False):
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/udop/processing_udop.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@

class UdopTextKwargs(TextKwargs, total=False):
word_labels: Optional[Union[list[int], list[list[int]]]]
boxes: Union[list[list[int]], list[list[list[int]]]]
boxes: Optional[Union[list[list[int]], list[list[list[int]]]]]


class UdopProcessorKwargs(ProcessingKwargs, total=False):
Expand Down
Loading