From b85036f9a12b6ce29f823e31ca668dc4dc2b20c8 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Mon, 3 Jun 2024 09:38:41 +0200 Subject: [PATCH 01/45] add initial design for uniform processors + align model --- .../models/align/processing_align.py | 163 ++++++++++++-- src/transformers/processing_utils.py | 202 +++++++++++++++++- src/transformers/tokenization_utils_base.py | 2 + tests/models/align/test_processor_align.py | 1 - tests/test_image_processing_common.py | 1 + 5 files changed, 343 insertions(+), 26 deletions(-) diff --git a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py index 3bc97afd1ca5..0ddc50c094c0 100644 --- a/src/transformers/models/align/processing_align.py +++ b/src/transformers/models/align/processing_align.py @@ -16,8 +16,66 @@ Image/Text processor class for ALIGN """ -from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import BatchEncoding +from typing import List, Union + +from ...image_utils import ImageInput +from ...processing_utils import ( + CommonKwargs, + ImagesKwargs, + ProcessingKwargs, + ProcessorMixin, + TextKwargs, +) +from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput +from ...utils import is_torch_available, is_vision_available + + +# TODO (@molbap) This is a bother, forward references from TypedDict are resolved and need this to work +if is_vision_available(): + import PIL # noqa: F401 +if is_torch_available(): + import torch # noqa: F401 + + +class AlignProcessorKwargs(ProcessingKwargs, total=False): + """ + Inherits from `ProcessingKwargs` to provide: + 1) Additional keys that this model requires to process inputs. + 2) Default values for extra keys. + New keys have to be defined as follows to ensure type hinting is done correctly. 
+ + ```python + common_kwargs: CommonKwargs = { + **CommonKwargs.__annotations__, + } + text_kwargs: TextKwargs = { + **TextKwargs.__annotations__, + "a_new_text_boolean_key": Optional[bool], + } + images_kwargs: ImagesKwargs = { + **ImagesKwargs.__annotations__, + "a_new_image_processing_key": Optional[int] + } + ``` + + """ + + common_kwargs: CommonKwargs = { + **CommonKwargs.__annotations__, + } + text_kwargs: TextKwargs = { + **TextKwargs.__annotations__, + } + images_kwargs: ImagesKwargs = { + **ImagesKwargs.__annotations__, + } + + _defaults = { + "text_kwargs": { + "padding": "max_length", + "max_length": 64, + }, + } class AlignProcessor(ProcessorMixin): @@ -26,12 +84,39 @@ class AlignProcessor(ProcessorMixin): [`BertTokenizer`]/[`BertTokenizerFast`] into a single processor that interits both the image processor and tokenizer functionalities. See the [`~AlignProcessor.__call__`] and [`~OwlViTProcessor.decode`] for more information. + The preferred way of passing kwargs is as a dictionary per modality, see usage example below. + ```python + from transformers import AlignProcessor + from PIL import Image + model_id = "kakaobrain/align-base" + processor = AlignProcessor.from_pretrained(model_id) + + # Define the kwargs for each modality + common_kwargs = {"return_tensors": "pt"} + images_kwargs = {"crop_size": {"height": 224, "width": 224}} + text_kwargs = {"padding": "do_not_pad"} + + # Combine them into a single dictionary + + all_kwargs = { + "images_kwargs": images_kwargs, + "text_kwargs": text_kwargs, + "common_kwargs": common_kwargs + } + + processor(images=your_pil_image, text=["What is that?"], **all_kwargs) + + # passing directly any number of kwargs is also supported, but not recommended + + processor(images=your_pil_image, text=["What is that?"], padding="do_not_pad) + ``` Args: image_processor ([`EfficientNetImageProcessor`]): The image processor is a required input. 
tokenizer ([`BertTokenizer`, `BertTokenizerFast`]): The tokenizer is a required input. + """ attributes = ["image_processor", "tokenizer"] @@ -41,11 +126,21 @@ class AlignProcessor(ProcessorMixin): def __init__(self, image_processor, tokenizer): super().__init__(image_processor, tokenizer) - def __call__(self, text=None, images=None, padding="max_length", max_length=64, return_tensors=None, **kwargs): + def __call__( + self, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + images: ImageInput = None, + audio=None, + videos=None, + text_kwargs: AlignProcessorKwargs.text_kwargs = {}, + images_kwargs: AlignProcessorKwargs.images_kwargs = {}, + common_kwargs: AlignProcessorKwargs.common_kwargs = {}, + **kwargs: AlignProcessorKwargs, + ) -> BatchEncoding: """ Main method to prepare text(s) and image(s) to be fed as input to the model. This method forwards the `text` - and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode - the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to + arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode + the text. To prepare the image(s), this method forwards the `images` arguments to EfficientNetImageProcessor's [`~EfficientNetImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring of the above two methods for more information. @@ -57,20 +152,12 @@ def __call__(self, text=None, images=None, padding="max_length", max_length=64, images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch tensor. Both channels-first and channels-last formats are supported. 
- padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `max_length`): - Activates and controls padding for tokenization of input text. Choose between [`True` or `'longest'`, - `'max_length'`, `False` or `'do_not_pad'`] - max_length (`int`, *optional*, defaults to `max_length`): - Maximum padding value to use to pad the input text during tokenization. - return_tensors (`str` or [`~utils.TensorType`], *optional*): If set, will return tensors of a particular framework. Acceptable values are: - - - `'tf'`: Return TensorFlow `tf.constant` objects. - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - - `'jax'`: Return JAX `jnp.ndarray` objects. - + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. Returns: [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: @@ -81,15 +168,45 @@ def __call__(self, text=None, images=None, padding="max_length", max_length=64, - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. """ if text is None and images is None: - raise ValueError("You have to specify either text or images. 
Both cannot be none.") - + raise ValueError("You must specify either text or images.") + # Init with default values if they exist + text_kwargs = AlignProcessorKwargs._defaults.get("text_kwargs", {}).copy() + + # then override with tokenizer-level arguments passed + text_kwargs.update( + {k: v for k, v in self.tokenizer.init_kwargs.items() if k in AlignProcessorKwargs.text_kwargs} + ) + # then get passed per-modality dictionaries if they exist + text_kwargs.update(kwargs.pop("text_kwargs", {})) + images_kwargs.update(kwargs.pop("images_kwargs", {})) + common_kwargs.update(kwargs.pop("common_kwargs", {})) + # then merge kwargs by name + for text_key in AlignProcessorKwargs.text_kwargs.keys(): + text_kwarg_value = kwargs.pop(text_key, None) + if text_kwarg_value is not None: + text_kwargs[text_key] = text_kwarg_value + + for images_key in AlignProcessorKwargs.images_kwargs.keys(): + images_kwarg_value = kwargs.pop(images_key, None) + if images_kwarg_value is not None: + images_kwargs[images_key] = images_kwarg_value + # if something remains in kwargs, it belongs to common + common_kwargs.update(kwargs) + + # all modality-specific kwargs are updated with common kwargs + text_kwargs.update(common_kwargs) + images_kwargs.update(common_kwargs) + + # then, we can pass correct kwargs to each processor if text is not None: - encoding = self.tokenizer( - text, padding=padding, max_length=max_length, return_tensors=return_tensors, **kwargs - ) + encoding = self.tokenizer(text, **text_kwargs) if images is not None: - image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs) + image_features = self.image_processor(images, **images_kwargs) + + # BC for explicit return_tensors + if "return_tensors" in common_kwargs: + return_tensors = common_kwargs.pop("return_tensors", None) if text is not None and images is not None: encoding["pixel_values"] = image_features.pixel_values diff --git a/src/transformers/processing_utils.py 
b/src/transformers/processing_utils.py index d76fa4dccccf..df6a178bb73f 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -22,13 +22,27 @@ import os import warnings from pathlib import Path -from typing import Any, Dict, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, TypedDict, Union + +import numpy as np from .dynamic_module_utils import custom_object_save -from .tokenization_utils_base import PreTrainedTokenizerBase +from .image_utils import ChannelDimension, is_vision_available + + +if is_vision_available(): + from .image_utils import PILImageResampling + + +from .tokenization_utils_base import ( + PaddingStrategy, + PreTrainedTokenizerBase, + TruncationStrategy, +) from .utils import ( PROCESSOR_NAME, PushToHubMixin, + TensorType, add_model_info_to_auto_map, add_model_info_to_custom_pipelines, cached_file, @@ -54,6 +68,190 @@ } +class TextKwargs(TypedDict, total=False): + """ + Keyword arguments for text processing. For extended documentation, check out tokenization_utils_base methods and + docstrings associated. + + Attributes: + add_special_tokens (`bool`, *optional*, defaults to `True`): + Whether or not to add special tokens when encoding the sequences. + padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`): + Activates and controls padding. + truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`): + Activates and controls truncation. + max_length (`int`, *optional*): + Controls the maximum length to use by one of the truncation/padding parameters. + stride (`int`, *optional*, defaults to 0): + If set, the overflowing tokens will contain some tokens from the end of the truncated sequence. + is_split_into_words (`bool`, *optional*, defaults to `False`): + Whether or not the input is already pre-tokenized. 
+ pad_to_multiple_of (`int`, *optional*): + If set, will pad the sequence to a multiple of the provided value. + return_token_type_ids (`bool`, *optional*): + Whether to return token type IDs. + return_attention_mask (`bool`, *optional*): + Whether to return the attention mask. + return_overflowing_tokens (`bool`, *optional*, defaults to `False`): + Whether or not to return overflowing token sequences. + return_special_tokens_mask (`bool`, *optional*, defaults to `False`): + Whether or not to return special tokens mask information. + return_offsets_mapping (`bool`, *optional*, defaults to `False`): + Whether or not to return `(char_start, char_end)` for each token. + return_length (`bool`, *optional*, defaults to `False`): + Whether or not to return the lengths of the encoded inputs. + verbose (`bool`, *optional*, defaults to `True`): + Whether or not to print more information and warnings. + padding_side (`str`, *optional*): + The side on which padding will be applied. + """ + + add_special_tokens: Optional[bool] + padding: Union[bool, str, PaddingStrategy] + truncation: Union[bool, str, TruncationStrategy] + max_length: Optional[int] + stride: Optional[int] + is_split_into_words: Optional[bool] + pad_to_multiple_of: Optional[int] + return_token_type_ids: Optional[bool] + return_attention_mask: Optional[bool] + return_overflowing_tokens: Optional[bool] + return_special_tokens_mask: Optional[bool] + return_offsets_mapping: Optional[bool] + return_length: Optional[bool] + verbose: Optional[bool] + padding_side: Optional[str] + + +class ImagesKwargs(TypedDict, total=False): + """ + Keyword arguments for image processing. For extended documentation, check the appropriate ImageProcessor + class methods and docstrings. + + Attributes: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `{'shortest_edge': 288}`): + Resize the shorter side of the input to `size["shortest_edge"]`. 
+ size_divisor (`int`, *optional*, defaults to 32): + The size by which to make sure both the height and width can be divided. + crop_size (`Dict[str, int]`, *optional*): + Desired output size when applying center-cropping. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use if resizing the image. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. + do_pad (`bool`, *optional*, defaults to `True`): + Whether to pad the image to the `(max_height, max_width)` of the images in the batch. + do_center_crop (`bool`, *optional*, defaults to `True`): + Whether to center crop the image. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `"channels_first"`): + The channel dimension format for the output image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. 
+ """ + + do_resize: Optional[bool] + size: Optional[Dict[str, int]] + size_divisor: Optional[int] + crop_size: Optional[Dict[str, int]] + resample: Optional[Union["PILImageResampling", int]] + do_rescale: Optional[bool] + rescale_factor: Optional[float] + do_normalize: Optional[bool] + image_mean: Optional[Union[float, List[float]]] + image_std: Optional[Union[float, List[float]]] + do_pad: Optional[bool] + do_center_crop: Optional[bool] + data_format: Optional[ChannelDimension] + input_data_format: Optional[Union[str, ChannelDimension]] + + +class VideosKwargs(TypedDict, total=False): + """ + Keyword arguments for video processing. + + Attributes: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image. + # ... (Add docstrings for other videos_kwargs) + """ + + do_resize: Optional[bool] + size: Optional[Dict[str, int]] + size_divisor: Optional[int] + resample: Optional["PILImageResampling"] + do_rescale: Optional[bool] + rescale_factor: Optional[float] + do_normalize: Optional[bool] + image_mean: Optional[Union[float, List[float]]] + image_std: Optional[Union[float, List[float]]] + do_pad: Optional[bool] + do_center_crop: Optional[bool] + data_format: Optional[ChannelDimension] + input_data_format: Optional[Union[str, ChannelDimension]] + + +class AudioKwargs(TypedDict, total=False): + """ + Keyword arguments for audio processing. + + Attributes: + sampling_rate (`int`, *optional*): + The sampling rate at which the `raw_speech` input was sampled. + raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`): + The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float + values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not + stereo, i.e. single float per timestep. 
+ padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*): + Select a strategy to pad the returned sequences (according to the model's padding side and padding + index) among: + + - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different + lengths). + max_length (`int`, *optional*): + Maximum length of the returned list and optionally padding length (see above). + truncation (`bool`): + Activates truncation to cut input sequences longer than *max_length* to *max_length*. + pad_to_multiple_of (`int`, *optional*): + If set, will pad the sequence to a multiple of the provided value. + return_attention_mask (`bool`, *optional*, defaults to `False`): + Whether or not [`~ASTFeatureExtractor.__call__`] should return `attention_mask`. + """ + + sampling_rate: Optional[int] + raw_speech: Optional[Union["np.ndarray", List[float], List["np.ndarray"], List[List[float]]]] + padding: Optional[Union[bool, str, PaddingStrategy]] + max_length: Optional[int] + truncation: Optional[bool] + pad_to_multiple_of: Optional[int] + return_attention_mask: Optional[bool] + + +class CommonKwargs(TypedDict, total=False): + return_tensors: Optional[Union[str, TensorType]] + + +class ProcessingKwargs(TypedDict, total=False): + common_kwargs: CommonKwargs + text_kwargs: TextKwargs + images_kwargs: ImagesKwargs + audio_kwargs: AudioKwargs + videos_kwargs: VideosKwargs + + class ProcessorMixin(PushToHubMixin): """ This is a mixin used to provide saving/loading functionality for all processor classes. 
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 116fbfdf7bbb..9d401501885e 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -126,6 +126,8 @@ class EncodingFast: PreTokenizedInputPair = Tuple[List[str], List[str]] EncodedInputPair = Tuple[List[int], List[int]] +# Define type aliases for text-related non-text modalities +AudioInput = Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput], List[float]] # Slow tokenizers used to be saved in three separated files SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json" diff --git a/tests/models/align/test_processor_align.py b/tests/models/align/test_processor_align.py index 12fbea5a50cd..9c5f4cccca88 100644 --- a/tests/models/align/test_processor_align.py +++ b/tests/models/align/test_processor_align.py @@ -159,7 +159,6 @@ def test_tokenizer(self): encoded_processor = processor(text=input_str) encoded_tok = tokenizer(input_str, padding="max_length", max_length=64) - for key in encoded_tok.keys(): self.assertListEqual(encoded_tok[key], encoded_processor[key]) diff --git a/tests/test_image_processing_common.py b/tests/test_image_processing_common.py index 90c1a4e7e127..074da8a2bbab 100644 --- a/tests/test_image_processing_common.py +++ b/tests/test_image_processing_common.py @@ -291,6 +291,7 @@ def test_call_numpy_4_channels(self): ) def test_image_processor_preprocess_arguments(self): + # Test that an instantiated image processor is called with the correct arg spec image_processor = self.image_processing_class(**self.image_processor_dict) if hasattr(image_processor, "_valid_processor_keys") and hasattr(image_processor, "preprocess"): preprocess_parameter_names = inspect.getfullargspec(image_processor.preprocess).args From bb8ac70e1f27de6c7c8e85975dbf3fb248a2c606 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Mon, 3 Jun 2024 10:58:45 +0200 Subject: [PATCH 02/45] fix mutable 
default :eyes: --- .../models/align/processing_align.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py index 0ddc50c094c0..431d2fec625f 100644 --- a/src/transformers/models/align/processing_align.py +++ b/src/transformers/models/align/processing_align.py @@ -132,9 +132,9 @@ def __call__( images: ImageInput = None, audio=None, videos=None, - text_kwargs: AlignProcessorKwargs.text_kwargs = {}, - images_kwargs: AlignProcessorKwargs.images_kwargs = {}, - common_kwargs: AlignProcessorKwargs.common_kwargs = {}, + text_kwargs: AlignProcessorKwargs.text_kwargs = None, + images_kwargs: AlignProcessorKwargs.images_kwargs = None, + common_kwargs: AlignProcessorKwargs.common_kwargs = None, **kwargs: AlignProcessorKwargs, ) -> BatchEncoding: """ @@ -169,6 +169,15 @@ def __call__( """ if text is None and images is None: raise ValueError("You must specify either text or images.") + + # set kwargs as empty dicts to avoid default mutable + if text_kwargs is None: + text_kwargs = {} + if images_kwargs is None: + images_kwargs = {} + if common_kwargs is None: + common_kwargs = {} + # Init with default values if they exist text_kwargs = AlignProcessorKwargs._defaults.get("text_kwargs", {}).copy() @@ -177,6 +186,7 @@ def __call__( {k: v for k, v in self.tokenizer.init_kwargs.items() if k in AlignProcessorKwargs.text_kwargs} ) # then get passed per-modality dictionaries if they exist + text_kwargs.update(kwargs.pop("text_kwargs", {})) images_kwargs.update(kwargs.pop("images_kwargs", {})) common_kwargs.update(kwargs.pop("common_kwargs", {})) From cd8c6018361a7604f063590a04bf10e05ff1cd72 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Mon, 3 Jun 2024 10:59:15 +0200 Subject: [PATCH 03/45] add configuration test --- tests/models/align/test_processor_align.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git 
a/tests/models/align/test_processor_align.py b/tests/models/align/test_processor_align.py index 9c5f4cccca88..eaa54f911292 100644 --- a/tests/models/align/test_processor_align.py +++ b/tests/models/align/test_processor_align.py @@ -204,3 +204,16 @@ def test_model_input_names(self): inputs = processor(text=input_str, images=image_input) self.assertListEqual(list(inputs.keys()), processor.model_input_names) + + def test_defaults_preserved(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer(max_length=117) + + processor = AlignProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + + self.assertEqual(len(inputs["input_ids"]), 117) From f00c85277dd083cdd259727cffd679b3c98e6731 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Mon, 3 Jun 2024 12:28:41 +0200 Subject: [PATCH 04/45] handle structured kwargs w defaults + add test --- .../models/align/processing_align.py | 10 ++++---- tests/models/align/test_processor_align.py | 23 +++++++++++++++++++ 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py index 431d2fec625f..e8c4857814bd 100644 --- a/src/transformers/models/align/processing_align.py +++ b/src/transformers/models/align/processing_align.py @@ -169,7 +169,6 @@ def __call__( """ if text is None and images is None: raise ValueError("You must specify either text or images.") - # set kwargs as empty dicts to avoid default mutable if text_kwargs is None: text_kwargs = {} @@ -177,19 +176,18 @@ def __call__( images_kwargs = {} if common_kwargs is None: common_kwargs = {} - # Init with default values if they exist - text_kwargs = AlignProcessorKwargs._defaults.get("text_kwargs", {}).copy() + default_text_kwargs = AlignProcessorKwargs._defaults.get("text_kwargs", {}).copy() # then 
override with tokenizer-level arguments passed - text_kwargs.update( + default_text_kwargs.update( {k: v for k, v in self.tokenizer.init_kwargs.items() if k in AlignProcessorKwargs.text_kwargs} ) # then get passed per-modality dictionaries if they exist - - text_kwargs.update(kwargs.pop("text_kwargs", {})) + text_kwargs = {**default_text_kwargs, **text_kwargs, **kwargs.pop("text_kwargs", {})} images_kwargs.update(kwargs.pop("images_kwargs", {})) common_kwargs.update(kwargs.pop("common_kwargs", {})) + # then merge kwargs by name for text_key in AlignProcessorKwargs.text_kwargs.keys(): text_kwarg_value = kwargs.pop(text_key, None) diff --git a/tests/models/align/test_processor_align.py b/tests/models/align/test_processor_align.py index eaa54f911292..86bc19817acd 100644 --- a/tests/models/align/test_processor_align.py +++ b/tests/models/align/test_processor_align.py @@ -217,3 +217,26 @@ def test_defaults_preserved(self): inputs = processor(text=input_str, images=image_input) self.assertEqual(len(inputs["input_ids"]), 117) + + def test_structured_kwargs(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = AlignProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + # Define the kwargs for each modality + common_kwargs = {"return_tensors": "pt"} + images_kwargs = {"crop_size": {"height": 214, "width": 214}} + text_kwargs = {"padding": "max_length", "max_length": 76} + + # Combine them into a single dictionary + all_kwargs = {"images_kwargs": images_kwargs, "text_kwargs": text_kwargs, "common_kwargs": common_kwargs} + + inputs = processor(text=input_str, images=image_input, **all_kwargs) + print(inputs["input_ids"]) + self.assertEquals(inputs["pixel_values"].shape[2], 214) + + self.assertEqual(len(inputs["input_ids"][0]), 76) From 693036f94411cddf0451ea1a61a84747ab341b06 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Mon, 3 Jun 
2024 13:48:57 +0200 Subject: [PATCH 05/45] protect torch-specific test --- tests/models/align/test_processor_align.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/models/align/test_processor_align.py b/tests/models/align/test_processor_align.py index 86bc19817acd..823824494028 100644 --- a/tests/models/align/test_processor_align.py +++ b/tests/models/align/test_processor_align.py @@ -23,7 +23,7 @@ from transformers import BertTokenizer, BertTokenizerFast from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES -from transformers.testing_utils import require_vision +from transformers.testing_utils import require_vision, require_torch from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available @@ -218,6 +218,7 @@ def test_defaults_preserved(self): self.assertEqual(len(inputs["input_ids"]), 117) + @require_torch def test_structured_kwargs(self): image_processor = self.get_image_processor() tokenizer = self.get_tokenizer() From 766da3a6f82f7ffd0a5519635cfeb27f91576398 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Mon, 3 Jun 2024 13:49:18 +0200 Subject: [PATCH 06/45] fix style --- tests/models/align/test_processor_align.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/align/test_processor_align.py b/tests/models/align/test_processor_align.py index 823824494028..aab99b6a3254 100644 --- a/tests/models/align/test_processor_align.py +++ b/tests/models/align/test_processor_align.py @@ -23,7 +23,7 @@ from transformers import BertTokenizer, BertTokenizerFast from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES -from transformers.testing_utils import require_vision, require_torch +from transformers.testing_utils import require_torch, require_vision from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available From 844394de7e5cd93c4ce5d9fe535467ac964a021e Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Mon, 3 Jun 2024 14:11:22 +0200 Subject: [PATCH 
07/45] fix --- tests/models/align/test_processor_align.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/models/align/test_processor_align.py b/tests/models/align/test_processor_align.py index aab99b6a3254..5332f1469333 100644 --- a/tests/models/align/test_processor_align.py +++ b/tests/models/align/test_processor_align.py @@ -237,7 +237,6 @@ def test_structured_kwargs(self): all_kwargs = {"images_kwargs": images_kwargs, "text_kwargs": text_kwargs, "common_kwargs": common_kwargs} inputs = processor(text=input_str, images=image_input, **all_kwargs) - print(inputs["input_ids"]) self.assertEquals(inputs["pixel_values"].shape[2], 214) self.assertEqual(len(inputs["input_ids"][0]), 76) From c19bbc6f5e00d6e8e2c48e034295e5b873a26121 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Tue, 4 Jun 2024 13:26:42 +0200 Subject: [PATCH 08/45] fix assertEqual --- tests/models/align/test_processor_align.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/align/test_processor_align.py b/tests/models/align/test_processor_align.py index 5332f1469333..70349cf8598f 100644 --- a/tests/models/align/test_processor_align.py +++ b/tests/models/align/test_processor_align.py @@ -237,6 +237,6 @@ def test_structured_kwargs(self): all_kwargs = {"images_kwargs": images_kwargs, "text_kwargs": text_kwargs, "common_kwargs": common_kwargs} inputs = processor(text=input_str, images=image_input, **all_kwargs) - self.assertEquals(inputs["pixel_values"].shape[2], 214) + self.assertEqual(inputs["pixel_values"].shape[2], 214) self.assertEqual(len(inputs["input_ids"][0]), 76) From 3c38119854061131a377c07353202d1dbadce096 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Tue, 4 Jun 2024 13:27:28 +0200 Subject: [PATCH 09/45] move kwargs merging to processing common --- .../models/align/processing_align.py | 50 ++------- src/transformers/processing_utils.py | 105 ++++++++++++++++++ 2 files changed, 116 insertions(+), 39 deletions(-) diff --git 
a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py index e8c4857814bd..f382e2b90bca 100644 --- a/src/transformers/models/align/processing_align.py +++ b/src/transformers/models/align/processing_align.py @@ -169,52 +169,24 @@ def __call__( """ if text is None and images is None: raise ValueError("You must specify either text or images.") - # set kwargs as empty dicts to avoid default mutable - if text_kwargs is None: - text_kwargs = {} - if images_kwargs is None: - images_kwargs = {} - if common_kwargs is None: - common_kwargs = {} - # Init with default values if they exist - default_text_kwargs = AlignProcessorKwargs._defaults.get("text_kwargs", {}).copy() - - # then override with tokenizer-level arguments passed - default_text_kwargs.update( - {k: v for k, v in self.tokenizer.init_kwargs.items() if k in AlignProcessorKwargs.text_kwargs} + output_kwargs = self._merge_kwargs( + AlignProcessorKwargs, + text_kwargs=text_kwargs, + images_kwargs=images_kwargs, + common_kwargs=common_kwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, ) - # then get passed per-modality dictionaries if they exist - text_kwargs = {**default_text_kwargs, **text_kwargs, **kwargs.pop("text_kwargs", {})} - images_kwargs.update(kwargs.pop("images_kwargs", {})) - common_kwargs.update(kwargs.pop("common_kwargs", {})) - - # then merge kwargs by name - for text_key in AlignProcessorKwargs.text_kwargs.keys(): - text_kwarg_value = kwargs.pop(text_key, None) - if text_kwarg_value is not None: - text_kwargs[text_key] = text_kwarg_value - - for images_key in AlignProcessorKwargs.images_kwargs.keys(): - images_kwarg_value = kwargs.pop(images_key, None) - if images_kwarg_value is not None: - images_kwargs[images_key] = images_kwarg_value - # if something remains in kwargs, it belongs to common - common_kwargs.update(kwargs) - - # all modality-specific kwargs are updated with common kwargs - text_kwargs.update(common_kwargs) - 
images_kwargs.update(common_kwargs) - # then, we can pass correct kwargs to each processor if text is not None: - encoding = self.tokenizer(text, **text_kwargs) + encoding = self.tokenizer(text, **output_kwargs["text_kwargs"]) if images is not None: - image_features = self.image_processor(images, **images_kwargs) + image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) # BC for explicit return_tensors - if "return_tensors" in common_kwargs: - return_tensors = common_kwargs.pop("return_tensors", None) + if "return_tensors" in output_kwargs["common_kwargs"]: + return_tensors = output_kwargs["common_kwargs"].pop("return_tensors", None) if text is not None and images is not None: encoding["pixel_values"] = image_features.pixel_values diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index df6a178bb73f..0390b1c25c42 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -607,6 +607,111 @@ def from_args_and_dict(cls, args, processor_dict: Dict[str, Any], **kwargs): else: return processor + def _merge_kwargs( + self, + ModelProcessorKwargs: ProcessingKwargs, + text_kwargs: Optional[TextKwargs] = None, + images_kwargs: Optional[ImagesKwargs] = None, + common_kwargs: Optional[CommonKwargs] = None, + videos_kwargs: Optional[VideosKwargs] = None, + audio_kwargs: Optional[AudioKwargs] = None, + tokenizer_init_kwargs: Optional[Dict] = None, + **kwargs, + ) -> Dict[str, Dict]: + """ + Method to merge dictionaries of kwargs cleanly separated by modality within a Processor instance. + The order of operations is as follows: + 1) kwargs passed as before have highest priority to preserve BC. They mix modalities and may not result in + correct behaviour. 
+ ```python + high_priority_kwargs = {"crop_size" = (224, 224), "padding" = "max_length"} + processor(..., **high_priority_kwargs) + ``` + 2) kwargs specified as a dictionary and passed to the processor __call__ have second highest priority. + This is the recommended API. + ```python + recommended_priority_kwargs = {"text_kwargs": {"padding":"max_length"}, "images_kwargs": {"crop_size": (224, 224)}} + processor(..., **recommended_priority_kwargs) + ``` + 3) kwargs passed as modality-specific kwargs have third priority. + ```python + processor(..., text_kwargs={"padding": "max_length"}, images_kwargs={"crop_size": (224, 224)}}) + ``` + 4) kwargs passed during instantiation of a modality processor have fourth priority. + ```python + tokenizer = tokenizer_class(..., {"padding": "max_length"}) + image_processor = image_processor_class(...) + processor(tokenizer, image_processor) # will pass max_length unless overriden by kwargs at call + ``` + 5) defaults kwargs specified at processor level have lowest priority. + + Args: + ModelProcessorKwargs (`ProcessingKwargs`): + Typed dictionary of kwargs specifically required by the model passed. + text_kwargs (`TextKwargs`, *optional*): + Typed dictionary of kwargs inputs applied to the text modality processor, i.e. the tokenizer. + images_kwargs (`ImagesKwargs`, *optional*): + Typed dictionary of kwargs inputs applied to the images modality processor. + videos_kwargs (`VideosKwargs`, *optional*): + Typed dictionary of kwargs inputs applied to the videos modality processor. + audio_kwargs (`AudioKwargs`, *optional*): + Typed dictionary of kwargs inputs applied to the audio modality processor. + tokenizer_init_kwargs (`Dict`, *optional*): + Dictionary of kwargs the tokenizer was instantiated with and need to take precedence over other kwargs. + + Returns: + output_kwargs (`Dict`): + Dictionary of per-modality kwargs to be passed to each modality-specific processor. 
+ + """ + + # Initialize dictionaries + output_kwargs = { + "text_kwargs": text_kwargs or {}, + "images_kwargs": images_kwargs or {}, + "audio_kwargs": audio_kwargs or {}, + "videos_kwargs": videos_kwargs or {}, + "common_kwargs": common_kwargs or {}, + } + + default_kwargs = { + "text_kwargs": {}, + "images_kwargs": {}, + "audio_kwargs": {}, + "videos_kwargs": {}, + "common_kwargs": {}, + } + + # get defaults from set model processor kwargs if they exist + for modality in default_kwargs: + default_kwargs[modality] = ModelProcessorKwargs._defaults.get(modality, {}).copy() + # then override with tokenizer-level arguments passed + if tokenizer_init_kwargs: + default_kwargs["text_kwargs"].update( + {k: v for k, v in tokenizer_init_kwargs.items() if k in ModelProcessorKwargs.text_kwargs} + ) + + # then get passed per-modality dictionaries if they exist + for modality in output_kwargs: + output_kwargs[modality] = { + **default_kwargs[modality], + **output_kwargs[modality], + **kwargs.pop(modality, {}), + } + # then merge kwargs by name + for modality_key in ModelProcessorKwargs[modality].__annotations__.keys(): + modality_kwarg_value = kwargs.pop(modality_key, None) + if modality_kwarg_value is not None: + output_kwargs[modality] = modality_kwarg_value + + # if something remains in kwargs, it belongs to common + output_kwargs["common_kwargs"].update(kwargs) + # all modality-specific kwargs are updated with common kwargs + for modality in output_kwargs: + output_kwargs[modality].update(output_kwargs["common_kwargs"]) + + return output_kwargs + @classmethod def from_pretrained( cls, From 81ae819d802d6a8e6523e1f6649d172eff9e4d75 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Wed, 5 Jun 2024 18:12:39 +0200 Subject: [PATCH 10/45] rework kwargs for type hinting --- .../models/align/processing_align.py | 48 ++++-------- src/transformers/processing_utils.py | 75 +++++++++---------- tests/models/align/test_processor_align.py | 48 ++++++++++-- 3 files changed, 91 
insertions(+), 80 deletions(-) diff --git a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py index f382e2b90bca..9934384e2b07 100644 --- a/src/transformers/models/align/processing_align.py +++ b/src/transformers/models/align/processing_align.py @@ -16,7 +16,7 @@ Image/Text processor class for ALIGN """ -from typing import List, Union +from typing import List, Union, Unpack from ...image_utils import ImageInput from ...processing_utils import ( @@ -37,7 +37,7 @@ import torch # noqa: F401 -class AlignProcessorKwargs(ProcessingKwargs, total=False): +class AlignProcessorKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwargs, total=False): """ Inherits from `ProcessingKwargs` to provide: 1) Additional keys that this model requires to process inputs. @@ -45,31 +45,18 @@ class AlignProcessorKwargs(ProcessingKwargs, total=False): New keys have to be defined as follows to ensure type hinting is done correctly. ```python - common_kwargs: CommonKwargs = { - **CommonKwargs.__annotations__, - } - text_kwargs: TextKwargs = { - **TextKwargs.__annotations__, - "a_new_text_boolean_key": Optional[bool], - } - images_kwargs: ImagesKwargs = { - **ImagesKwargs.__annotations__, - "a_new_image_processing_key": Optional[int] - } - ``` + images_kwargs: ImagesKwargs = {"new_image_kwarg": Optional[bool]} - """ - - common_kwargs: CommonKwargs = { - **CommonKwargs.__annotations__, - } - text_kwargs: TextKwargs = { - **TextKwargs.__annotations__, - } - images_kwargs: ImagesKwargs = { - **ImagesKwargs.__annotations__, + _defaults = { + "text_kwargs": { + "padding": "max_length", + "max_length": 64, + }, } + ``` + """ + _defaults = { "text_kwargs": { "padding": "max_length", @@ -106,9 +93,10 @@ class AlignProcessor(ProcessorMixin): processor(images=your_pil_image, text=["What is that?"], **all_kwargs) - # passing directly any number of kwargs is also supported, but not recommended + # passing directly any number of kwargs flattened is 
also supported - processor(images=your_pil_image, text=["What is that?"], padding="do_not_pad) + all_kwargs = {"return_tensors": "pt", "crop_size": {"height": 214, "width": 214}, "padding": "max_length", "max_length": 76} + processor(images=your_pil_image, text=["What is that?"], **all_kwargs) ``` Args: @@ -132,10 +120,7 @@ def __call__( images: ImageInput = None, audio=None, videos=None, - text_kwargs: AlignProcessorKwargs.text_kwargs = None, - images_kwargs: AlignProcessorKwargs.images_kwargs = None, - common_kwargs: AlignProcessorKwargs.common_kwargs = None, - **kwargs: AlignProcessorKwargs, + **kwargs: Unpack[AlignProcessorKwargs], ) -> BatchEncoding: """ Main method to prepare text(s) and image(s) to be fed as input to the model. This method forwards the `text` @@ -171,9 +156,6 @@ def __call__( raise ValueError("You must specify either text or images.") output_kwargs = self._merge_kwargs( AlignProcessorKwargs, - text_kwargs=text_kwargs, - images_kwargs=images_kwargs, - common_kwargs=common_kwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs, ) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 0390b1c25c42..49cd2a471b5b 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -244,12 +244,22 @@ class CommonKwargs(TypedDict, total=False): return_tensors: Optional[Union[str, TensorType]] -class ProcessingKwargs(TypedDict, total=False): - common_kwargs: CommonKwargs - text_kwargs: TextKwargs - images_kwargs: ImagesKwargs - audio_kwargs: AudioKwargs - videos_kwargs: VideosKwargs +class ProcessingKwargs(TextKwargs, ImagesKwargs, VideosKwargs, AudioKwargs, CommonKwargs, total=False): + common_kwargs: CommonKwargs = { + **CommonKwargs.__annotations__, + } + text_kwargs: TextKwargs = { + **TextKwargs.__annotations__, + } + images_kwargs: ImagesKwargs = { + **ImagesKwargs.__annotations__, + } + videos_kwargs: VideosKwargs = { + **VideosKwargs.__annotations__, + } + 
audio_kwargs: AudioKwargs = { + **AudioKwargs.__annotations__, + } class ProcessorMixin(PushToHubMixin): @@ -610,11 +620,6 @@ def from_args_and_dict(cls, args, processor_dict: Dict[str, Any], **kwargs): def _merge_kwargs( self, ModelProcessorKwargs: ProcessingKwargs, - text_kwargs: Optional[TextKwargs] = None, - images_kwargs: Optional[ImagesKwargs] = None, - common_kwargs: Optional[CommonKwargs] = None, - videos_kwargs: Optional[VideosKwargs] = None, - audio_kwargs: Optional[AudioKwargs] = None, tokenizer_init_kwargs: Optional[Dict] = None, **kwargs, ) -> Dict[str, Dict]: @@ -648,30 +653,21 @@ def _merge_kwargs( Args: ModelProcessorKwargs (`ProcessingKwargs`): Typed dictionary of kwargs specifically required by the model passed. - text_kwargs (`TextKwargs`, *optional*): - Typed dictionary of kwargs inputs applied to the text modality processor, i.e. the tokenizer. - images_kwargs (`ImagesKwargs`, *optional*): - Typed dictionary of kwargs inputs applied to the images modality processor. - videos_kwargs (`VideosKwargs`, *optional*): - Typed dictionary of kwargs inputs applied to the videos modality processor. - audio_kwargs (`AudioKwargs`, *optional*): - Typed dictionary of kwargs inputs applied to the audio modality processor. tokenizer_init_kwargs (`Dict`, *optional*): - Dictionary of kwargs the tokenizer was instantiated with and need to take precedence over other kwargs. + Dictionary of kwargs the tokenizer was instantiated with and need to take precedence over defaults. Returns: output_kwargs (`Dict`): Dictionary of per-modality kwargs to be passed to each modality-specific processor. 
""" - # Initialize dictionaries output_kwargs = { - "text_kwargs": text_kwargs or {}, - "images_kwargs": images_kwargs or {}, - "audio_kwargs": audio_kwargs or {}, - "videos_kwargs": videos_kwargs or {}, - "common_kwargs": common_kwargs or {}, + "text_kwargs": {}, + "images_kwargs": {}, + "audio_kwargs": {}, + "videos_kwargs": {}, + "common_kwargs": {}, } default_kwargs = { @@ -685,31 +681,28 @@ def _merge_kwargs( # get defaults from set model processor kwargs if they exist for modality in default_kwargs: default_kwargs[modality] = ModelProcessorKwargs._defaults.get(modality, {}).copy() - # then override with tokenizer-level arguments passed - if tokenizer_init_kwargs: - default_kwargs["text_kwargs"].update( - {k: v for k, v in tokenizer_init_kwargs.items() if k in ModelProcessorKwargs.text_kwargs} - ) - - # then get passed per-modality dictionaries if they exist + # update modality kwargs with passed kwargs for modality in output_kwargs: output_kwargs[modality] = { **default_kwargs[modality], - **output_kwargs[modality], - **kwargs.pop(modality, {}), } - # then merge kwargs by name - for modality_key in ModelProcessorKwargs[modality].__annotations__.keys(): - modality_kwarg_value = kwargs.pop(modality_key, None) - if modality_kwarg_value is not None: - output_kwargs[modality] = modality_kwarg_value + for modality_key in ModelProcessorKwargs.__annotations__[modality].__annotations__.keys(): + # init with tokenizer init kwargs if necessary + if modality_key in tokenizer_init_kwargs: + output_kwargs[modality][modality_key] = tokenizer_init_kwargs[modality_key] + # check if we received a structured kwarg dict or not to handle it correctly + if modality in kwargs: + kwarg_value = kwargs[modality].pop(modality_key, "__empty__") + else: + kwarg_value = kwargs.pop(modality_key, "__empty__") + if kwarg_value != "__empty__": + output_kwargs[modality][modality_key] = kwarg_value # if something remains in kwargs, it belongs to common 
output_kwargs["common_kwargs"].update(kwargs) # all modality-specific kwargs are updated with common kwargs for modality in output_kwargs: output_kwargs[modality].update(output_kwargs["common_kwargs"]) - return output_kwargs @classmethod diff --git a/tests/models/align/test_processor_align.py b/tests/models/align/test_processor_align.py index 70349cf8598f..31413f28c66d 100644 --- a/tests/models/align/test_processor_align.py +++ b/tests/models/align/test_processor_align.py @@ -205,7 +205,8 @@ def test_model_input_names(self): self.assertListEqual(list(inputs.keys()), processor.model_input_names) - def test_defaults_preserved(self): + # TODO move these tests to a common Mixin + def test_defaults_preserved_kwargs(self): image_processor = self.get_image_processor() tokenizer = self.get_tokenizer(max_length=117) @@ -218,6 +219,19 @@ def test_defaults_preserved(self): self.assertEqual(len(inputs["input_ids"]), 117) + @require_torch + def test_defaults_preserved_image_kwargs(self): + image_processor = self.get_image_processor(crop_size=(234, 234)) + tokenizer = self.get_tokenizer(max_length=117) + + processor = AlignProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + self.assertEqual(len(inputs["pixel_values"][0][0]), 234) + @require_torch def test_structured_kwargs(self): image_processor = self.get_image_processor() @@ -229,12 +243,34 @@ def test_structured_kwargs(self): image_input = self.prepare_image_inputs() # Define the kwargs for each modality - common_kwargs = {"return_tensors": "pt"} - images_kwargs = {"crop_size": {"height": 214, "width": 214}} - text_kwargs = {"padding": "max_length", "max_length": 76} + all_kwargs = { + "return_tensors": "pt", + "crop_size": {"height": 214, "width": 214}, + "padding": "max_length", + "max_length": 76, + } + + inputs = processor(text=input_str, images=image_input, **all_kwargs) + 
self.assertEqual(inputs["pixel_values"].shape[2], 214) - # Combine them into a single dictionary - all_kwargs = {"images_kwargs": images_kwargs, "text_kwargs": text_kwargs, "common_kwargs": common_kwargs} + self.assertEqual(len(inputs["input_ids"][0]), 76) + + @require_torch + def test_structured_kwargs_nested(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = AlignProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "images_kwargs": {"crop_size": {"height": 214, "width": 214}}, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } inputs = processor(text=input_str, images=image_input, **all_kwargs) self.assertEqual(inputs["pixel_values"].shape[2], 214) From ce4abcdcc47bafc221e9554bf52dfe00d4ae79bb Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Fri, 7 Jun 2024 13:39:11 +0200 Subject: [PATCH 11/45] just get Unpack from extensions --- src/transformers/models/align/processing_align.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py index 9934384e2b07..a20f2d027aa5 100644 --- a/src/transformers/models/align/processing_align.py +++ b/src/transformers/models/align/processing_align.py @@ -16,7 +16,13 @@ Image/Text processor class for ALIGN """ -from typing import List, Union, Unpack +from typing import List, Union + + +try: + from typing import Unpack +except ImportError: + from typing_extensions import Unpack from ...image_utils import ImageInput from ...processing_utils import ( From 3acdf289630b289efd9657ef6c5574eb86e3107c Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Fri, 7 Jun 2024 13:40:15 +0200 Subject: [PATCH 12/45] run-slow[align] From 404239fb63aeeeb4016d30db8deb6ebcf5c5faf7 
Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Fri, 7 Jun 2024 15:20:29 +0200 Subject: [PATCH 13/45] handle kwargs passed as nested dict --- src/transformers/processing_utils.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 49cd2a471b5b..383788275fff 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -698,8 +698,14 @@ def _merge_kwargs( if kwarg_value != "__empty__": output_kwargs[modality][modality_key] = kwarg_value - # if something remains in kwargs, it belongs to common - output_kwargs["common_kwargs"].update(kwargs) + # if something remains in kwargs, it belongs to common after flattening + if set(kwargs) & set(default_kwargs): + # here kwargs is dictionary-based since it shares keys with default set + [output_kwargs["common_kwargs"].update(subdict) for _, subdict in kwargs.items()] + else: + # here it's a flat dict + output_kwargs["common_kwargs"].update(kwargs) + # all modality-specific kwargs are updated with common kwargs for modality in output_kwargs: output_kwargs[modality].update(output_kwargs["common_kwargs"]) From 603be40f520ac555428c50d99b958de84e7087c8 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Fri, 7 Jun 2024 15:21:18 +0200 Subject: [PATCH 14/45] add from_pretrained test for nested kwargs handling --- tests/models/align/test_processor_align.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/models/align/test_processor_align.py b/tests/models/align/test_processor_align.py index 31413f28c66d..3108e48cadf9 100644 --- a/tests/models/align/test_processor_align.py +++ b/tests/models/align/test_processor_align.py @@ -261,7 +261,24 @@ def test_structured_kwargs_nested(self): tokenizer = self.get_tokenizer() processor = AlignProcessor(tokenizer=tokenizer, image_processor=image_processor) + input_str = "lower newer" + image_input = self.prepare_image_inputs() + # 
Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "images_kwargs": {"crop_size": {"height": 214, "width": 214}}, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } + + inputs = processor(text=input_str, images=image_input, **all_kwargs) + self.assertEqual(inputs["pixel_values"].shape[2], 214) + + self.assertEqual(len(inputs["input_ids"][0]), 76) + + @require_torch + def test_structured_kwargs_nested_from_dict(self): + processor = AlignProcessor.from_pretrained("kakaobrain/align-base") input_str = "lower newer" image_input = self.prepare_image_inputs() From 71c9d6c036cde611ba3086d7be38be79db3c87d7 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Fri, 7 Jun 2024 15:22:19 +0200 Subject: [PATCH 15/45] [run-slow]align From 26383c52e693613c9266f549d77ceeb2ba5c61de Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Fri, 7 Jun 2024 15:47:48 +0200 Subject: [PATCH 16/45] update documentation + imports --- .../models/align/processing_align.py | 3 ++- src/transformers/processing_utils.py | 25 ++++++++++--------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py index a20f2d027aa5..7559d58fa1c6 100644 --- a/src/transformers/models/align/processing_align.py +++ b/src/transformers/models/align/processing_align.py @@ -31,9 +31,10 @@ ProcessingKwargs, ProcessorMixin, TextKwargs, + is_vision_available, ) from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput -from ...utils import is_torch_available, is_vision_available +from ...utils import is_torch_available # TODO (@molbap) This is a bother, forward references from TypedDict are resolved and need this to work diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 383788275fff..432d476cd572 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py 
@@ -626,30 +626,31 @@ def _merge_kwargs(
         """
         Method to merge dictionaries of kwargs cleanly separated by modality within a Processor instance.
         The order of operations is as follows:
-        1) kwargs passed as before have highest priority to preserve BC. They mix modalities and may not result in
-            correct behaviour.
+        1) kwargs passed as before have highest priority to preserve BC.
             ```python
             high_priority_kwargs = {"crop_size" = (224, 224), "padding" = "max_length"}
             processor(..., **high_priority_kwargs)
             ```
-        2) kwargs specified as a dictionary and passed to the processor __call__ have second highest priority.
-            This is the recommended API.
-            ```python
-            recommended_priority_kwargs = {"text_kwargs": {"padding":"max_length"}, "images_kwargs": {"crop_size": (224, 224)}}
-            processor(..., **recommended_priority_kwargs)
-            ```
-        3) kwargs passed as modality-specific kwargs have third priority.
+        2) kwargs passed as modality-specific kwargs have second priority. This is the recommended API.
             ```python
             processor(..., text_kwargs={"padding": "max_length"}, images_kwargs={"crop_size": (224, 224)}})
             ```
-        4) kwargs passed during instantiation of a modality processor have fourth priority.
+        3) kwargs passed during instantiation of a modality processor have third priority.
             ```python
             tokenizer = tokenizer_class(..., {"padding": "max_length"})
             image_processor = image_processor_class(...)
             processor(tokenizer, image_processor) # will pass max_length unless overriden by kwargs at call
             ```
-        5) defaults kwargs specified at processor level have lowest priority.
-
+        4) defaults kwargs specified at processor level have lowest priority.
+            ```python
+            class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwargs, total=False):
+                _defaults = {
+                    "text_kwargs": {
+                        "padding": "max_length",
+                        "max_length": 64,
+                    },
+                }
+            ```
         Args:
             ModelProcessorKwargs (`ProcessingKwargs`):
                 Typed dictionary of kwargs specifically required by the model passed.
From 4521f4fd5f384916a231cd9d97070b3662690445 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Fri, 7 Jun 2024 15:51:31 +0200 Subject: [PATCH 17/45] update audio inputs --- src/transformers/tokenization_utils_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 9d401501885e..1471aa25d05a 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -127,7 +127,7 @@ class EncodingFast: EncodedInputPair = Tuple[List[int], List[int]] # Define type aliases for text-related non-text modalities -AudioInput = Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput], List[float]] +AudioInput = Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]] # Slow tokenizers used to be saved in three separated files SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json" From b96eb6483d91c74633584a38867ee5b4721ff83a Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Fri, 7 Jun 2024 15:56:26 +0200 Subject: [PATCH 18/45] protect audio types, silly --- src/transformers/tokenization_utils_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 1471aa25d05a..c0410f75b2c2 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -127,7 +127,7 @@ class EncodingFast: EncodedInputPair = Tuple[List[int], List[int]] # Define type aliases for text-related non-text modalities -AudioInput = Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]] +AudioInput = Union["np.ndarray", "torch.Tensor", List["np.ndarray"], List["torch.Tensor"]] # Slow tokenizers used to be saved in three separated files SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json" From 9c5c01cd69ff5a5f035df81db24298fe7125ad38 Mon Sep 17 00:00:00 2001 From: 
Pablo Montalvo Date: Fri, 7 Jun 2024 17:31:43 +0200 Subject: [PATCH 19/45] try removing imports --- src/transformers/models/align/processing_align.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py index 7559d58fa1c6..04d50f8188ad 100644 --- a/src/transformers/models/align/processing_align.py +++ b/src/transformers/models/align/processing_align.py @@ -31,17 +31,17 @@ ProcessingKwargs, ProcessorMixin, TextKwargs, - is_vision_available, ) from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput -from ...utils import is_torch_available +""" # TODO (@molbap) This is a bother, forward references from TypedDict are resolved and need this to work if is_vision_available(): import PIL # noqa: F401 if is_torch_available(): import torch # noqa: F401 +""" class AlignProcessorKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwargs, total=False): From 3ccb50521d3bc8f3d838efef5a0b689ccecb53e9 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Fri, 7 Jun 2024 17:42:57 +0200 Subject: [PATCH 20/45] make things simpler --- .../models/align/processing_align.py | 28 ------------------- src/transformers/processing_utils.py | 20 +++++++++++++ 2 files changed, 20 insertions(+), 28 deletions(-) diff --git a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py index 04d50f8188ad..c89c2f5e1b62 100644 --- a/src/transformers/models/align/processing_align.py +++ b/src/transformers/models/align/processing_align.py @@ -35,35 +35,7 @@ from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput -""" -# TODO (@molbap) This is a bother, forward references from TypedDict are resolved and need this to work -if is_vision_available(): - import PIL # noqa: F401 -if is_torch_available(): - import torch # noqa: F401 -""" - - class AlignProcessorKwargs(ProcessingKwargs, 
CommonKwargs, TextKwargs, ImagesKwargs, total=False):
-    """
-    Inherits from `ProcessingKwargs` to provide:
-    1) Additional keys that this model requires to process inputs.
-    2) Default values for extra keys.
-    New keys have to be defined as follows to ensure type hinting is done correctly.
-
-    ```python
-    images_kwargs: ImagesKwargs = {"new_image_kwarg": Optional[bool]}
-
-    _defaults = {
-        "text_kwargs": {
-            "padding": "max_length",
-            "max_length": 64,
-        },
-    }
-
-    ```
-    """
-
     _defaults = {
         "text_kwargs": {
             "padding": "max_length",
diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py
index 432d476cd572..113abd603601 100644
--- a/src/transformers/processing_utils.py
+++ b/src/transformers/processing_utils.py
@@ -245,6 +245,26 @@ class CommonKwargs(TypedDict, total=False):


 class ProcessingKwargs(TextKwargs, ImagesKwargs, VideosKwargs, AudioKwargs, CommonKwargs, total=False):
+    """
+    Base class for kwargs passing to processors.
+    A model should have its own `ModelProcessorKwargs` class that inherits from `ProcessingKwargs` to provide:
+    1) Additional, typed keys that this model requires to process inputs.
+    2) Default values for existing keys.
+    New keys have to be defined as follows to ensure type hinting is done correctly.
+ + ```python + images_kwargs: ImagesKwargs = {"new_image_kwarg": Optional[bool]} + + _defaults = { + "text_kwargs": { + "padding": "max_length", + "max_length": 64, + }, + } + + ``` + """ + common_kwargs: CommonKwargs = { **CommonKwargs.__annotations__, } From 142acf302adb346f1c767606f989d270970cd1e7 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Fri, 7 Jun 2024 18:02:18 +0200 Subject: [PATCH 21/45] simplerer --- src/transformers/models/align/processing_align.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py index c89c2f5e1b62..b2af0997ac94 100644 --- a/src/transformers/models/align/processing_align.py +++ b/src/transformers/models/align/processing_align.py @@ -26,16 +26,13 @@ from ...image_utils import ImageInput from ...processing_utils import ( - CommonKwargs, - ImagesKwargs, ProcessingKwargs, ProcessorMixin, - TextKwargs, ) from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput -class AlignProcessorKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwargs, total=False): +class AlignProcessorKwargs(ProcessingKwargs, total=False): _defaults = { "text_kwargs": { "padding": "max_length", From 60a5730877cb5776d834307ca55ec6a97b35e152 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Mon, 10 Jun 2024 11:38:10 +0200 Subject: [PATCH 22/45] move out kwargs test to common mixin --- tests/models/align/test_processor_align.py | 97 +--------------- tests/test_processing_common.py | 122 +++++++++++++++++++++ 2 files changed, 128 insertions(+), 91 deletions(-) diff --git a/tests/models/align/test_processor_align.py b/tests/models/align/test_processor_align.py index 3108e48cadf9..3c904e59a883 100644 --- a/tests/models/align/test_processor_align.py +++ b/tests/models/align/test_processor_align.py @@ -23,9 +23,11 @@ from transformers import BertTokenizer, BertTokenizerFast from 
transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES -from transformers.testing_utils import require_torch, require_vision +from transformers.testing_utils import require_vision from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available +from ...test_processing_common import ProcessorTesterMixin + if is_vision_available(): from PIL import Image @@ -34,7 +36,9 @@ @require_vision -class AlignProcessorTest(unittest.TestCase): +class AlignProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = AlignProcessor + def setUp(self): self.tmpdirname = tempfile.mkdtemp() @@ -204,92 +208,3 @@ def test_model_input_names(self): inputs = processor(text=input_str, images=image_input) self.assertListEqual(list(inputs.keys()), processor.model_input_names) - - # TODO move these tests to a common Mixin - def test_defaults_preserved_kwargs(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer(max_length=117) - - processor = AlignProcessor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - - self.assertEqual(len(inputs["input_ids"]), 117) - - @require_torch - def test_defaults_preserved_image_kwargs(self): - image_processor = self.get_image_processor(crop_size=(234, 234)) - tokenizer = self.get_tokenizer(max_length=117) - - processor = AlignProcessor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - self.assertEqual(len(inputs["pixel_values"][0][0]), 234) - - @require_torch - def test_structured_kwargs(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = AlignProcessor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "lower newer" - image_input = 
self.prepare_image_inputs() - - # Define the kwargs for each modality - all_kwargs = { - "return_tensors": "pt", - "crop_size": {"height": 214, "width": 214}, - "padding": "max_length", - "max_length": 76, - } - - inputs = processor(text=input_str, images=image_input, **all_kwargs) - self.assertEqual(inputs["pixel_values"].shape[2], 214) - - self.assertEqual(len(inputs["input_ids"][0]), 76) - - @require_torch - def test_structured_kwargs_nested(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = AlignProcessor(tokenizer=tokenizer, image_processor=image_processor) - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - # Define the kwargs for each modality - all_kwargs = { - "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"crop_size": {"height": 214, "width": 214}}, - "text_kwargs": {"padding": "max_length", "max_length": 76}, - } - - inputs = processor(text=input_str, images=image_input, **all_kwargs) - self.assertEqual(inputs["pixel_values"].shape[2], 214) - - self.assertEqual(len(inputs["input_ids"][0]), 76) - - @require_torch - def test_structured_kwargs_nested_from_dict(self): - processor = AlignProcessor.from_pretrained("kakaobrain/align-base") - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - # Define the kwargs for each modality - all_kwargs = { - "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"crop_size": {"height": 214, "width": 214}}, - "text_kwargs": {"padding": "max_length", "max_length": 76}, - } - - inputs = processor(text=input_str, images=image_input, **all_kwargs) - self.assertEqual(inputs["pixel_values"].shape[2], 214) - - self.assertEqual(len(inputs["input_ids"][0]), 76) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index 402e6a735151..dcd5773eeda6 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -18,6 +18,8 @@ import tempfile import unittest +import 
numpy as np + from transformers import CLIPTokenizerFast, ProcessorMixin from transformers.models.auto.processing_auto import processor_class_from_name from transformers.testing_utils import ( @@ -30,6 +32,8 @@ if is_vision_available(): + from PIL import Image + from transformers import CLIPImageProcessor @@ -64,6 +68,15 @@ def get_processor(self): processor = self.processor_class(**components, **self.prepare_processor_dict()) return processor + @require_vision + def prepare_image_inputs(self): + """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, + or a list of PyTorch tensors if one specifies torchify=True. + """ + image_inputs = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)] + image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs] + return image_inputs + def test_processor_to_json_string(self): processor = self.get_processor() obj = json.loads(processor.to_json_string()) @@ -82,6 +95,115 @@ def test_processor_from_and_save_pretrained(self): self.assertEqual(processor_second.to_dict(), processor_first.to_dict()) + # These kwargs-related tests ensure that processors are correctly instantiated. + # they need to be applied only if an image_processor exists. 
+ @require_vision + @require_torch + def test_defaults_preserved_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer", max_length=117) + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + + self.assertEqual(len(inputs["input_ids"]), 117) + + @require_torch + @require_vision + def test_defaults_preserved_image_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor", crop_size=(234, 234)) + tokenizer = self.get_component("tokenizer", max_length=117) + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + self.assertEqual(len(inputs["pixel_values"][0][0]), 234) + + @require_torch + @require_vision + def test_structured_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + # Define the kwargs for each modality + all_kwargs = { + "return_tensors": "pt", + "crop_size": {"height": 214, "width": 214}, + "padding": "max_length", + "max_length": 76, + } + + inputs = processor(text=input_str, 
images=image_input, **all_kwargs) + self.assertEqual(inputs["pixel_values"].shape[2], 214) + + self.assertEqual(len(inputs["input_ids"][0]), 76) + + @require_torch + @require_vision + def test_structured_kwargs_nested(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "images_kwargs": {"crop_size": {"height": 214, "width": 214}}, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } + + inputs = processor(text=input_str, images=image_input, **all_kwargs) + self.assertEqual(inputs["pixel_values"].shape[2], 214) + + self.assertEqual(len(inputs["input_ids"][0]), 76) + + @require_torch + @require_vision + def test_structured_kwargs_nested_from_dict(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "images_kwargs": {"crop_size": {"height": 214, "width": 214}}, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } + + inputs = processor(text=input_str, images=image_input, **all_kwargs) + self.assertEqual(inputs["pixel_values"].shape[2], 214) + + self.assertEqual(len(inputs["input_ids"][0]), 76) + class 
MyProcessor(ProcessorMixin): attributes = ["image_processor", "tokenizer"] From be6c141d3e3a992452e2ee0e7bd6ea3afdaa3883 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Mon, 10 Jun 2024 11:38:31 +0200 Subject: [PATCH 23/45] [run-slow]align From 84135d739f5156d4707d5c7954f6d09ea92285fb Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Mon, 10 Jun 2024 12:15:08 +0200 Subject: [PATCH 24/45] skip tests for old processors --- tests/test_processing_common.py | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index dcd5773eeda6..cf08561e6956 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -14,8 +14,10 @@ # limitations under the License. +import inspect import json import tempfile +import typing import unittest import numpy as np @@ -97,6 +99,20 @@ def test_processor_from_and_save_pretrained(self): # These kwargs-related tests ensure that processors are correctly instantiated. # they need to be applied only if an image_processor exists. + + def skip_processor_without_typed_kwargs(self, processor): + # TODO this signature check is to test only uniformized processors. + # Once all are updated, remove it. 
+ is_kwargs_typed_dict = False + call_signature = inspect.signature(processor.__call__) + for param in call_signature.parameters.values(): + if param.kind == param.VAR_KEYWORD and param.annotation != param.empty: + is_kwargs_typed_dict = ( + hasattr(param.annotation, "__origin__") and param.annotation.__origin__ == typing.Unpack + ) + if not is_kwargs_typed_dict: + self.skipTest(f"{self.processor_class} doesn't have typed kwargs.") + @require_vision @require_torch def test_defaults_preserved_kwargs(self): @@ -106,13 +122,12 @@ def test_defaults_preserved_kwargs(self): tokenizer = self.get_component("tokenizer", max_length=117) processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - + self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor(text=input_str, images=image_input) - - self.assertEqual(len(inputs["input_ids"]), 117) + inputs = processor(text=input_str, images=image_input, return_tensors="pt") + self.assertEqual(len(inputs["input_ids"][0]), 117) @require_torch @require_vision @@ -123,6 +138,7 @@ def test_defaults_preserved_image_kwargs(self): tokenizer = self.get_component("tokenizer", max_length=117) processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() @@ -139,6 +155,7 @@ def test_structured_kwargs(self): tokenizer = self.get_component("tokenizer") processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() @@ -165,6 +182,8 @@ def test_structured_kwargs_nested(self): tokenizer = self.get_component("tokenizer") processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + 
self.skip_processor_without_typed_kwargs(processor) + input_str = "lower newer" image_input = self.prepare_image_inputs() @@ -176,6 +195,8 @@ def test_structured_kwargs_nested(self): } inputs = processor(text=input_str, images=image_input, **all_kwargs) + self.skip_processor_without_typed_kwargs(processor) + self.assertEqual(inputs["pixel_values"].shape[2], 214) self.assertEqual(len(inputs["input_ids"][0]), 76) @@ -185,10 +206,12 @@ def test_structured_kwargs_nested(self): def test_structured_kwargs_nested_from_dict(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer") processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() From ce967aca755ed5b9b8d6b0626f8226bacbe11f3f Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Mon, 10 Jun 2024 12:15:26 +0200 Subject: [PATCH 25/45] [run-slow]align, clip From f78ec52f744710ccd136d662bbb953ba9585bf90 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Mon, 10 Jun 2024 12:25:59 +0200 Subject: [PATCH 26/45] !$#@!! 
protect imports, darn it --- tests/test_processing_common.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index cf08561e6956..d046da8d1efe 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -17,7 +17,12 @@ import inspect import json import tempfile -import typing + + +try: + from typing import Unpack +except ImportError: + from typing_extensions import Unpack import unittest import numpy as np @@ -108,7 +113,7 @@ def skip_processor_without_typed_kwargs(self, processor): for param in call_signature.parameters.values(): if param.kind == param.VAR_KEYWORD and param.annotation != param.empty: is_kwargs_typed_dict = ( - hasattr(param.annotation, "__origin__") and param.annotation.__origin__ == typing.Unpack + hasattr(param.annotation, "__origin__") and param.annotation.__origin__ == Unpack ) if not is_kwargs_typed_dict: self.skipTest(f"{self.processor_class} doesn't have typed kwargs.") From 52fd5ad6cc68a94d6141114c6c7ba6010ad65b59 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Mon, 10 Jun 2024 12:26:09 +0200 Subject: [PATCH 27/45] [run-slow]align, clip From d510030acc89dd02cba79bc6b0d837277c7250ef Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Mon, 10 Jun 2024 17:12:30 +0200 Subject: [PATCH 28/45] [run-slow]align, clip From fd43bcd457d22a70eb66ad84c0f1eb164bb654d4 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Tue, 11 Jun 2024 13:49:54 +0200 Subject: [PATCH 29/45] update doc --- .../models/align/processing_align.py | 26 +++++-------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py index b2af0997ac94..58eb63c45e69 100644 --- a/src/transformers/models/align/processing_align.py +++ b/src/transformers/models/align/processing_align.py @@ -54,25 +54,13 @@ class AlignProcessor(ProcessorMixin): model_id = 
"kakaobrain/align-base" processor = AlignProcessor.from_pretrained(model_id) - # Define the kwargs for each modality - common_kwargs = {"return_tensors": "pt"} - images_kwargs = {"crop_size": {"height": 224, "width": 224}} - text_kwargs = {"padding": "do_not_pad"} - - # Combine them into a single dictionary - - all_kwargs = { - "images_kwargs": images_kwargs, - "text_kwargs": text_kwargs, - "common_kwargs": common_kwargs - } - - processor(images=your_pil_image, text=["What is that?"], **all_kwargs) - - # passing directly any number of kwargs flattened is also supported - - all_kwargs = {"return_tensors": "pt", "crop_size": {"height": 214, "width": 214}, "padding": "max_length", "max_length": 76} - processor(images=your_pil_image, text=["What is that?"], **all_kwargs) + processor( + images=your_pil_image, + text=["What is that?"], + images_kwargs = {"crop_size": {"height": 224, "width": 224}}, + text_kwargs = {"padding": "do_not_pad"}, + common_kwargs = {"return_tensors": "pt"}, + ) ``` Args: From b2cd7c9e044a9b2558a56a159c6cb94da03c9aab Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Tue, 11 Jun 2024 13:53:46 +0200 Subject: [PATCH 30/45] improve documentation for default values --- src/transformers/processing_utils.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 113abd603601..d94ef86f2d6c 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -248,19 +248,25 @@ class ProcessingKwargs(TextKwargs, ImagesKwargs, VideosKwargs, AudioKwargs, Comm """ Base class for kwargs passing to processors. A model should have its own `ModelProcessorKwargs` class that inherits from `ProcessingKwargs` to provide: - 1) Additional, typed keys and that this model requires to process inputs. - 2) Default values for existing keys. + 1) Additional typed keys and that this model requires to process inputs. 
+ 2) Default values for existing keys under a `_defaults` attribute. New keys have to be defined as follows to ensure type hinting is done correctly. ```python - images_kwargs: ImagesKwargs = {"new_image_kwarg": Optional[bool]} - - _defaults = { - "text_kwargs": { - "padding": "max_length", - "max_length": 64, - }, - } + # adding a new image kwarg for this model + class ModelImagesKwargs(ImagesKwargs, total=False): + new_image_kwarg: Optional[bool] + + class ModelProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: ModelImagesKwargs + _defaults = { + "images_kwargs: { + "new_image_kwarg": False, + } + "text_kwargs": { + "padding": "max_length", + }, + } ``` """ From bcbd64646ecd8cc863a14cbcd16ac3bca47f3303 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Tue, 11 Jun 2024 15:07:53 +0200 Subject: [PATCH 31/45] add model_max_length testing This parameter depends on tokenizers received. --- tests/test_processing_common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index d046da8d1efe..a4c217df2c66 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -171,6 +171,7 @@ def test_structured_kwargs(self): "crop_size": {"height": 214, "width": 214}, "padding": "max_length", "max_length": 76, + "model_max_length": 76, } inputs = processor(text=input_str, images=image_input, **all_kwargs) From 39c1587e23f34b158e665f33c4f7a1eef3932a42 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Tue, 11 Jun 2024 15:08:13 +0200 Subject: [PATCH 32/45] Raise if kwargs are specified in two places --- src/transformers/processing_utils.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index d94ef86f2d6c..97060c89a3e7 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -709,7 +709,9 @@ class 
MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg for modality in default_kwargs: default_kwargs[modality] = ModelProcessorKwargs._defaults.get(modality, {}).copy() # update modality kwargs with passed kwargs - for modality in output_kwargs: + non_modality_kwargs = set(kwargs) - set(output_kwargs) + + for modality in set(output_kwargs): output_kwargs[modality] = { **default_kwargs[modality], } @@ -720,11 +722,17 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg # check if we received a structured kwarg dict or not to handle it correctly if modality in kwargs: kwarg_value = kwargs[modality].pop(modality_key, "__empty__") - else: + # check if this key was passed as a flat kwarg. + if kwarg_value != "__empty__" and modality_key in non_modality_kwargs: + raise ValueError( + f"Keyword argument {modality_key} was passed two times: in a dictionary for {modality} and as a **kwarg." + ) + elif modality_key in kwargs: kwarg_value = kwargs.pop(modality_key, "__empty__") + else: + kwarg_value = "__empty__" if kwarg_value != "__empty__": output_kwargs[modality][modality_key] = kwarg_value - # if something remains in kwargs, it belongs to common after flattening if set(kwargs) & set(default_kwargs): # here kwargs is dictionary-based since it shares keys with default set From 1f73bdf586dcc838498c9b790c0fbaab923386f5 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Tue, 11 Jun 2024 16:21:19 +0200 Subject: [PATCH 33/45] fix From e4d6d12703db9aadc04efe349bfe4729b55dcd1c Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Wed, 12 Jun 2024 18:24:08 +0200 Subject: [PATCH 34/45] expand VideoInput --- src/transformers/image_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index aa09e74558a3..4df2ae1f1633 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -81,8 +81,7 @@ ] # noqa -VideoInput = 
Union[np.ndarray, "torch.Tensor", List[np.ndarray], List["torch.Tensor"]] # noqa - +VideoInput = Union[List["PIL.Image.Image"], np.ndarray, "torch.Tensor", List[np.ndarray], List["torch.Tensor"]], List[List["PIL.Image.Image"]], List[List[np.ndarrray]], List[List["torch.Tensor"]] # noqa class ChannelDimension(ExplicitEnum): FIRST = "channels_first" From 1e09e4a971858a0eff272e9fbc4c6de68df7b86d Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Wed, 12 Jun 2024 18:24:43 +0200 Subject: [PATCH 35/45] fix --- src/transformers/image_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index 4df2ae1f1633..073af604afd7 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -81,7 +81,7 @@ ] # noqa -VideoInput = Union[List["PIL.Image.Image"], np.ndarray, "torch.Tensor", List[np.ndarray], List["torch.Tensor"]], List[List["PIL.Image.Image"]], List[List[np.ndarrray]], List[List["torch.Tensor"]] # noqa +VideoInput = Union[List["PIL.Image.Image"], np.ndarray, "torch.Tensor", List[np.ndarray], List["torch.Tensor"]], List[List["PIL.Image.Image"]], List[List[np.ndarrray]], List[List["torch.Tensor"]]] # noqa class ChannelDimension(ExplicitEnum): FIRST = "channels_first" From d4232f0be677314ebcead5e244911f8753067ec3 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Wed, 12 Jun 2024 18:27:29 +0200 Subject: [PATCH 36/45] fix style --- src/transformers/image_utils.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index 073af604afd7..45cc6e09c928 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -81,7 +81,17 @@ ] # noqa -VideoInput = Union[List["PIL.Image.Image"], np.ndarray, "torch.Tensor", List[np.ndarray], List["torch.Tensor"]], List[List["PIL.Image.Image"]], List[List[np.ndarrray]], List[List["torch.Tensor"]]] # noqa +VideoInput = 
Union[ + List["PIL.Image.Image"], + np.ndarray, + "torch.Tensor", + List[np.ndarray], + List["torch.Tensor"], + List[List["PIL.Image.Image"]], + List[List[np.ndarrray]], + List[List["torch.Tensor"]], +] # noqa + class ChannelDimension(ExplicitEnum): FIRST = "channels_first" From 162b1a702af1be0e21e4dbd9f82954432774e503 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Wed, 12 Jun 2024 18:27:43 +0200 Subject: [PATCH 37/45] remove defaults values --- src/transformers/processing_utils.py | 78 ++++++++++++++++++---------- 1 file changed, 50 insertions(+), 28 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 97060c89a3e7..cb60e77c87f2 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -74,17 +74,17 @@ class TextKwargs(TypedDict, total=False): docstrings associated. Attributes: - add_special_tokens (`bool`, *optional*, defaults to `True`): + add_special_tokens (`bool`, *optional*) Whether or not to add special tokens when encoding the sequences. - padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`): + padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*) Activates and controls padding. - truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`): + truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*): Activates and controls truncation. max_length (`int`, *optional*): Controls the maximum length to use by one of the truncation/padding parameters. - stride (`int`, *optional*, defaults to 0): + stride (`int`, *optional*): If set, the overflowing tokens will contain some tokens from the end of the truncated sequence. - is_split_into_words (`bool`, *optional*, defaults to `False`): + is_split_into_words (`bool`, *optional*): Whether or not the input is already pre-tokenized. 
pad_to_multiple_of (`int`, *optional*): If set, will pad the sequence to a multiple of the provided value. @@ -92,15 +92,15 @@ class TextKwargs(TypedDict, total=False): Whether to return token type IDs. return_attention_mask (`bool`, *optional*): Whether to return the attention mask. - return_overflowing_tokens (`bool`, *optional*, defaults to `False`): + return_overflowing_tokens (`bool`, *optional*): Whether or not to return overflowing token sequences. - return_special_tokens_mask (`bool`, *optional*, defaults to `False`): + return_special_tokens_mask (`bool`, *optional*): Whether or not to return special tokens mask information. - return_offsets_mapping (`bool`, *optional*, defaults to `False`): + return_offsets_mapping (`bool`, *optional*): Whether or not to return `(char_start, char_end)` for each token. - return_length (`bool`, *optional*, defaults to `False`): + return_length (`bool`, *optional*): Whether or not to return the lengths of the encoded inputs. - verbose (`bool`, *optional*, defaults to `True`): + verbose (`bool`, *optional*): Whether or not to print more information and warnings. padding_side (`str`, *optional*): The side on which padding will be applied. @@ -129,31 +129,31 @@ class ImagesKwargs(TypedDict, total=False): class methods and docstrings. Attributes: - do_resize (`bool`, *optional*, defaults to `True`): + do_resize (`bool`, *optional*): Whether to resize the image. - size (`Dict[str, int]`, *optional*, defaults to `{'shortest_edge': 288}`): + size (`Dict[str, int]`, *optional*): Resize the shorter side of the input to `size["shortest_edge"]`. - size_divisor (`int`, *optional*, defaults to 32): + size_divisor (`int`, *optional*): The size by which to make sure both the height and width can be divided. crop_size (`Dict[str, int]`, *optional*): Desired output size when applying center-cropping. 
- resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + resample (`PILImageResampling`, *optional*): Resampling filter to use if resizing the image. - do_rescale (`bool`, *optional*, defaults to `True`): + do_rescale (`bool`, *optional*): Whether to rescale the image by the specified scale `rescale_factor`. - rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + rescale_factor (`int` or `float`, *optional*): Scale factor to use if rescaling the image. - do_normalize (`bool`, *optional*, defaults to `True`): + do_normalize (`bool`, *optional*): Whether to normalize the image. - image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + image_mean (`float` or `List[float]`, *optional*): Mean to use if normalizing the image. - image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + image_std (`float` or `List[float]`, *optional*): Standard deviation to use if normalizing the image. - do_pad (`bool`, *optional*, defaults to `True`): + do_pad (`bool`, *optional*): Whether to pad the image to the `(max_height, max_width)` of the images in the batch. - do_center_crop (`bool`, *optional*, defaults to `True`): + do_center_crop (`bool`, *optional*): Whether to center crop the image. - data_format (`ChannelDimension` or `str`, *optional*, defaults to `"channels_first"`): + data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format for the output image. input_data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format for the input image. @@ -180,9 +180,32 @@ class VideosKwargs(TypedDict, total=False): Keyword arguments for video processing. Attributes: - do_resize (`bool`, *optional*, defaults to `True`): + do_resize (`bool`): Whether to resize the image. - # ... (Add docstrings for other videos_kwargs) + size (`Dict[str, int]`, *optional*): + Resize the shorter side of the input to `size["shortest_edge"]`. 
+ size_divisor (`int`, *optional*): + The size by which to make sure both the height and width can be divided. + resample (`PILImageResampling`, *optional*): + Resampling filter to use if resizing the image. + do_rescale (`bool`, *optional*): + Whether to rescale the image by the specified scale `rescale_factor`. + rescale_factor (`int` or `float`, *optional*): + Scale factor to use if rescaling the image. + do_normalize (`bool`, *optional*): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*): + Mean to use if normalizing the image. + image_std (`float` or `List[float]`, *optional*): + Standard deviation to use if normalizing the image. + do_pad (`bool`, *optional*): + Whether to pad the image to the `(max_height, max_width)` of the images in the batch. + do_center_crop (`bool`, *optional*): + Whether to center crop the image. + data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the output image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. """ do_resize: Optional[bool] @@ -219,15 +242,14 @@ class AudioKwargs(TypedDict, total=False): sequence if provided). - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum acceptable input length for the model if that argument is not provided. - - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different - lengths). + - `False` or `'do_not_pad'` max_length (`int`, *optional*): Maximum length of the returned list and optionally padding length (see above). - truncation (`bool`): + truncation (`bool`, *optional*): Activates truncation to cut input sequences longer than *max_length* to *max_length*. pad_to_multiple_of (`int`, *optional*): If set, will pad the sequence to a multiple of the provided value. 
- return_attention_mask (`bool`, *optional*, defaults to `False`): + return_attention_mask (`bool`, *optional*): Whether or not [`~ASTFeatureExtractor.__call__`] should return `attention_mask`. """ From 0da1dc315c52ad98eb2e68f13ba77d95f2a27e7c Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Wed, 12 Jun 2024 18:44:24 +0200 Subject: [PATCH 38/45] add comment to indicate documentation on adding kwargs --- src/transformers/models/align/processing_align.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py index 58eb63c45e69..5fdaf0514048 100644 --- a/src/transformers/models/align/processing_align.py +++ b/src/transformers/models/align/processing_align.py @@ -33,6 +33,7 @@ class AlignProcessorKwargs(ProcessingKwargs, total=False): + # see processing_utils.ProcessingKwargs documentation for usage. _defaults = { "text_kwargs": { "padding": "max_length", From f6f1dacd76f02cd9d272323bbd64fa9048526924 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Wed, 12 Jun 2024 18:47:56 +0200 Subject: [PATCH 39/45] protect imports --- src/transformers/image_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index 45cc6e09c928..13fdb1d6ebc2 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -83,12 +83,12 @@ VideoInput = Union[ List["PIL.Image.Image"], - np.ndarray, + "np.ndarray", "torch.Tensor", - List[np.ndarray], + List["np.ndarray"], List["torch.Tensor"], List[List["PIL.Image.Image"]], - List[List[np.ndarrray]], + List[List["np.ndarrray"]], List[List["torch.Tensor"]], ] # noqa From c4b7e840b86597e1e5a310851739b3b9a7d41148 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Wed, 12 Jun 2024 18:49:24 +0200 Subject: [PATCH 40/45] [run-slow]align From 3ce3608dced7a7151b85046f30377382778e8e40 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Wed, 12 
Jun 2024 19:06:02 +0200 Subject: [PATCH 41/45] fix From 6b83e39dc647a3471e8b86f93ba6edbeb3d08add Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Thu, 13 Jun 2024 12:44:12 +0200 Subject: [PATCH 42/45] remove set() that breaks ordering --- src/transformers/processing_utils.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index cb60e77c87f2..9c757ee21c81 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -665,6 +665,15 @@ def from_args_and_dict(cls, args, processor_dict: Dict[str, Any], **kwargs): else: return processor + def update_kwargs(self, kwarg_dict, modality, key, value): + """Helper function to update kwargs and handle conflicts.""" + if key in kwarg_dict[modality]: + raise ValueError( + f"Keyword argument {key} was passed two times: in a dictionary for {modality} and as a **kwarg." + ) + kwarg_dict[modality][key] = value + return kwarg_dict + def _merge_kwargs( self, ModelProcessorKwargs: ProcessingKwargs, @@ -730,17 +739,17 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg # get defaults from set model processor kwargs if they exist for modality in default_kwargs: default_kwargs[modality] = ModelProcessorKwargs._defaults.get(modality, {}).copy() - # update modality kwargs with passed kwargs - non_modality_kwargs = set(kwargs) - set(output_kwargs) - - for modality in set(output_kwargs): - output_kwargs[modality] = { - **default_kwargs[modality], - } + # update defaults with arguments from tokenizer init for modality_key in ModelProcessorKwargs.__annotations__[modality].__annotations__.keys(): # init with tokenizer init kwargs if necessary if modality_key in tokenizer_init_kwargs: - output_kwargs[modality][modality_key] = tokenizer_init_kwargs[modality_key] + default_kwargs[modality][modality_key] = tokenizer_init_kwargs[modality_key] + # now defaults 
kwargs are updated with the tokenizers defaults. + output_kwargs.update(default_kwargs) + # update modality kwargs with passed kwargs + non_modality_kwargs = set(kwargs) - set(output_kwargs) + for modality in output_kwargs: + for modality_key in ModelProcessorKwargs.__annotations__[modality].__annotations__.keys(): # check if we received a structured kwarg dict or not to handle it correctly if modality in kwargs: kwarg_value = kwargs[modality].pop(modality_key, "__empty__") From 3818b86af9e07b129f090b67e67b13fe709a324c Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Thu, 13 Jun 2024 12:44:24 +0200 Subject: [PATCH 43/45] test more --- tests/test_processing_common.py | 105 ++++++++++++++++++++++++++++---- 1 file changed, 92 insertions(+), 13 deletions(-) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index a4c217df2c66..074aa2f1d625 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -44,6 +44,8 @@ from transformers import CLIPImageProcessor +@require_torch +@require_vision @require_torch class ProcessorTesterMixin: processor_class = None @@ -120,7 +122,7 @@ def skip_processor_without_typed_kwargs(self, processor): @require_vision @require_torch - def test_defaults_preserved_kwargs(self): + def test_tokenizer_defaults_preserved_by_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") @@ -136,7 +138,7 @@ def test_defaults_preserved_kwargs(self): @require_torch @require_vision - def test_defaults_preserved_image_kwargs(self): + def test_image_processor_defaults_preserved_by_image_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor", crop_size=(234, 234)) @@ -151,9 +153,42 @@ def 
test_defaults_preserved_image_kwargs(self): inputs = processor(text=input_str, images=image_input) self.assertEqual(len(inputs["pixel_values"][0][0]), 234) + @require_vision + @require_torch + def test_kwargs_overrides_default_tokenizer_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer", max_length=117) + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input, return_tensors="pt", max_length=112) + self.assertEqual(len(inputs["input_ids"][0]), 112) + @require_torch @require_vision - def test_structured_kwargs(self): + def test_kwargs_overrides_default_image_processor_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor", crop_size=(234, 234)) + tokenizer = self.get_component("tokenizer", max_length=117) + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input, crop_size=[224, 224]) + self.assertEqual(len(inputs["pixel_values"][0][0]), 224) + + @require_torch + @require_vision + def test_unstructured_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") @@ -164,20 +199,64 @@ def test_structured_kwargs(self): 
input_str = "lower newer" image_input = self.prepare_image_inputs() + inputs = processor( + text=input_str, + images=image_input, + return_tensors="pt", + crop_size={"height": 214, "width": 214}, + padding="max_length", + max_length=76, + ) - # Define the kwargs for each modality - all_kwargs = { - "return_tensors": "pt", - "crop_size": {"height": 214, "width": 214}, - "padding": "max_length", - "max_length": 76, - "model_max_length": 76, - } + self.assertEqual(inputs["pixel_values"].shape[2], 214) + self.assertEqual(len(inputs["input_ids"][0]), 76) + + @require_torch + @require_vision + def test_unstructured_kwargs_batched(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = ["lower newer", "upper older longer string"] + image_input = self.prepare_image_inputs() * 2 + inputs = processor( + text=input_str, + images=image_input, + return_tensors="pt", + crop_size={"height": 214, "width": 214}, + padding="longest", + max_length=76, + ) - inputs = processor(text=input_str, images=image_input, **all_kwargs) self.assertEqual(inputs["pixel_values"].shape[2], 214) - self.assertEqual(len(inputs["input_ids"][0]), 76) + self.assertEqual(len(inputs["input_ids"][0]), 6) + + @require_torch + @require_vision + def test_doubly_passed_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + 
self.skip_processor_without_typed_kwargs(processor) + + input_str = ["lower newer"] + image_input = self.prepare_image_inputs() + with self.assertRaises(ValueError): + _ = processor( + text=input_str, + images=image_input, + images_kwargs={"crop_size": {"height": 222, "width": 222}}, + crop_size={"height": 214, "width": 214}, + ) @require_torch @require_vision From 31b7a602274901a6c007c822de26c70c91e33e05 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Thu, 13 Jun 2024 12:46:22 +0200 Subject: [PATCH 44/45] removed unused func --- src/transformers/processing_utils.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 9c757ee21c81..03db53c91091 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -665,15 +665,6 @@ def from_args_and_dict(cls, args, processor_dict: Dict[str, Any], **kwargs): else: return processor - def update_kwargs(self, kwarg_dict, modality, key, value): - """Helper function to update kwargs and handle conflicts.""" - if key in kwarg_dict[modality]: - raise ValueError( - f"Keyword argument {key} was passed two times: in a dictionary for {modality} and as a **kwarg." - ) - kwarg_dict[modality][key] = value - return kwarg_dict - def _merge_kwargs( self, ModelProcessorKwargs: ProcessingKwargs, @@ -745,7 +736,9 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg if modality_key in tokenizer_init_kwargs: default_kwargs[modality][modality_key] = tokenizer_init_kwargs[modality_key] # now defaults kwargs are updated with the tokenizers defaults. 
+ # pass defaults to output dictionary output_kwargs.update(default_kwargs) + # update modality kwargs with passed kwargs non_modality_kwargs = set(kwargs) - set(output_kwargs) for modality in output_kwargs: From 4072336a03e4e98d2c8881d685943fc51e2b9b88 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Thu, 13 Jun 2024 15:41:38 +0200 Subject: [PATCH 45/45] [run-slow]align