From b85036f9a12b6ce29f823e31ca668dc4dc2b20c8 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Mon, 3 Jun 2024 09:38:41 +0200 Subject: [PATCH 01/45] add initial design for uniform processors + align model --- .../models/align/processing_align.py | 163 ++++++++++++-- src/transformers/processing_utils.py | 202 +++++++++++++++++- src/transformers/tokenization_utils_base.py | 2 + tests/models/align/test_processor_align.py | 1 - tests/test_image_processing_common.py | 1 + 5 files changed, 343 insertions(+), 26 deletions(-) diff --git a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py index 3bc97afd1ca5..0ddc50c094c0 100644 --- a/src/transformers/models/align/processing_align.py +++ b/src/transformers/models/align/processing_align.py @@ -16,8 +16,66 @@ Image/Text processor class for ALIGN """ -from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import BatchEncoding +from typing import List, Union + +from ...image_utils import ImageInput +from ...processing_utils import ( + CommonKwargs, + ImagesKwargs, + ProcessingKwargs, + ProcessorMixin, + TextKwargs, +) +from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput +from ...utils import is_torch_available, is_vision_available + + +# TODO (@molbap) This is a bother, forward references from TypedDict are resolved and need this to work +if is_vision_available(): + import PIL # noqa: F401 +if is_torch_available(): + import torch # noqa: F401 + + +class AlignProcessorKwargs(ProcessingKwargs, total=False): + """ + Inherits from `ProcessingKwargs` to provide: + 1) Additional keys that this model requires to process inputs. + 2) Default values for extra keys. + New keys have to be defined as follows to ensure type hinting is done correctly. 
+ + ```python + common_kwargs: CommonKwargs = { + **CommonKwargs.__annotations__, + } + text_kwargs: TextKwargs = { + **TextKwargs.__annotations__, + "a_new_text_boolean_key": Optional[bool], + } + images_kwargs: ImagesKwargs = { + **ImagesKwargs.__annotations__, + "a_new_image_processing_key": Optional[int] + } + ``` + + """ + + common_kwargs: CommonKwargs = { + **CommonKwargs.__annotations__, + } + text_kwargs: TextKwargs = { + **TextKwargs.__annotations__, + } + images_kwargs: ImagesKwargs = { + **ImagesKwargs.__annotations__, + } + + _defaults = { + "text_kwargs": { + "padding": "max_length", + "max_length": 64, + }, + } class AlignProcessor(ProcessorMixin): @@ -26,12 +84,39 @@ class AlignProcessor(ProcessorMixin): [`BertTokenizer`]/[`BertTokenizerFast`] into a single processor that interits both the image processor and tokenizer functionalities. See the [`~AlignProcessor.__call__`] and [`~OwlViTProcessor.decode`] for more information. + The preferred way of passing kwargs is as a dictionary per modality, see usage example below. + ```python + from transformers import AlignProcessor + from PIL import Image + model_id = "kakaobrain/align-base" + processor = AlignProcessor.from_pretrained(model_id) + + # Define the kwargs for each modality + common_kwargs = {"return_tensors": "pt"} + images_kwargs = {"crop_size": {"height": 224, "width": 224}} + text_kwargs = {"padding": "do_not_pad"} + + # Combine them into a single dictionary + + all_kwargs = { + "images_kwargs": images_kwargs, + "text_kwargs": text_kwargs, + "common_kwargs": common_kwargs + } + + processor(images=your_pil_image, text=["What is that?"], **all_kwargs) + + # passing directly any number of kwargs is also supported, but not recommended + + processor(images=your_pil_image, text=["What is that?"], padding="do_not_pad) + ``` Args: image_processor ([`EfficientNetImageProcessor`]): The image processor is a required input. 
tokenizer ([`BertTokenizer`, `BertTokenizerFast`]): The tokenizer is a required input. + """ attributes = ["image_processor", "tokenizer"] @@ -41,11 +126,21 @@ class AlignProcessor(ProcessorMixin): def __init__(self, image_processor, tokenizer): super().__init__(image_processor, tokenizer) - def __call__(self, text=None, images=None, padding="max_length", max_length=64, return_tensors=None, **kwargs): + def __call__( + self, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + images: ImageInput = None, + audio=None, + videos=None, + text_kwargs: AlignProcessorKwargs.text_kwargs = {}, + images_kwargs: AlignProcessorKwargs.images_kwargs = {}, + common_kwargs: AlignProcessorKwargs.common_kwargs = {}, + **kwargs: AlignProcessorKwargs, + ) -> BatchEncoding: """ Main method to prepare text(s) and image(s) to be fed as input to the model. This method forwards the `text` - and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode - the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to + arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode + the text. To prepare the image(s), this method forwards the `images` arguments to EfficientNetImageProcessor's [`~EfficientNetImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring of the above two methods for more information. @@ -57,20 +152,12 @@ def __call__(self, text=None, images=None, padding="max_length", max_length=64, images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch tensor. Both channels-first and channels-last formats are supported. 
- padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `max_length`): - Activates and controls padding for tokenization of input text. Choose between [`True` or `'longest'`, - `'max_length'`, `False` or `'do_not_pad'`] - max_length (`int`, *optional*, defaults to `max_length`): - Maximum padding value to use to pad the input text during tokenization. - return_tensors (`str` or [`~utils.TensorType`], *optional*): If set, will return tensors of a particular framework. Acceptable values are: - - - `'tf'`: Return TensorFlow `tf.constant` objects. - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - - `'jax'`: Return JAX `jnp.ndarray` objects. - + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. Returns: [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: @@ -81,15 +168,45 @@ def __call__(self, text=None, images=None, padding="max_length", max_length=64, - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. """ if text is None and images is None: - raise ValueError("You have to specify either text or images. 
Both cannot be none.") - + raise ValueError("You must specify either text or images.") + # Init with default values if they exist + text_kwargs = AlignProcessorKwargs._defaults.get("text_kwargs", {}).copy() + + # then override with tokenizer-level arguments passed + text_kwargs.update( + {k: v for k, v in self.tokenizer.init_kwargs.items() if k in AlignProcessorKwargs.text_kwargs} + ) + # then get passed per-modality dictionaries if they exist + text_kwargs.update(kwargs.pop("text_kwargs", {})) + images_kwargs.update(kwargs.pop("images_kwargs", {})) + common_kwargs.update(kwargs.pop("common_kwargs", {})) + # then merge kwargs by name + for text_key in AlignProcessorKwargs.text_kwargs.keys(): + text_kwarg_value = kwargs.pop(text_key, None) + if text_kwarg_value is not None: + text_kwargs[text_key] = text_kwarg_value + + for images_key in AlignProcessorKwargs.images_kwargs.keys(): + images_kwarg_value = kwargs.pop(images_key, None) + if images_kwarg_value is not None: + images_kwargs[images_key] = images_kwarg_value + # if something remains in kwargs, it belongs to common + common_kwargs.update(kwargs) + + # all modality-specific kwargs are updated with common kwargs + text_kwargs.update(common_kwargs) + images_kwargs.update(common_kwargs) + + # then, we can pass correct kwargs to each processor if text is not None: - encoding = self.tokenizer( - text, padding=padding, max_length=max_length, return_tensors=return_tensors, **kwargs - ) + encoding = self.tokenizer(text, **text_kwargs) if images is not None: - image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs) + image_features = self.image_processor(images, **images_kwargs) + + # BC for explicit return_tensors + if "return_tensors" in common_kwargs: + return_tensors = common_kwargs.pop("return_tensors", None) if text is not None and images is not None: encoding["pixel_values"] = image_features.pixel_values diff --git a/src/transformers/processing_utils.py 
b/src/transformers/processing_utils.py index d76fa4dccccf..df6a178bb73f 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -22,13 +22,27 @@ import os import warnings from pathlib import Path -from typing import Any, Dict, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, TypedDict, Union + +import numpy as np from .dynamic_module_utils import custom_object_save -from .tokenization_utils_base import PreTrainedTokenizerBase +from .image_utils import ChannelDimension, is_vision_available + + +if is_vision_available(): + from .image_utils import PILImageResampling + + +from .tokenization_utils_base import ( + PaddingStrategy, + PreTrainedTokenizerBase, + TruncationStrategy, +) from .utils import ( PROCESSOR_NAME, PushToHubMixin, + TensorType, add_model_info_to_auto_map, add_model_info_to_custom_pipelines, cached_file, @@ -54,6 +68,190 @@ } +class TextKwargs(TypedDict, total=False): + """ + Keyword arguments for text processing. For extended documentation, check out tokenization_utils_base methods and + docstrings associated. + + Attributes: + add_special_tokens (`bool`, *optional*, defaults to `True`): + Whether or not to add special tokens when encoding the sequences. + padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`): + Activates and controls padding. + truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`): + Activates and controls truncation. + max_length (`int`, *optional*): + Controls the maximum length to use by one of the truncation/padding parameters. + stride (`int`, *optional*, defaults to 0): + If set, the overflowing tokens will contain some tokens from the end of the truncated sequence. + is_split_into_words (`bool`, *optional*, defaults to `False`): + Whether or not the input is already pre-tokenized. 
+ pad_to_multiple_of (`int`, *optional*): + If set, will pad the sequence to a multiple of the provided value. + return_token_type_ids (`bool`, *optional*): + Whether to return token type IDs. + return_attention_mask (`bool`, *optional*): + Whether to return the attention mask. + return_overflowing_tokens (`bool`, *optional*, defaults to `False`): + Whether or not to return overflowing token sequences. + return_special_tokens_mask (`bool`, *optional*, defaults to `False`): + Whether or not to return special tokens mask information. + return_offsets_mapping (`bool`, *optional*, defaults to `False`): + Whether or not to return `(char_start, char_end)` for each token. + return_length (`bool`, *optional*, defaults to `False`): + Whether or not to return the lengths of the encoded inputs. + verbose (`bool`, *optional*, defaults to `True`): + Whether or not to print more information and warnings. + padding_side (`str`, *optional*): + The side on which padding will be applied. + """ + + add_special_tokens: Optional[bool] + padding: Union[bool, str, PaddingStrategy] + truncation: Union[bool, str, TruncationStrategy] + max_length: Optional[int] + stride: Optional[int] + is_split_into_words: Optional[bool] + pad_to_multiple_of: Optional[int] + return_token_type_ids: Optional[bool] + return_attention_mask: Optional[bool] + return_overflowing_tokens: Optional[bool] + return_special_tokens_mask: Optional[bool] + return_offsets_mapping: Optional[bool] + return_length: Optional[bool] + verbose: Optional[bool] + padding_side: Optional[str] + + +class ImagesKwargs(TypedDict, total=False): + """ + Keyword arguments for image processing. For extended documentation, check the appropriate ImageProcessor + class methods and docstrings. + + Attributes: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `{'shortest_edge': 288}`): + Resize the shorter side of the input to `size["shortest_edge"]`. 
+ size_divisor (`int`, *optional*, defaults to 32): + The size by which to make sure both the height and width can be divided. + crop_size (`Dict[str, int]`, *optional*): + Desired output size when applying center-cropping. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use if resizing the image. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. + do_pad (`bool`, *optional*, defaults to `True`): + Whether to pad the image to the `(max_height, max_width)` of the images in the batch. + do_center_crop (`bool`, *optional*, defaults to `True`): + Whether to center crop the image. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `"channels_first"`): + The channel dimension format for the output image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. 
+ """ + + do_resize: Optional[bool] + size: Optional[Dict[str, int]] + size_divisor: Optional[int] + crop_size: Optional[Dict[str, int]] + resample: Optional[Union["PILImageResampling", int]] + do_rescale: Optional[bool] + rescale_factor: Optional[float] + do_normalize: Optional[bool] + image_mean: Optional[Union[float, List[float]]] + image_std: Optional[Union[float, List[float]]] + do_pad: Optional[bool] + do_center_crop: Optional[bool] + data_format: Optional[ChannelDimension] + input_data_format: Optional[Union[str, ChannelDimension]] + + +class VideosKwargs(TypedDict, total=False): + """ + Keyword arguments for video processing. + + Attributes: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image. + # ... (Add docstrings for other videos_kwargs) + """ + + do_resize: Optional[bool] + size: Optional[Dict[str, int]] + size_divisor: Optional[int] + resample: Optional["PILImageResampling"] + do_rescale: Optional[bool] + rescale_factor: Optional[float] + do_normalize: Optional[bool] + image_mean: Optional[Union[float, List[float]]] + image_std: Optional[Union[float, List[float]]] + do_pad: Optional[bool] + do_center_crop: Optional[bool] + data_format: Optional[ChannelDimension] + input_data_format: Optional[Union[str, ChannelDimension]] + + +class AudioKwargs(TypedDict, total=False): + """ + Keyword arguments for audio processing. + + Attributes: + sampling_rate (`int`, *optional*): + The sampling rate at which the `raw_speech` input was sampled. + raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`): + The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float + values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not + stereo, i.e. single float per timestep. 
+ padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*): + Select a strategy to pad the returned sequences (according to the model's padding side and padding + index) among: + + - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different + lengths). + max_length (`int`, *optional*): + Maximum length of the returned list and optionally padding length (see above). + truncation (`bool`): + Activates truncation to cut input sequences longer than *max_length* to *max_length*. + pad_to_multiple_of (`int`, *optional*): + If set, will pad the sequence to a multiple of the provided value. + return_attention_mask (`bool`, *optional*, defaults to `False`): + Whether or not [`~ASTFeatureExtractor.__call__`] should return `attention_mask`. + """ + + sampling_rate: Optional[int] + raw_speech: Optional[Union["np.ndarray", List[float], List["np.ndarray"], List[List[float]]]] + padding: Optional[Union[bool, str, PaddingStrategy]] + max_length: Optional[int] + truncation: Optional[bool] + pad_to_multiple_of: Optional[int] + return_attention_mask: Optional[bool] + + +class CommonKwargs(TypedDict, total=False): + return_tensors: Optional[Union[str, TensorType]] + + +class ProcessingKwargs(TypedDict, total=False): + common_kwargs: CommonKwargs + text_kwargs: TextKwargs + images_kwargs: ImagesKwargs + audio_kwargs: AudioKwargs + videos_kwargs: VideosKwargs + + class ProcessorMixin(PushToHubMixin): """ This is a mixin used to provide saving/loading functionality for all processor classes. 
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 116fbfdf7bbb..9d401501885e 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -126,6 +126,8 @@ class EncodingFast: PreTokenizedInputPair = Tuple[List[str], List[str]] EncodedInputPair = Tuple[List[int], List[int]] +# Define type aliases for text-related non-text modalities +AudioInput = Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput], List[float]] # Slow tokenizers used to be saved in three separated files SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json" diff --git a/tests/models/align/test_processor_align.py b/tests/models/align/test_processor_align.py index 12fbea5a50cd..9c5f4cccca88 100644 --- a/tests/models/align/test_processor_align.py +++ b/tests/models/align/test_processor_align.py @@ -159,7 +159,6 @@ def test_tokenizer(self): encoded_processor = processor(text=input_str) encoded_tok = tokenizer(input_str, padding="max_length", max_length=64) - for key in encoded_tok.keys(): self.assertListEqual(encoded_tok[key], encoded_processor[key]) diff --git a/tests/test_image_processing_common.py b/tests/test_image_processing_common.py index 90c1a4e7e127..074da8a2bbab 100644 --- a/tests/test_image_processing_common.py +++ b/tests/test_image_processing_common.py @@ -291,6 +291,7 @@ def test_call_numpy_4_channels(self): ) def test_image_processor_preprocess_arguments(self): + # Test that an instantiated image processor is called with the correct arg spec image_processor = self.image_processing_class(**self.image_processor_dict) if hasattr(image_processor, "_valid_processor_keys") and hasattr(image_processor, "preprocess"): preprocess_parameter_names = inspect.getfullargspec(image_processor.preprocess).args From bb8ac70e1f27de6c7c8e85975dbf3fb248a2c606 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Mon, 3 Jun 2024 10:58:45 +0200 Subject: [PATCH 02/45] fix mutable 
default :eyes: --- .../models/align/processing_align.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py index 0ddc50c094c0..431d2fec625f 100644 --- a/src/transformers/models/align/processing_align.py +++ b/src/transformers/models/align/processing_align.py @@ -132,9 +132,9 @@ def __call__( images: ImageInput = None, audio=None, videos=None, - text_kwargs: AlignProcessorKwargs.text_kwargs = {}, - images_kwargs: AlignProcessorKwargs.images_kwargs = {}, - common_kwargs: AlignProcessorKwargs.common_kwargs = {}, + text_kwargs: AlignProcessorKwargs.text_kwargs = None, + images_kwargs: AlignProcessorKwargs.images_kwargs = None, + common_kwargs: AlignProcessorKwargs.common_kwargs = None, **kwargs: AlignProcessorKwargs, ) -> BatchEncoding: """ @@ -169,6 +169,15 @@ def __call__( """ if text is None and images is None: raise ValueError("You must specify either text or images.") + + # set kwargs as empty dicts to avoid default mutable + if text_kwargs is None: + text_kwargs = {} + if images_kwargs is None: + images_kwargs = {} + if common_kwargs is None: + common_kwargs = {} + # Init with default values if they exist text_kwargs = AlignProcessorKwargs._defaults.get("text_kwargs", {}).copy() @@ -177,6 +186,7 @@ def __call__( {k: v for k, v in self.tokenizer.init_kwargs.items() if k in AlignProcessorKwargs.text_kwargs} ) # then get passed per-modality dictionaries if they exist + text_kwargs.update(kwargs.pop("text_kwargs", {})) images_kwargs.update(kwargs.pop("images_kwargs", {})) common_kwargs.update(kwargs.pop("common_kwargs", {})) From cd8c6018361a7604f063590a04bf10e05ff1cd72 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Mon, 3 Jun 2024 10:59:15 +0200 Subject: [PATCH 03/45] add configuration test --- tests/models/align/test_processor_align.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git 
a/tests/models/align/test_processor_align.py b/tests/models/align/test_processor_align.py index 9c5f4cccca88..eaa54f911292 100644 --- a/tests/models/align/test_processor_align.py +++ b/tests/models/align/test_processor_align.py @@ -204,3 +204,16 @@ def test_model_input_names(self): inputs = processor(text=input_str, images=image_input) self.assertListEqual(list(inputs.keys()), processor.model_input_names) + + def test_defaults_preserved(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer(max_length=117) + + processor = AlignProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + + self.assertEqual(len(inputs["input_ids"]), 117) From f00c85277dd083cdd259727cffd679b3c98e6731 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Mon, 3 Jun 2024 12:28:41 +0200 Subject: [PATCH 04/45] handle structured kwargs w defaults + add test --- .../models/align/processing_align.py | 10 ++++---- tests/models/align/test_processor_align.py | 23 +++++++++++++++++++ 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py index 431d2fec625f..e8c4857814bd 100644 --- a/src/transformers/models/align/processing_align.py +++ b/src/transformers/models/align/processing_align.py @@ -169,7 +169,6 @@ def __call__( """ if text is None and images is None: raise ValueError("You must specify either text or images.") - # set kwargs as empty dicts to avoid default mutable if text_kwargs is None: text_kwargs = {} @@ -177,19 +176,18 @@ def __call__( images_kwargs = {} if common_kwargs is None: common_kwargs = {} - # Init with default values if they exist - text_kwargs = AlignProcessorKwargs._defaults.get("text_kwargs", {}).copy() + default_text_kwargs = AlignProcessorKwargs._defaults.get("text_kwargs", {}).copy() # then 
override with tokenizer-level arguments passed - text_kwargs.update( + default_text_kwargs.update( {k: v for k, v in self.tokenizer.init_kwargs.items() if k in AlignProcessorKwargs.text_kwargs} ) # then get passed per-modality dictionaries if they exist - - text_kwargs.update(kwargs.pop("text_kwargs", {})) + text_kwargs = {**default_text_kwargs, **text_kwargs, **kwargs.pop("text_kwargs", {})} images_kwargs.update(kwargs.pop("images_kwargs", {})) common_kwargs.update(kwargs.pop("common_kwargs", {})) + # then merge kwargs by name for text_key in AlignProcessorKwargs.text_kwargs.keys(): text_kwarg_value = kwargs.pop(text_key, None) diff --git a/tests/models/align/test_processor_align.py b/tests/models/align/test_processor_align.py index eaa54f911292..86bc19817acd 100644 --- a/tests/models/align/test_processor_align.py +++ b/tests/models/align/test_processor_align.py @@ -217,3 +217,26 @@ def test_defaults_preserved(self): inputs = processor(text=input_str, images=image_input) self.assertEqual(len(inputs["input_ids"]), 117) + + def test_structured_kwargs(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = AlignProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + # Define the kwargs for each modality + common_kwargs = {"return_tensors": "pt"} + images_kwargs = {"crop_size": {"height": 214, "width": 214}} + text_kwargs = {"padding": "max_length", "max_length": 76} + + # Combine them into a single dictionary + all_kwargs = {"images_kwargs": images_kwargs, "text_kwargs": text_kwargs, "common_kwargs": common_kwargs} + + inputs = processor(text=input_str, images=image_input, **all_kwargs) + print(inputs["input_ids"]) + self.assertEquals(inputs["pixel_values"].shape[2], 214) + + self.assertEqual(len(inputs["input_ids"][0]), 76) From 693036f94411cddf0451ea1a61a84747ab341b06 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Mon, 3 Jun 
2024 13:48:57 +0200 Subject: [PATCH 05/45] protect torch-specific test --- tests/models/align/test_processor_align.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/models/align/test_processor_align.py b/tests/models/align/test_processor_align.py index 86bc19817acd..823824494028 100644 --- a/tests/models/align/test_processor_align.py +++ b/tests/models/align/test_processor_align.py @@ -23,7 +23,7 @@ from transformers import BertTokenizer, BertTokenizerFast from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES -from transformers.testing_utils import require_vision +from transformers.testing_utils import require_vision, require_torch from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available @@ -218,6 +218,7 @@ def test_defaults_preserved(self): self.assertEqual(len(inputs["input_ids"]), 117) + @require_torch def test_structured_kwargs(self): image_processor = self.get_image_processor() tokenizer = self.get_tokenizer() From 766da3a6f82f7ffd0a5519635cfeb27f91576398 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Mon, 3 Jun 2024 13:49:18 +0200 Subject: [PATCH 06/45] fix style --- tests/models/align/test_processor_align.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/align/test_processor_align.py b/tests/models/align/test_processor_align.py index 823824494028..aab99b6a3254 100644 --- a/tests/models/align/test_processor_align.py +++ b/tests/models/align/test_processor_align.py @@ -23,7 +23,7 @@ from transformers import BertTokenizer, BertTokenizerFast from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES -from transformers.testing_utils import require_vision, require_torch +from transformers.testing_utils import require_torch, require_vision from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available From 844394de7e5cd93c4ce5d9fe535467ac964a021e Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Mon, 3 Jun 2024 14:11:22 +0200 Subject: [PATCH 
07/45] fix --- tests/models/align/test_processor_align.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/models/align/test_processor_align.py b/tests/models/align/test_processor_align.py index aab99b6a3254..5332f1469333 100644 --- a/tests/models/align/test_processor_align.py +++ b/tests/models/align/test_processor_align.py @@ -237,7 +237,6 @@ def test_structured_kwargs(self): all_kwargs = {"images_kwargs": images_kwargs, "text_kwargs": text_kwargs, "common_kwargs": common_kwargs} inputs = processor(text=input_str, images=image_input, **all_kwargs) - print(inputs["input_ids"]) self.assertEquals(inputs["pixel_values"].shape[2], 214) self.assertEqual(len(inputs["input_ids"][0]), 76) From c19bbc6f5e00d6e8e2c48e034295e5b873a26121 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Tue, 4 Jun 2024 13:26:42 +0200 Subject: [PATCH 08/45] fix assertEqual --- tests/models/align/test_processor_align.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/align/test_processor_align.py b/tests/models/align/test_processor_align.py index 5332f1469333..70349cf8598f 100644 --- a/tests/models/align/test_processor_align.py +++ b/tests/models/align/test_processor_align.py @@ -237,6 +237,6 @@ def test_structured_kwargs(self): all_kwargs = {"images_kwargs": images_kwargs, "text_kwargs": text_kwargs, "common_kwargs": common_kwargs} inputs = processor(text=input_str, images=image_input, **all_kwargs) - self.assertEquals(inputs["pixel_values"].shape[2], 214) + self.assertEqual(inputs["pixel_values"].shape[2], 214) self.assertEqual(len(inputs["input_ids"][0]), 76) From 3c38119854061131a377c07353202d1dbadce096 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Tue, 4 Jun 2024 13:27:28 +0200 Subject: [PATCH 09/45] move kwargs merging to processing common --- .../models/align/processing_align.py | 50 ++------- src/transformers/processing_utils.py | 105 ++++++++++++++++++ 2 files changed, 116 insertions(+), 39 deletions(-) diff --git 
a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py index e8c4857814bd..f382e2b90bca 100644 --- a/src/transformers/models/align/processing_align.py +++ b/src/transformers/models/align/processing_align.py @@ -169,52 +169,24 @@ def __call__( """ if text is None and images is None: raise ValueError("You must specify either text or images.") - # set kwargs as empty dicts to avoid default mutable - if text_kwargs is None: - text_kwargs = {} - if images_kwargs is None: - images_kwargs = {} - if common_kwargs is None: - common_kwargs = {} - # Init with default values if they exist - default_text_kwargs = AlignProcessorKwargs._defaults.get("text_kwargs", {}).copy() - - # then override with tokenizer-level arguments passed - default_text_kwargs.update( - {k: v for k, v in self.tokenizer.init_kwargs.items() if k in AlignProcessorKwargs.text_kwargs} + output_kwargs = self._merge_kwargs( + AlignProcessorKwargs, + text_kwargs=text_kwargs, + images_kwargs=images_kwargs, + common_kwargs=common_kwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, ) - # then get passed per-modality dictionaries if they exist - text_kwargs = {**default_text_kwargs, **text_kwargs, **kwargs.pop("text_kwargs", {})} - images_kwargs.update(kwargs.pop("images_kwargs", {})) - common_kwargs.update(kwargs.pop("common_kwargs", {})) - - # then merge kwargs by name - for text_key in AlignProcessorKwargs.text_kwargs.keys(): - text_kwarg_value = kwargs.pop(text_key, None) - if text_kwarg_value is not None: - text_kwargs[text_key] = text_kwarg_value - - for images_key in AlignProcessorKwargs.images_kwargs.keys(): - images_kwarg_value = kwargs.pop(images_key, None) - if images_kwarg_value is not None: - images_kwargs[images_key] = images_kwarg_value - # if something remains in kwargs, it belongs to common - common_kwargs.update(kwargs) - - # all modality-specific kwargs are updated with common kwargs - text_kwargs.update(common_kwargs) - 
images_kwargs.update(common_kwargs) - # then, we can pass correct kwargs to each processor if text is not None: - encoding = self.tokenizer(text, **text_kwargs) + encoding = self.tokenizer(text, **output_kwargs["text_kwargs"]) if images is not None: - image_features = self.image_processor(images, **images_kwargs) + image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) # BC for explicit return_tensors - if "return_tensors" in common_kwargs: - return_tensors = common_kwargs.pop("return_tensors", None) + if "return_tensors" in output_kwargs["common_kwargs"]: + return_tensors = output_kwargs["common_kwargs"].pop("return_tensors", None) if text is not None and images is not None: encoding["pixel_values"] = image_features.pixel_values diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index df6a178bb73f..0390b1c25c42 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -607,6 +607,111 @@ def from_args_and_dict(cls, args, processor_dict: Dict[str, Any], **kwargs): else: return processor + def _merge_kwargs( + self, + ModelProcessorKwargs: ProcessingKwargs, + text_kwargs: Optional[TextKwargs] = None, + images_kwargs: Optional[ImagesKwargs] = None, + common_kwargs: Optional[CommonKwargs] = None, + videos_kwargs: Optional[VideosKwargs] = None, + audio_kwargs: Optional[AudioKwargs] = None, + tokenizer_init_kwargs: Optional[Dict] = None, + **kwargs, + ) -> Dict[str, Dict]: + """ + Method to merge dictionaries of kwargs cleanly separated by modality within a Processor instance. + The order of operations is as follows: + 1) kwargs passed as before have highest priority to preserve BC. They mix modalities and may not result in + correct behaviour. 
+ ```python + high_priority_kwargs = {"crop_size" = (224, 224), "padding" = "max_length"} + processor(..., **high_priority_kwargs) + ``` + 2) kwargs specified as a dictionary and passed to the processor __call__ have second highest priority. + This is the recommended API. + ```python + recommended_priority_kwargs = {"text_kwargs": {"padding":"max_length"}, "images_kwargs": {"crop_size": (224, 224)}} + processor(..., **recommended_priority_kwargs) + ``` + 3) kwargs passed as modality-specific kwargs have third priority. + ```python + processor(..., text_kwargs={"padding": "max_length"}, images_kwargs={"crop_size": (224, 224)}}) + ``` + 4) kwargs passed during instantiation of a modality processor have fourth priority. + ```python + tokenizer = tokenizer_class(..., {"padding": "max_length"}) + image_processor = image_processor_class(...) + processor(tokenizer, image_processor) # will pass max_length unless overriden by kwargs at call + ``` + 5) defaults kwargs specified at processor level have lowest priority. + + Args: + ModelProcessorKwargs (`ProcessingKwargs`): + Typed dictionary of kwargs specifically required by the model passed. + text_kwargs (`TextKwargs`, *optional*): + Typed dictionary of kwargs inputs applied to the text modality processor, i.e. the tokenizer. + images_kwargs (`ImagesKwargs`, *optional*): + Typed dictionary of kwargs inputs applied to the images modality processor. + videos_kwargs (`VideosKwargs`, *optional*): + Typed dictionary of kwargs inputs applied to the videos modality processor. + audio_kwargs (`AudioKwargs`, *optional*): + Typed dictionary of kwargs inputs applied to the audio modality processor. + tokenizer_init_kwargs (`Dict`, *optional*): + Dictionary of kwargs the tokenizer was instantiated with and need to take precedence over other kwargs. + + Returns: + output_kwargs (`Dict`): + Dictionary of per-modality kwargs to be passed to each modality-specific processor. 
+ + """ + + # Initialize dictionaries + output_kwargs = { + "text_kwargs": text_kwargs or {}, + "images_kwargs": images_kwargs or {}, + "audio_kwargs": audio_kwargs or {}, + "videos_kwargs": videos_kwargs or {}, + "common_kwargs": common_kwargs or {}, + } + + default_kwargs = { + "text_kwargs": {}, + "images_kwargs": {}, + "audio_kwargs": {}, + "videos_kwargs": {}, + "common_kwargs": {}, + } + + # get defaults from set model processor kwargs if they exist + for modality in default_kwargs: + default_kwargs[modality] = ModelProcessorKwargs._defaults.get(modality, {}).copy() + # then override with tokenizer-level arguments passed + if tokenizer_init_kwargs: + default_kwargs["text_kwargs"].update( + {k: v for k, v in tokenizer_init_kwargs.items() if k in ModelProcessorKwargs.text_kwargs} + ) + + # then get passed per-modality dictionaries if they exist + for modality in output_kwargs: + output_kwargs[modality] = { + **default_kwargs[modality], + **output_kwargs[modality], + **kwargs.pop(modality, {}), + } + # then merge kwargs by name + for modality_key in ModelProcessorKwargs[modality].__annotations__.keys(): + modality_kwarg_value = kwargs.pop(modality_key, None) + if modality_kwarg_value is not None: + output_kwargs[modality] = modality_kwarg_value + + # if something remains in kwargs, it belongs to common + output_kwargs["common_kwargs"].update(kwargs) + # all modality-specific kwargs are updated with common kwargs + for modality in output_kwargs: + output_kwargs[modality].update(output_kwargs["common_kwargs"]) + + return output_kwargs + @classmethod def from_pretrained( cls, From 81ae819d802d6a8e6523e1f6649d172eff9e4d75 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Wed, 5 Jun 2024 18:12:39 +0200 Subject: [PATCH 10/45] rework kwargs for type hinting --- .../models/align/processing_align.py | 48 ++++-------- src/transformers/processing_utils.py | 75 +++++++++---------- tests/models/align/test_processor_align.py | 48 ++++++++++-- 3 files changed, 91 
insertions(+), 80 deletions(-) diff --git a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py index f382e2b90bca..9934384e2b07 100644 --- a/src/transformers/models/align/processing_align.py +++ b/src/transformers/models/align/processing_align.py @@ -16,7 +16,7 @@ Image/Text processor class for ALIGN """ -from typing import List, Union +from typing import List, Union, Unpack from ...image_utils import ImageInput from ...processing_utils import ( @@ -37,7 +37,7 @@ import torch # noqa: F401 -class AlignProcessorKwargs(ProcessingKwargs, total=False): +class AlignProcessorKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwargs, total=False): """ Inherits from `ProcessingKwargs` to provide: 1) Additional keys that this model requires to process inputs. @@ -45,31 +45,18 @@ class AlignProcessorKwargs(ProcessingKwargs, total=False): New keys have to be defined as follows to ensure type hinting is done correctly. ```python - common_kwargs: CommonKwargs = { - **CommonKwargs.__annotations__, - } - text_kwargs: TextKwargs = { - **TextKwargs.__annotations__, - "a_new_text_boolean_key": Optional[bool], - } - images_kwargs: ImagesKwargs = { - **ImagesKwargs.__annotations__, - "a_new_image_processing_key": Optional[int] - } - ``` + images_kwargs: ImagesKwargs = {"new_image_kwarg": Optional[bool]} - """ - - common_kwargs: CommonKwargs = { - **CommonKwargs.__annotations__, - } - text_kwargs: TextKwargs = { - **TextKwargs.__annotations__, - } - images_kwargs: ImagesKwargs = { - **ImagesKwargs.__annotations__, + _defaults = { + "text_kwargs": { + "padding": "max_length", + "max_length": 64, + }, } + ``` + """ + _defaults = { "text_kwargs": { "padding": "max_length", @@ -106,9 +93,10 @@ class AlignProcessor(ProcessorMixin): processor(images=your_pil_image, text=["What is that?"], **all_kwargs) - # passing directly any number of kwargs is also supported, but not recommended + # passing directly any number of kwargs flattened is 
also supported - processor(images=your_pil_image, text=["What is that?"], padding="do_not_pad) + all_kwargs = {"return_tensors": "pt", "crop_size": {"height": 214, "width": 214}, "padding": "max_length", "max_length": 76} + processor(images=your_pil_image, text=["What is that?"], **all_kwargs) ``` Args: @@ -132,10 +120,7 @@ def __call__( images: ImageInput = None, audio=None, videos=None, - text_kwargs: AlignProcessorKwargs.text_kwargs = None, - images_kwargs: AlignProcessorKwargs.images_kwargs = None, - common_kwargs: AlignProcessorKwargs.common_kwargs = None, - **kwargs: AlignProcessorKwargs, + **kwargs: Unpack[AlignProcessorKwargs], ) -> BatchEncoding: """ Main method to prepare text(s) and image(s) to be fed as input to the model. This method forwards the `text` @@ -171,9 +156,6 @@ def __call__( raise ValueError("You must specify either text or images.") output_kwargs = self._merge_kwargs( AlignProcessorKwargs, - text_kwargs=text_kwargs, - images_kwargs=images_kwargs, - common_kwargs=common_kwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs, ) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 0390b1c25c42..49cd2a471b5b 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -244,12 +244,22 @@ class CommonKwargs(TypedDict, total=False): return_tensors: Optional[Union[str, TensorType]] -class ProcessingKwargs(TypedDict, total=False): - common_kwargs: CommonKwargs - text_kwargs: TextKwargs - images_kwargs: ImagesKwargs - audio_kwargs: AudioKwargs - videos_kwargs: VideosKwargs +class ProcessingKwargs(TextKwargs, ImagesKwargs, VideosKwargs, AudioKwargs, CommonKwargs, total=False): + common_kwargs: CommonKwargs = { + **CommonKwargs.__annotations__, + } + text_kwargs: TextKwargs = { + **TextKwargs.__annotations__, + } + images_kwargs: ImagesKwargs = { + **ImagesKwargs.__annotations__, + } + videos_kwargs: VideosKwargs = { + **VideosKwargs.__annotations__, + } + 
audio_kwargs: AudioKwargs = { + **AudioKwargs.__annotations__, + } class ProcessorMixin(PushToHubMixin): @@ -610,11 +620,6 @@ def from_args_and_dict(cls, args, processor_dict: Dict[str, Any], **kwargs): def _merge_kwargs( self, ModelProcessorKwargs: ProcessingKwargs, - text_kwargs: Optional[TextKwargs] = None, - images_kwargs: Optional[ImagesKwargs] = None, - common_kwargs: Optional[CommonKwargs] = None, - videos_kwargs: Optional[VideosKwargs] = None, - audio_kwargs: Optional[AudioKwargs] = None, tokenizer_init_kwargs: Optional[Dict] = None, **kwargs, ) -> Dict[str, Dict]: @@ -648,30 +653,21 @@ def _merge_kwargs( Args: ModelProcessorKwargs (`ProcessingKwargs`): Typed dictionary of kwargs specifically required by the model passed. - text_kwargs (`TextKwargs`, *optional*): - Typed dictionary of kwargs inputs applied to the text modality processor, i.e. the tokenizer. - images_kwargs (`ImagesKwargs`, *optional*): - Typed dictionary of kwargs inputs applied to the images modality processor. - videos_kwargs (`VideosKwargs`, *optional*): - Typed dictionary of kwargs inputs applied to the videos modality processor. - audio_kwargs (`AudioKwargs`, *optional*): - Typed dictionary of kwargs inputs applied to the audio modality processor. tokenizer_init_kwargs (`Dict`, *optional*): - Dictionary of kwargs the tokenizer was instantiated with and need to take precedence over other kwargs. + Dictionary of kwargs the tokenizer was instantiated with and need to take precedence over defaults. Returns: output_kwargs (`Dict`): Dictionary of per-modality kwargs to be passed to each modality-specific processor. 
""" - # Initialize dictionaries output_kwargs = { - "text_kwargs": text_kwargs or {}, - "images_kwargs": images_kwargs or {}, - "audio_kwargs": audio_kwargs or {}, - "videos_kwargs": videos_kwargs or {}, - "common_kwargs": common_kwargs or {}, + "text_kwargs": {}, + "images_kwargs": {}, + "audio_kwargs": {}, + "videos_kwargs": {}, + "common_kwargs": {}, } default_kwargs = { @@ -685,31 +681,28 @@ def _merge_kwargs( # get defaults from set model processor kwargs if they exist for modality in default_kwargs: default_kwargs[modality] = ModelProcessorKwargs._defaults.get(modality, {}).copy() - # then override with tokenizer-level arguments passed - if tokenizer_init_kwargs: - default_kwargs["text_kwargs"].update( - {k: v for k, v in tokenizer_init_kwargs.items() if k in ModelProcessorKwargs.text_kwargs} - ) - - # then get passed per-modality dictionaries if they exist + # update modality kwargs with passed kwargs for modality in output_kwargs: output_kwargs[modality] = { **default_kwargs[modality], - **output_kwargs[modality], - **kwargs.pop(modality, {}), } - # then merge kwargs by name - for modality_key in ModelProcessorKwargs[modality].__annotations__.keys(): - modality_kwarg_value = kwargs.pop(modality_key, None) - if modality_kwarg_value is not None: - output_kwargs[modality] = modality_kwarg_value + for modality_key in ModelProcessorKwargs.__annotations__[modality].__annotations__.keys(): + # init with tokenizer init kwargs if necessary + if modality_key in tokenizer_init_kwargs: + output_kwargs[modality][modality_key] = tokenizer_init_kwargs[modality_key] + # check if we received a structured kwarg dict or not to handle it correctly + if modality in kwargs: + kwarg_value = kwargs[modality].pop(modality_key, "__empty__") + else: + kwarg_value = kwargs.pop(modality_key, "__empty__") + if kwarg_value != "__empty__": + output_kwargs[modality][modality_key] = kwarg_value # if something remains in kwargs, it belongs to common 
output_kwargs["common_kwargs"].update(kwargs) # all modality-specific kwargs are updated with common kwargs for modality in output_kwargs: output_kwargs[modality].update(output_kwargs["common_kwargs"]) - return output_kwargs @classmethod diff --git a/tests/models/align/test_processor_align.py b/tests/models/align/test_processor_align.py index 70349cf8598f..31413f28c66d 100644 --- a/tests/models/align/test_processor_align.py +++ b/tests/models/align/test_processor_align.py @@ -205,7 +205,8 @@ def test_model_input_names(self): self.assertListEqual(list(inputs.keys()), processor.model_input_names) - def test_defaults_preserved(self): + # TODO move these tests to a common Mixin + def test_defaults_preserved_kwargs(self): image_processor = self.get_image_processor() tokenizer = self.get_tokenizer(max_length=117) @@ -218,6 +219,19 @@ def test_defaults_preserved(self): self.assertEqual(len(inputs["input_ids"]), 117) + @require_torch + def test_defaults_preserved_image_kwargs(self): + image_processor = self.get_image_processor(crop_size=(234, 234)) + tokenizer = self.get_tokenizer(max_length=117) + + processor = AlignProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + self.assertEqual(len(inputs["pixel_values"][0][0]), 234) + @require_torch def test_structured_kwargs(self): image_processor = self.get_image_processor() @@ -229,12 +243,34 @@ def test_structured_kwargs(self): image_input = self.prepare_image_inputs() # Define the kwargs for each modality - common_kwargs = {"return_tensors": "pt"} - images_kwargs = {"crop_size": {"height": 214, "width": 214}} - text_kwargs = {"padding": "max_length", "max_length": 76} + all_kwargs = { + "return_tensors": "pt", + "crop_size": {"height": 214, "width": 214}, + "padding": "max_length", + "max_length": 76, + } + + inputs = processor(text=input_str, images=image_input, **all_kwargs) + 
self.assertEqual(inputs["pixel_values"].shape[2], 214) - # Combine them into a single dictionary - all_kwargs = {"images_kwargs": images_kwargs, "text_kwargs": text_kwargs, "common_kwargs": common_kwargs} + self.assertEqual(len(inputs["input_ids"][0]), 76) + + @require_torch + def test_structured_kwargs_nested(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = AlignProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "images_kwargs": {"crop_size": {"height": 214, "width": 214}}, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } inputs = processor(text=input_str, images=image_input, **all_kwargs) self.assertEqual(inputs["pixel_values"].shape[2], 214) From ce4abcdcc47bafc221e9554bf52dfe00d4ae79bb Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Fri, 7 Jun 2024 13:39:11 +0200 Subject: [PATCH 11/45] just get Unpack from extensions --- src/transformers/models/align/processing_align.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py index 9934384e2b07..a20f2d027aa5 100644 --- a/src/transformers/models/align/processing_align.py +++ b/src/transformers/models/align/processing_align.py @@ -16,7 +16,13 @@ Image/Text processor class for ALIGN """ -from typing import List, Union, Unpack +from typing import List, Union + + +try: + from typing import Unpack +except ImportError: + from typing_extensions import Unpack from ...image_utils import ImageInput from ...processing_utils import ( From 3acdf289630b289efd9657ef6c5574eb86e3107c Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Fri, 7 Jun 2024 13:40:15 +0200 Subject: [PATCH 12/45] run-slow[align] From 404239fb63aeeeb4016d30db8deb6ebcf5c5faf7 
Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Fri, 7 Jun 2024 15:20:29 +0200 Subject: [PATCH 13/45] handle kwargs passed as nested dict --- src/transformers/processing_utils.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 49cd2a471b5b..383788275fff 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -698,8 +698,14 @@ def _merge_kwargs( if kwarg_value != "__empty__": output_kwargs[modality][modality_key] = kwarg_value - # if something remains in kwargs, it belongs to common - output_kwargs["common_kwargs"].update(kwargs) + # if something remains in kwargs, it belongs to common after flattening + if set(kwargs) & set(default_kwargs): + # here kwargs is dictionary-based since it shares keys with default set + [output_kwargs["common_kwargs"].update(subdict) for _, subdict in kwargs.items()] + else: + # here it's a flat dict + output_kwargs["common_kwargs"].update(kwargs) + # all modality-specific kwargs are updated with common kwargs for modality in output_kwargs: output_kwargs[modality].update(output_kwargs["common_kwargs"]) From 603be40f520ac555428c50d99b958de84e7087c8 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Fri, 7 Jun 2024 15:21:18 +0200 Subject: [PATCH 14/45] add from_pretrained test for nested kwargs handling --- tests/models/align/test_processor_align.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/models/align/test_processor_align.py b/tests/models/align/test_processor_align.py index 31413f28c66d..3108e48cadf9 100644 --- a/tests/models/align/test_processor_align.py +++ b/tests/models/align/test_processor_align.py @@ -261,7 +261,24 @@ def test_structured_kwargs_nested(self): tokenizer = self.get_tokenizer() processor = AlignProcessor(tokenizer=tokenizer, image_processor=image_processor) + input_str = "lower newer" + image_input = self.prepare_image_inputs() + # 
Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "images_kwargs": {"crop_size": {"height": 214, "width": 214}}, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } + + inputs = processor(text=input_str, images=image_input, **all_kwargs) + self.assertEqual(inputs["pixel_values"].shape[2], 214) + + self.assertEqual(len(inputs["input_ids"][0]), 76) + + @require_torch + def test_structured_kwargs_nested_from_dict(self): + processor = AlignProcessor.from_pretrained("kakaobrain/align-base") input_str = "lower newer" image_input = self.prepare_image_inputs() From 71c9d6c036cde611ba3086d7be38be79db3c87d7 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Fri, 7 Jun 2024 15:22:19 +0200 Subject: [PATCH 15/45] [run-slow]align From 26383c52e693613c9266f549d77ceeb2ba5c61de Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Fri, 7 Jun 2024 15:47:48 +0200 Subject: [PATCH 16/45] update documentation + imports --- .../models/align/processing_align.py | 3 ++- src/transformers/processing_utils.py | 25 ++++++++++--------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py index a20f2d027aa5..7559d58fa1c6 100644 --- a/src/transformers/models/align/processing_align.py +++ b/src/transformers/models/align/processing_align.py @@ -31,9 +31,10 @@ ProcessingKwargs, ProcessorMixin, TextKwargs, + is_vision_available, ) from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput -from ...utils import is_torch_available, is_vision_available +from ...utils import is_torch_available # TODO (@molbap) This is a bother, forward references from TypedDict are resolved and need this to work diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 383788275fff..432d476cd572 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py 
@@ -626,30 +626,31 @@ def _merge_kwargs(
         """
         Method to merge dictionaries of kwargs cleanly separated by modality within a Processor instance.
         The order of operations is as follows:
-        1) kwargs passed as before have highest priority to preserve BC. They mix modalities and may not result in
-            correct behaviour.
+        1) kwargs passed as before have highest priority to preserve BC.
             ```python
             high_priority_kwargs = {"crop_size" = (224, 224), "padding" = "max_length"}
             processor(..., **high_priority_kwargs)
             ```
-        2) kwargs specified as a dictionary and passed to the processor __call__ have second highest priority.
-            This is the recommended API.
-            ```python
-            recommended_priority_kwargs = {"text_kwargs": {"padding":"max_length"}, "images_kwargs": {"crop_size": (224, 224)}}
-            processor(..., **recommended_priority_kwargs)
-            ```
-        3) kwargs passed as modality-specific kwargs have third priority.
+        2) kwargs passed as modality-specific kwargs have second priority. This is the recommended API.
             ```python
             processor(..., text_kwargs={"padding": "max_length"}, images_kwargs={"crop_size": (224, 224)}})
             ```
-        4) kwargs passed during instantiation of a modality processor have fourth priority.
+        3) kwargs passed during instantiation of a modality processor have third priority.
             ```python
             tokenizer = tokenizer_class(..., {"padding": "max_length"})
             image_processor = image_processor_class(...)
             processor(tokenizer, image_processor) # will pass max_length unless overriden by kwargs at call
             ```
-        5) defaults kwargs specified at processor level have lowest priority.
-
+        4) defaults kwargs specified at processor level have lowest priority.
+            ```python
+            class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwargs, total=False):
+                _defaults = {
+                    "text_kwargs": {
+                        "padding": "max_length",
+                        "max_length": 64,
+                    },
+                }
+            ```
         Args:
             ModelProcessorKwargs (`ProcessingKwargs`):
                 Typed dictionary of kwargs specifically required by the model passed.
From 4521f4fd5f384916a231cd9d97070b3662690445 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Fri, 7 Jun 2024 15:51:31 +0200 Subject: [PATCH 17/45] update audio inputs --- src/transformers/tokenization_utils_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 9d401501885e..1471aa25d05a 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -127,7 +127,7 @@ class EncodingFast: EncodedInputPair = Tuple[List[int], List[int]] # Define type aliases for text-related non-text modalities -AudioInput = Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput], List[float]] +AudioInput = Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]] # Slow tokenizers used to be saved in three separated files SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json" From b96eb6483d91c74633584a38867ee5b4721ff83a Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Fri, 7 Jun 2024 15:56:26 +0200 Subject: [PATCH 18/45] protect audio types, silly --- src/transformers/tokenization_utils_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 1471aa25d05a..c0410f75b2c2 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -127,7 +127,7 @@ class EncodingFast: EncodedInputPair = Tuple[List[int], List[int]] # Define type aliases for text-related non-text modalities -AudioInput = Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]] +AudioInput = Union["np.ndarray", "torch.Tensor", List["np.ndarray"], List["torch.Tensor"]] # Slow tokenizers used to be saved in three separated files SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json" From 9c5c01cd69ff5a5f035df81db24298fe7125ad38 Mon Sep 17 00:00:00 2001 From: 
Pablo Montalvo Date: Fri, 7 Jun 2024 17:31:43 +0200 Subject: [PATCH 19/45] try removing imports --- src/transformers/models/align/processing_align.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py index 7559d58fa1c6..04d50f8188ad 100644 --- a/src/transformers/models/align/processing_align.py +++ b/src/transformers/models/align/processing_align.py @@ -31,17 +31,17 @@ ProcessingKwargs, ProcessorMixin, TextKwargs, - is_vision_available, ) from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput -from ...utils import is_torch_available +""" # TODO (@molbap) This is a bother, forward references from TypedDict are resolved and need this to work if is_vision_available(): import PIL # noqa: F401 if is_torch_available(): import torch # noqa: F401 +""" class AlignProcessorKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwargs, total=False): From 3ccb50521d3bc8f3d838efef5a0b689ccecb53e9 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Fri, 7 Jun 2024 17:42:57 +0200 Subject: [PATCH 20/45] make things simpler --- .../models/align/processing_align.py | 28 ------------------- src/transformers/processing_utils.py | 20 +++++++++++++ 2 files changed, 20 insertions(+), 28 deletions(-) diff --git a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py index 04d50f8188ad..c89c2f5e1b62 100644 --- a/src/transformers/models/align/processing_align.py +++ b/src/transformers/models/align/processing_align.py @@ -35,35 +35,7 @@ from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput -""" -# TODO (@molbap) This is a bother, forward references from TypedDict are resolved and need this to work -if is_vision_available(): - import PIL # noqa: F401 -if is_torch_available(): - import torch # noqa: F401 -""" - - class AlignProcessorKwargs(ProcessingKwargs, 
CommonKwargs, TextKwargs, ImagesKwargs, total=False):
-    """
-    Inherits from `ProcessingKwargs` to provide:
-    1) Additional keys that this model requires to process inputs.
-    2) Default values for extra keys.
-    New keys have to be defined as follows to ensure type hinting is done correctly.
-
-    ```python
-    images_kwargs: ImagesKwargs = {"new_image_kwarg": Optional[bool]}
-
-    _defaults = {
-        "text_kwargs": {
-            "padding": "max_length",
-            "max_length": 64,
-        },
-    }
-
-    ```
-    """
-
     _defaults = {
         "text_kwargs": {
             "padding": "max_length",
diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py
index 432d476cd572..113abd603601 100644
--- a/src/transformers/processing_utils.py
+++ b/src/transformers/processing_utils.py
@@ -245,6 +245,26 @@ class CommonKwargs(TypedDict, total=False):


 class ProcessingKwargs(TextKwargs, ImagesKwargs, VideosKwargs, AudioKwargs, CommonKwargs, total=False):
+    """
+    Base class for kwargs passing to processors.
+    A model should have its own `ModelProcessorKwargs` class that inherits from `ProcessingKwargs` to provide:
+    1) Additional, typed keys that this model requires to process inputs.
+    2) Default values for existing keys.
+    New keys have to be defined as follows to ensure type hinting is done correctly.
+ + ```python + images_kwargs: ImagesKwargs = {"new_image_kwarg": Optional[bool]} + + _defaults = { + "text_kwargs": { + "padding": "max_length", + "max_length": 64, + }, + } + + ``` + """ + common_kwargs: CommonKwargs = { **CommonKwargs.__annotations__, } From 142acf302adb346f1c767606f989d270970cd1e7 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Fri, 7 Jun 2024 18:02:18 +0200 Subject: [PATCH 21/45] simplerer --- src/transformers/models/align/processing_align.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py index c89c2f5e1b62..b2af0997ac94 100644 --- a/src/transformers/models/align/processing_align.py +++ b/src/transformers/models/align/processing_align.py @@ -26,16 +26,13 @@ from ...image_utils import ImageInput from ...processing_utils import ( - CommonKwargs, - ImagesKwargs, ProcessingKwargs, ProcessorMixin, - TextKwargs, ) from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput -class AlignProcessorKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwargs, total=False): +class AlignProcessorKwargs(ProcessingKwargs, total=False): _defaults = { "text_kwargs": { "padding": "max_length", From 60a5730877cb5776d834307ca55ec6a97b35e152 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Mon, 10 Jun 2024 11:38:10 +0200 Subject: [PATCH 22/45] move out kwargs test to common mixin --- tests/models/align/test_processor_align.py | 97 +--------------- tests/test_processing_common.py | 122 +++++++++++++++++++++ 2 files changed, 128 insertions(+), 91 deletions(-) diff --git a/tests/models/align/test_processor_align.py b/tests/models/align/test_processor_align.py index 3108e48cadf9..3c904e59a883 100644 --- a/tests/models/align/test_processor_align.py +++ b/tests/models/align/test_processor_align.py @@ -23,9 +23,11 @@ from transformers import BertTokenizer, BertTokenizerFast from 
transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES -from transformers.testing_utils import require_torch, require_vision +from transformers.testing_utils import require_vision from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available +from ...test_processing_common import ProcessorTesterMixin + if is_vision_available(): from PIL import Image @@ -34,7 +36,9 @@ @require_vision -class AlignProcessorTest(unittest.TestCase): +class AlignProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = AlignProcessor + def setUp(self): self.tmpdirname = tempfile.mkdtemp() @@ -204,92 +208,3 @@ def test_model_input_names(self): inputs = processor(text=input_str, images=image_input) self.assertListEqual(list(inputs.keys()), processor.model_input_names) - - # TODO move these tests to a common Mixin - def test_defaults_preserved_kwargs(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer(max_length=117) - - processor = AlignProcessor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - - self.assertEqual(len(inputs["input_ids"]), 117) - - @require_torch - def test_defaults_preserved_image_kwargs(self): - image_processor = self.get_image_processor(crop_size=(234, 234)) - tokenizer = self.get_tokenizer(max_length=117) - - processor = AlignProcessor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - self.assertEqual(len(inputs["pixel_values"][0][0]), 234) - - @require_torch - def test_structured_kwargs(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = AlignProcessor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "lower newer" - image_input = 
self.prepare_image_inputs() - - # Define the kwargs for each modality - all_kwargs = { - "return_tensors": "pt", - "crop_size": {"height": 214, "width": 214}, - "padding": "max_length", - "max_length": 76, - } - - inputs = processor(text=input_str, images=image_input, **all_kwargs) - self.assertEqual(inputs["pixel_values"].shape[2], 214) - - self.assertEqual(len(inputs["input_ids"][0]), 76) - - @require_torch - def test_structured_kwargs_nested(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = AlignProcessor(tokenizer=tokenizer, image_processor=image_processor) - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - # Define the kwargs for each modality - all_kwargs = { - "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"crop_size": {"height": 214, "width": 214}}, - "text_kwargs": {"padding": "max_length", "max_length": 76}, - } - - inputs = processor(text=input_str, images=image_input, **all_kwargs) - self.assertEqual(inputs["pixel_values"].shape[2], 214) - - self.assertEqual(len(inputs["input_ids"][0]), 76) - - @require_torch - def test_structured_kwargs_nested_from_dict(self): - processor = AlignProcessor.from_pretrained("kakaobrain/align-base") - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - # Define the kwargs for each modality - all_kwargs = { - "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"crop_size": {"height": 214, "width": 214}}, - "text_kwargs": {"padding": "max_length", "max_length": 76}, - } - - inputs = processor(text=input_str, images=image_input, **all_kwargs) - self.assertEqual(inputs["pixel_values"].shape[2], 214) - - self.assertEqual(len(inputs["input_ids"][0]), 76) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index 402e6a735151..dcd5773eeda6 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -18,6 +18,8 @@ import tempfile import unittest +import 
numpy as np + from transformers import CLIPTokenizerFast, ProcessorMixin from transformers.models.auto.processing_auto import processor_class_from_name from transformers.testing_utils import ( @@ -30,6 +32,8 @@ if is_vision_available(): + from PIL import Image + from transformers import CLIPImageProcessor @@ -64,6 +68,15 @@ def get_processor(self): processor = self.processor_class(**components, **self.prepare_processor_dict()) return processor + @require_vision + def prepare_image_inputs(self): + """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, + or a list of PyTorch tensors if one specifies torchify=True. + """ + image_inputs = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)] + image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs] + return image_inputs + def test_processor_to_json_string(self): processor = self.get_processor() obj = json.loads(processor.to_json_string()) @@ -82,6 +95,115 @@ def test_processor_from_and_save_pretrained(self): self.assertEqual(processor_second.to_dict(), processor_first.to_dict()) + # These kwargs-related tests ensure that processors are correctly instantiated. + # they need to be applied only if an image_processor exists. 
+ @require_vision + @require_torch + def test_defaults_preserved_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer", max_length=117) + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + + self.assertEqual(len(inputs["input_ids"]), 117) + + @require_torch + @require_vision + def test_defaults_preserved_image_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor", crop_size=(234, 234)) + tokenizer = self.get_component("tokenizer", max_length=117) + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + self.assertEqual(len(inputs["pixel_values"][0][0]), 234) + + @require_torch + @require_vision + def test_structured_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + # Define the kwargs for each modality + all_kwargs = { + "return_tensors": "pt", + "crop_size": {"height": 214, "width": 214}, + "padding": "max_length", + "max_length": 76, + } + + inputs = processor(text=input_str, 
images=image_input, **all_kwargs) + self.assertEqual(inputs["pixel_values"].shape[2], 214) + + self.assertEqual(len(inputs["input_ids"][0]), 76) + + @require_torch + @require_vision + def test_structured_kwargs_nested(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "images_kwargs": {"crop_size": {"height": 214, "width": 214}}, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } + + inputs = processor(text=input_str, images=image_input, **all_kwargs) + self.assertEqual(inputs["pixel_values"].shape[2], 214) + + self.assertEqual(len(inputs["input_ids"][0]), 76) + + @require_torch + @require_vision + def test_structured_kwargs_nested_from_dict(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "images_kwargs": {"crop_size": {"height": 214, "width": 214}}, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } + + inputs = processor(text=input_str, images=image_input, **all_kwargs) + self.assertEqual(inputs["pixel_values"].shape[2], 214) + + self.assertEqual(len(inputs["input_ids"][0]), 76) + class 
MyProcessor(ProcessorMixin): attributes = ["image_processor", "tokenizer"] From be6c141d3e3a992452e2ee0e7bd6ea3afdaa3883 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Mon, 10 Jun 2024 11:38:31 +0200 Subject: [PATCH 23/45] [run-slow]align From 84135d739f5156d4707d5c7954f6d09ea92285fb Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Mon, 10 Jun 2024 12:15:08 +0200 Subject: [PATCH 24/45] skip tests for old processors --- tests/test_processing_common.py | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index dcd5773eeda6..cf08561e6956 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -14,8 +14,10 @@ # limitations under the License. +import inspect import json import tempfile +import typing import unittest import numpy as np @@ -97,6 +99,20 @@ def test_processor_from_and_save_pretrained(self): # These kwargs-related tests ensure that processors are correctly instantiated. # they need to be applied only if an image_processor exists. + + def skip_processor_without_typed_kwargs(self, processor): + # TODO this signature check is to test only uniformized processors. + # Once all are updated, remove it. 
+ is_kwargs_typed_dict = False + call_signature = inspect.signature(processor.__call__) + for param in call_signature.parameters.values(): + if param.kind == param.VAR_KEYWORD and param.annotation != param.empty: + is_kwargs_typed_dict = ( + hasattr(param.annotation, "__origin__") and param.annotation.__origin__ == typing.Unpack + ) + if not is_kwargs_typed_dict: + self.skipTest(f"{self.processor_class} doesn't have typed kwargs.") + @require_vision @require_torch def test_defaults_preserved_kwargs(self): @@ -106,13 +122,12 @@ def test_defaults_preserved_kwargs(self): tokenizer = self.get_component("tokenizer", max_length=117) processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - + self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor(text=input_str, images=image_input) - - self.assertEqual(len(inputs["input_ids"]), 117) + inputs = processor(text=input_str, images=image_input, return_tensors="pt") + self.assertEqual(len(inputs["input_ids"][0]), 117) @require_torch @require_vision @@ -123,6 +138,7 @@ def test_defaults_preserved_image_kwargs(self): tokenizer = self.get_component("tokenizer", max_length=117) processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() @@ -139,6 +155,7 @@ def test_structured_kwargs(self): tokenizer = self.get_component("tokenizer") processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() @@ -165,6 +182,8 @@ def test_structured_kwargs_nested(self): tokenizer = self.get_component("tokenizer") processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + 
self.skip_processor_without_typed_kwargs(processor) + input_str = "lower newer" image_input = self.prepare_image_inputs() @@ -176,6 +195,8 @@ def test_structured_kwargs_nested(self): } inputs = processor(text=input_str, images=image_input, **all_kwargs) + self.skip_processor_without_typed_kwargs(processor) + self.assertEqual(inputs["pixel_values"].shape[2], 214) self.assertEqual(len(inputs["input_ids"][0]), 76) @@ -185,10 +206,12 @@ def test_structured_kwargs_nested(self): def test_structured_kwargs_nested_from_dict(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer") processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() From ce967aca755ed5b9b8d6b0626f8226bacbe11f3f Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Mon, 10 Jun 2024 12:15:26 +0200 Subject: [PATCH 25/45] [run-slow]align, clip From f78ec52f744710ccd136d662bbb953ba9585bf90 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Mon, 10 Jun 2024 12:25:59 +0200 Subject: [PATCH 26/45] !$#@!! 
protect imports, darn it --- tests/test_processing_common.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index cf08561e6956..d046da8d1efe 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -17,7 +17,12 @@ import inspect import json import tempfile -import typing + + +try: + from typing import Unpack +except ImportError: + from typing_extensions import Unpack import unittest import numpy as np @@ -108,7 +113,7 @@ def skip_processor_without_typed_kwargs(self, processor): for param in call_signature.parameters.values(): if param.kind == param.VAR_KEYWORD and param.annotation != param.empty: is_kwargs_typed_dict = ( - hasattr(param.annotation, "__origin__") and param.annotation.__origin__ == typing.Unpack + hasattr(param.annotation, "__origin__") and param.annotation.__origin__ == Unpack ) if not is_kwargs_typed_dict: self.skipTest(f"{self.processor_class} doesn't have typed kwargs.") From 52fd5ad6cc68a94d6141114c6c7ba6010ad65b59 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Mon, 10 Jun 2024 12:26:09 +0200 Subject: [PATCH 27/45] [run-slow]align, clip From d510030acc89dd02cba79bc6b0d837277c7250ef Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Mon, 10 Jun 2024 17:12:30 +0200 Subject: [PATCH 28/45] [run-slow]align, clip From fd43bcd457d22a70eb66ad84c0f1eb164bb654d4 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Tue, 11 Jun 2024 13:49:54 +0200 Subject: [PATCH 29/45] update doc --- .../models/align/processing_align.py | 26 +++++-------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py index b2af0997ac94..58eb63c45e69 100644 --- a/src/transformers/models/align/processing_align.py +++ b/src/transformers/models/align/processing_align.py @@ -54,25 +54,13 @@ class AlignProcessor(ProcessorMixin): model_id = 
"kakaobrain/align-base" processor = AlignProcessor.from_pretrained(model_id) - # Define the kwargs for each modality - common_kwargs = {"return_tensors": "pt"} - images_kwargs = {"crop_size": {"height": 224, "width": 224}} - text_kwargs = {"padding": "do_not_pad"} - - # Combine them into a single dictionary - - all_kwargs = { - "images_kwargs": images_kwargs, - "text_kwargs": text_kwargs, - "common_kwargs": common_kwargs - } - - processor(images=your_pil_image, text=["What is that?"], **all_kwargs) - - # passing directly any number of kwargs flattened is also supported - - all_kwargs = {"return_tensors": "pt", "crop_size": {"height": 214, "width": 214}, "padding": "max_length", "max_length": 76} - processor(images=your_pil_image, text=["What is that?"], **all_kwargs) + processor( + images=your_pil_image, + text=["What is that?"], + images_kwargs = {"crop_size": {"height": 224, "width": 224}}, + text_kwargs = {"padding": "do_not_pad"}, + common_kwargs = {"return_tensors": "pt"}, + ) ``` Args: From b2cd7c9e044a9b2558a56a159c6cb94da03c9aab Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Tue, 11 Jun 2024 13:53:46 +0200 Subject: [PATCH 30/45] improve documentation for default values --- src/transformers/processing_utils.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 113abd603601..d94ef86f2d6c 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -248,19 +248,25 @@ class ProcessingKwargs(TextKwargs, ImagesKwargs, VideosKwargs, AudioKwargs, Comm """ Base class for kwargs passing to processors. A model should have its own `ModelProcessorKwargs` class that inherits from `ProcessingKwargs` to provide: - 1) Additional, typed keys and that this model requires to process inputs. - 2) Default values for existing keys. + 1) Additional typed keys and that this model requires to process inputs. 
+ 2) Default values for existing keys under a `_defaults` attribute. New keys have to be defined as follows to ensure type hinting is done correctly. ```python - images_kwargs: ImagesKwargs = {"new_image_kwarg": Optional[bool]} - - _defaults = { - "text_kwargs": { - "padding": "max_length", - "max_length": 64, - }, - } + # adding a new image kwarg for this model + class ModelImagesKwargs(ImagesKwargs, total=False): + new_image_kwarg: Optional[bool] + + class ModelProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: ModelImagesKwargs + _defaults = { + "images_kwargs: { + "new_image_kwarg": False, + } + "text_kwargs": { + "padding": "max_length", + }, + } ``` """ From bcbd64646ecd8cc863a14cbcd16ac3bca47f3303 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Tue, 11 Jun 2024 15:07:53 +0200 Subject: [PATCH 31/45] add model_max_length testing This parameter depends on tokenizers received. --- tests/test_processing_common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index d046da8d1efe..a4c217df2c66 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -171,6 +171,7 @@ def test_structured_kwargs(self): "crop_size": {"height": 214, "width": 214}, "padding": "max_length", "max_length": 76, + "model_max_length": 76, } inputs = processor(text=input_str, images=image_input, **all_kwargs) From 39c1587e23f34b158e665f33c4f7a1eef3932a42 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Tue, 11 Jun 2024 15:08:13 +0200 Subject: [PATCH 32/45] Raise if kwargs are specified in two places --- src/transformers/processing_utils.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index d94ef86f2d6c..97060c89a3e7 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -709,7 +709,9 @@ class 
MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg for modality in default_kwargs: default_kwargs[modality] = ModelProcessorKwargs._defaults.get(modality, {}).copy() # update modality kwargs with passed kwargs - for modality in output_kwargs: + non_modality_kwargs = set(kwargs) - set(output_kwargs) + + for modality in set(output_kwargs): output_kwargs[modality] = { **default_kwargs[modality], } @@ -720,11 +722,17 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg # check if we received a structured kwarg dict or not to handle it correctly if modality in kwargs: kwarg_value = kwargs[modality].pop(modality_key, "__empty__") - else: + # check if this key was passed as a flat kwarg. + if kwarg_value != "__empty__" and modality_key in non_modality_kwargs: + raise ValueError( + f"Keyword argument {modality_key} was passed two times: in a dictionary for {modality} and as a **kwarg." + ) + elif modality_key in kwargs: kwarg_value = kwargs.pop(modality_key, "__empty__") + else: + kwarg_value = "__empty__" if kwarg_value != "__empty__": output_kwargs[modality][modality_key] = kwarg_value - # if something remains in kwargs, it belongs to common after flattening if set(kwargs) & set(default_kwargs): # here kwargs is dictionary-based since it shares keys with default set From 1f73bdf586dcc838498c9b790c0fbaab923386f5 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Tue, 11 Jun 2024 16:21:19 +0200 Subject: [PATCH 33/45] fix From e4d6d12703db9aadc04efe349bfe4729b55dcd1c Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Wed, 12 Jun 2024 18:24:08 +0200 Subject: [PATCH 34/45] expand VideoInput --- src/transformers/image_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index aa09e74558a3..4df2ae1f1633 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -81,8 +81,7 @@ ] # noqa -VideoInput = 
Union[np.ndarray, "torch.Tensor", List[np.ndarray], List["torch.Tensor"]] # noqa - +VideoInput = Union[List["PIL.Image.Image"], np.ndarray, "torch.Tensor", List[np.ndarray], List["torch.Tensor"]], List[List["PIL.Image.Image"]], List[List[np.ndarrray]], List[List["torch.Tensor"]] # noqa class ChannelDimension(ExplicitEnum): FIRST = "channels_first" From 1e09e4a971858a0eff272e9fbc4c6de68df7b86d Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Wed, 12 Jun 2024 18:24:43 +0200 Subject: [PATCH 35/45] fix --- src/transformers/image_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index 4df2ae1f1633..073af604afd7 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -81,7 +81,7 @@ ] # noqa -VideoInput = Union[List["PIL.Image.Image"], np.ndarray, "torch.Tensor", List[np.ndarray], List["torch.Tensor"]], List[List["PIL.Image.Image"]], List[List[np.ndarrray]], List[List["torch.Tensor"]] # noqa +VideoInput = Union[List["PIL.Image.Image"], np.ndarray, "torch.Tensor", List[np.ndarray], List["torch.Tensor"]], List[List["PIL.Image.Image"]], List[List[np.ndarrray]], List[List["torch.Tensor"]]] # noqa class ChannelDimension(ExplicitEnum): FIRST = "channels_first" From d4232f0be677314ebcead5e244911f8753067ec3 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Wed, 12 Jun 2024 18:27:29 +0200 Subject: [PATCH 36/45] fix style --- src/transformers/image_utils.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index 073af604afd7..45cc6e09c928 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -81,7 +81,17 @@ ] # noqa -VideoInput = Union[List["PIL.Image.Image"], np.ndarray, "torch.Tensor", List[np.ndarray], List["torch.Tensor"]], List[List["PIL.Image.Image"]], List[List[np.ndarrray]], List[List["torch.Tensor"]]] # noqa +VideoInput = 
Union[ + List["PIL.Image.Image"], + np.ndarray, + "torch.Tensor", + List[np.ndarray], + List["torch.Tensor"], + List[List["PIL.Image.Image"]], + List[List[np.ndarrray]], + List[List["torch.Tensor"]], +] # noqa + class ChannelDimension(ExplicitEnum): FIRST = "channels_first" From 162b1a702af1be0e21e4dbd9f82954432774e503 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Wed, 12 Jun 2024 18:27:43 +0200 Subject: [PATCH 37/45] remove defaults values --- src/transformers/processing_utils.py | 78 ++++++++++++++++++---------- 1 file changed, 50 insertions(+), 28 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 97060c89a3e7..cb60e77c87f2 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -74,17 +74,17 @@ class TextKwargs(TypedDict, total=False): docstrings associated. Attributes: - add_special_tokens (`bool`, *optional*, defaults to `True`): + add_special_tokens (`bool`, *optional*) Whether or not to add special tokens when encoding the sequences. - padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`): + padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*) Activates and controls padding. - truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`): + truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*): Activates and controls truncation. max_length (`int`, *optional*): Controls the maximum length to use by one of the truncation/padding parameters. - stride (`int`, *optional*, defaults to 0): + stride (`int`, *optional*): If set, the overflowing tokens will contain some tokens from the end of the truncated sequence. - is_split_into_words (`bool`, *optional*, defaults to `False`): + is_split_into_words (`bool`, *optional*): Whether or not the input is already pre-tokenized. 
pad_to_multiple_of (`int`, *optional*): If set, will pad the sequence to a multiple of the provided value. @@ -92,15 +92,15 @@ class TextKwargs(TypedDict, total=False): Whether to return token type IDs. return_attention_mask (`bool`, *optional*): Whether to return the attention mask. - return_overflowing_tokens (`bool`, *optional*, defaults to `False`): + return_overflowing_tokens (`bool`, *optional*): Whether or not to return overflowing token sequences. - return_special_tokens_mask (`bool`, *optional*, defaults to `False`): + return_special_tokens_mask (`bool`, *optional*): Whether or not to return special tokens mask information. - return_offsets_mapping (`bool`, *optional*, defaults to `False`): + return_offsets_mapping (`bool`, *optional*): Whether or not to return `(char_start, char_end)` for each token. - return_length (`bool`, *optional*, defaults to `False`): + return_length (`bool`, *optional*): Whether or not to return the lengths of the encoded inputs. - verbose (`bool`, *optional*, defaults to `True`): + verbose (`bool`, *optional*): Whether or not to print more information and warnings. padding_side (`str`, *optional*): The side on which padding will be applied. @@ -129,31 +129,31 @@ class ImagesKwargs(TypedDict, total=False): class methods and docstrings. Attributes: - do_resize (`bool`, *optional*, defaults to `True`): + do_resize (`bool`, *optional*): Whether to resize the image. - size (`Dict[str, int]`, *optional*, defaults to `{'shortest_edge': 288}`): + size (`Dict[str, int]`, *optional*): Resize the shorter side of the input to `size["shortest_edge"]`. - size_divisor (`int`, *optional*, defaults to 32): + size_divisor (`int`, *optional*): The size by which to make sure both the height and width can be divided. crop_size (`Dict[str, int]`, *optional*): Desired output size when applying center-cropping. 
- resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + resample (`PILImageResampling`, *optional*): Resampling filter to use if resizing the image. - do_rescale (`bool`, *optional*, defaults to `True`): + do_rescale (`bool`, *optional*): Whether to rescale the image by the specified scale `rescale_factor`. - rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + rescale_factor (`int` or `float`, *optional*): Scale factor to use if rescaling the image. - do_normalize (`bool`, *optional*, defaults to `True`): + do_normalize (`bool`, *optional*): Whether to normalize the image. - image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + image_mean (`float` or `List[float]`, *optional*): Mean to use if normalizing the image. - image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + image_std (`float` or `List[float]`, *optional*): Standard deviation to use if normalizing the image. - do_pad (`bool`, *optional*, defaults to `True`): + do_pad (`bool`, *optional*): Whether to pad the image to the `(max_height, max_width)` of the images in the batch. - do_center_crop (`bool`, *optional*, defaults to `True`): + do_center_crop (`bool`, *optional*): Whether to center crop the image. - data_format (`ChannelDimension` or `str`, *optional*, defaults to `"channels_first"`): + data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format for the output image. input_data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format for the input image. @@ -180,9 +180,32 @@ class VideosKwargs(TypedDict, total=False): Keyword arguments for video processing. Attributes: - do_resize (`bool`, *optional*, defaults to `True`): + do_resize (`bool`): Whether to resize the image. - # ... (Add docstrings for other videos_kwargs) + size (`Dict[str, int]`, *optional*): + Resize the shorter side of the input to `size["shortest_edge"]`. 
+ size_divisor (`int`, *optional*): + The size by which to make sure both the height and width can be divided. + resample (`PILImageResampling`, *optional*): + Resampling filter to use if resizing the image. + do_rescale (`bool`, *optional*): + Whether to rescale the image by the specified scale `rescale_factor`. + rescale_factor (`int` or `float`, *optional*): + Scale factor to use if rescaling the image. + do_normalize (`bool`, *optional*): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*): + Mean to use if normalizing the image. + image_std (`float` or `List[float]`, *optional*): + Standard deviation to use if normalizing the image. + do_pad (`bool`, *optional*): + Whether to pad the image to the `(max_height, max_width)` of the images in the batch. + do_center_crop (`bool`, *optional*): + Whether to center crop the image. + data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the output image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. """ do_resize: Optional[bool] @@ -219,15 +242,14 @@ class AudioKwargs(TypedDict, total=False): sequence if provided). - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum acceptable input length for the model if that argument is not provided. - - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different - lengths). + - `False` or `'do_not_pad'` max_length (`int`, *optional*): Maximum length of the returned list and optionally padding length (see above). - truncation (`bool`): + truncation (`bool`, *optional*): Activates truncation to cut input sequences longer than *max_length* to *max_length*. pad_to_multiple_of (`int`, *optional*): If set, will pad the sequence to a multiple of the provided value. 
- return_attention_mask (`bool`, *optional*, defaults to `False`): + return_attention_mask (`bool`, *optional*): Whether or not [`~ASTFeatureExtractor.__call__`] should return `attention_mask`. """ From 0da1dc315c52ad98eb2e68f13ba77d95f2a27e7c Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Wed, 12 Jun 2024 18:44:24 +0200 Subject: [PATCH 38/45] add comment to indicate documentation on adding kwargs --- src/transformers/models/align/processing_align.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py index 58eb63c45e69..5fdaf0514048 100644 --- a/src/transformers/models/align/processing_align.py +++ b/src/transformers/models/align/processing_align.py @@ -33,6 +33,7 @@ class AlignProcessorKwargs(ProcessingKwargs, total=False): + # see processing_utils.ProcessingKwargs documentation for usage. _defaults = { "text_kwargs": { "padding": "max_length", From f6f1dacd76f02cd9d272323bbd64fa9048526924 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Wed, 12 Jun 2024 18:47:56 +0200 Subject: [PATCH 39/45] protect imports --- src/transformers/image_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index 45cc6e09c928..13fdb1d6ebc2 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -83,12 +83,12 @@ VideoInput = Union[ List["PIL.Image.Image"], - np.ndarray, + "np.ndarray", "torch.Tensor", - List[np.ndarray], + List["np.ndarray"], List["torch.Tensor"], List[List["PIL.Image.Image"]], - List[List[np.ndarrray]], + List[List["np.ndarrray"]], List[List["torch.Tensor"]], ] # noqa From c4b7e840b86597e1e5a310851739b3b9a7d41148 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Wed, 12 Jun 2024 18:49:24 +0200 Subject: [PATCH 40/45] [run-slow]align From 3ce3608dced7a7151b85046f30377382778e8e40 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Wed, 12 
Jun 2024 19:06:02 +0200 Subject: [PATCH 41/45] fix From 6b83e39dc647a3471e8b86f93ba6edbeb3d08add Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Thu, 13 Jun 2024 12:44:12 +0200 Subject: [PATCH 42/45] remove set() that breaks ordering --- src/transformers/processing_utils.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index cb60e77c87f2..9c757ee21c81 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -665,6 +665,15 @@ def from_args_and_dict(cls, args, processor_dict: Dict[str, Any], **kwargs): else: return processor + def update_kwargs(self, kwarg_dict, modality, key, value): + """Helper function to update kwargs and handle conflicts.""" + if key in kwarg_dict[modality]: + raise ValueError( + f"Keyword argument {key} was passed two times: in a dictionary for {modality} and as a **kwarg." + ) + kwarg_dict[modality][key] = value + return kwarg_dict + def _merge_kwargs( self, ModelProcessorKwargs: ProcessingKwargs, @@ -730,17 +739,17 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg # get defaults from set model processor kwargs if they exist for modality in default_kwargs: default_kwargs[modality] = ModelProcessorKwargs._defaults.get(modality, {}).copy() - # update modality kwargs with passed kwargs - non_modality_kwargs = set(kwargs) - set(output_kwargs) - - for modality in set(output_kwargs): - output_kwargs[modality] = { - **default_kwargs[modality], - } + # update defaults with arguments from tokenizer init for modality_key in ModelProcessorKwargs.__annotations__[modality].__annotations__.keys(): # init with tokenizer init kwargs if necessary if modality_key in tokenizer_init_kwargs: - output_kwargs[modality][modality_key] = tokenizer_init_kwargs[modality_key] + default_kwargs[modality][modality_key] = tokenizer_init_kwargs[modality_key] + # now defaults 
kwargs are updated with the tokenizers defaults. + output_kwargs.update(default_kwargs) + # update modality kwargs with passed kwargs + non_modality_kwargs = set(kwargs) - set(output_kwargs) + for modality in output_kwargs: + for modality_key in ModelProcessorKwargs.__annotations__[modality].__annotations__.keys(): # check if we received a structured kwarg dict or not to handle it correctly if modality in kwargs: kwarg_value = kwargs[modality].pop(modality_key, "__empty__") From 3818b86af9e07b129f090b67e67b13fe709a324c Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Thu, 13 Jun 2024 12:44:24 +0200 Subject: [PATCH 43/45] test more --- tests/test_processing_common.py | 105 ++++++++++++++++++++++++++++---- 1 file changed, 92 insertions(+), 13 deletions(-) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index a4c217df2c66..074aa2f1d625 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -44,6 +44,8 @@ from transformers import CLIPImageProcessor +@require_torch +@require_vision @require_torch class ProcessorTesterMixin: processor_class = None @@ -120,7 +122,7 @@ def skip_processor_without_typed_kwargs(self, processor): @require_vision @require_torch - def test_defaults_preserved_kwargs(self): + def test_tokenizer_defaults_preserved_by_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") @@ -136,7 +138,7 @@ def test_defaults_preserved_kwargs(self): @require_torch @require_vision - def test_defaults_preserved_image_kwargs(self): + def test_image_processor_defaults_preserved_by_image_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor", crop_size=(234, 234)) @@ -151,9 +153,42 @@ def 
test_defaults_preserved_image_kwargs(self): inputs = processor(text=input_str, images=image_input) self.assertEqual(len(inputs["pixel_values"][0][0]), 234) + @require_vision + @require_torch + def test_kwargs_overrides_default_tokenizer_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer", max_length=117) + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input, return_tensors="pt", max_length=112) + self.assertEqual(len(inputs["input_ids"][0]), 112) + @require_torch @require_vision - def test_structured_kwargs(self): + def test_kwargs_overrides_default_image_processor_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor", crop_size=(234, 234)) + tokenizer = self.get_component("tokenizer", max_length=117) + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input, crop_size=[224, 224]) + self.assertEqual(len(inputs["pixel_values"][0][0]), 224) + + @require_torch + @require_vision + def test_unstructured_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") @@ -164,20 +199,64 @@ def test_structured_kwargs(self): 
input_str = "lower newer" image_input = self.prepare_image_inputs() + inputs = processor( + text=input_str, + images=image_input, + return_tensors="pt", + crop_size={"height": 214, "width": 214}, + padding="max_length", + max_length=76, + ) - # Define the kwargs for each modality - all_kwargs = { - "return_tensors": "pt", - "crop_size": {"height": 214, "width": 214}, - "padding": "max_length", - "max_length": 76, - "model_max_length": 76, - } + self.assertEqual(inputs["pixel_values"].shape[2], 214) + self.assertEqual(len(inputs["input_ids"][0]), 76) + + @require_torch + @require_vision + def test_unstructured_kwargs_batched(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = ["lower newer", "upper older longer string"] + image_input = self.prepare_image_inputs() * 2 + inputs = processor( + text=input_str, + images=image_input, + return_tensors="pt", + crop_size={"height": 214, "width": 214}, + padding="longest", + max_length=76, + ) - inputs = processor(text=input_str, images=image_input, **all_kwargs) self.assertEqual(inputs["pixel_values"].shape[2], 214) - self.assertEqual(len(inputs["input_ids"][0]), 76) + self.assertEqual(len(inputs["input_ids"][0]), 6) + + @require_torch + @require_vision + def test_doubly_passed_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + 
self.skip_processor_without_typed_kwargs(processor) + + input_str = ["lower newer"] + image_input = self.prepare_image_inputs() + with self.assertRaises(ValueError): + _ = processor( + text=input_str, + images=image_input, + images_kwargs={"crop_size": {"height": 222, "width": 222}}, + crop_size={"height": 214, "width": 214}, + ) @require_torch @require_vision From 31b7a602274901a6c007c822de26c70c91e33e05 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Thu, 13 Jun 2024 12:46:22 +0200 Subject: [PATCH 44/45] removed unused func --- src/transformers/processing_utils.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 9c757ee21c81..03db53c91091 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -665,15 +665,6 @@ def from_args_and_dict(cls, args, processor_dict: Dict[str, Any], **kwargs): else: return processor - def update_kwargs(self, kwarg_dict, modality, key, value): - """Helper function to update kwargs and handle conflicts.""" - if key in kwarg_dict[modality]: - raise ValueError( - f"Keyword argument {key} was passed two times: in a dictionary for {modality} and as a **kwarg." - ) - kwarg_dict[modality][key] = value - return kwarg_dict - def _merge_kwargs( self, ModelProcessorKwargs: ProcessingKwargs, @@ -745,7 +736,9 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg if modality_key in tokenizer_init_kwargs: default_kwargs[modality][modality_key] = tokenizer_init_kwargs[modality_key] # now defaults kwargs are updated with the tokenizers defaults. 
+ # pass defaults to output dictionary output_kwargs.update(default_kwargs) + # update modality kwargs with passed kwargs non_modality_kwargs = set(kwargs) - set(output_kwargs) for modality in output_kwargs: From 4072336a03e4e98d2c8881d685943fc51e2b9b88 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Thu, 13 Jun 2024 15:41:38 +0200 Subject: [PATCH 45/45] [run-slow]align