diff --git a/src/transformers/image_processing_base.py b/src/transformers/image_processing_base.py
index 7960d1113d55..60774390bf23 100644
--- a/src/transformers/image_processing_base.py
+++ b/src/transformers/image_processing_base.py
@@ -362,25 +362,13 @@ def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs):
         """
         image_processor_dict = image_processor_dict.copy()
         return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
-
-        # The `size` parameter is a dict and was previously an int or tuple in feature extractors.
-        # We set `size` here directly to the `image_processor_dict` so that it is converted to the appropriate
-        # dict within the image processor and isn't overwritten if `size` is passed in as a kwarg.
-        if "size" in kwargs and "size" in image_processor_dict:
-            image_processor_dict["size"] = kwargs.pop("size")
-        if "crop_size" in kwargs and "crop_size" in image_processor_dict:
-            image_processor_dict["crop_size"] = kwargs.pop("crop_size")
-
+        image_processor_dict.update({k: v for k, v in kwargs.items() if k in cls.valid_kwargs.__annotations__})
         image_processor = cls(**image_processor_dict)

-        # Update image_processor with kwargs if needed
-        to_remove = []
-        for key, value in kwargs.items():
+        # Remove kwargs that were used to initialize the image processor attributes
+        for key in list(kwargs):
             if hasattr(image_processor, key):
-                setattr(image_processor, key, value)
-                to_remove.append(key)
-        for key in to_remove:
-            kwargs.pop(key, None)
+                kwargs.pop(key)

         logger.info(f"Image processor {image_processor}")
         if return_unused_kwargs:
diff --git a/src/transformers/image_processing_utils_fast.py b/src/transformers/image_processing_utils_fast.py
index a145754d3209..45741efd9517 100644
--- a/src/transformers/image_processing_utils_fast.py
+++ b/src/transformers/image_processing_utils_fast.py
@@ -185,6 +185,7 @@ class BaseImageProcessorFast(BaseImageProcessor):
     input_data_format = None
     device = None
     model_input_names = ["pixel_values"]
+    image_seq_length = None
     valid_kwargs = ImagesKwargs
     unused_kwargs = None

diff --git a/src/transformers/models/pix2struct/image_processing_pix2struct.py b/src/transformers/models/pix2struct/image_processing_pix2struct.py
index 3ec36ebda440..ec5645ee4bb9 100644
--- a/src/transformers/models/pix2struct/image_processing_pix2struct.py
+++ b/src/transformers/models/pix2struct/image_processing_pix2struct.py
@@ -53,11 +53,18 @@ class Pix2StructImageProcessorKwargs(ImagesKwargs, total=False):
     """
     max_patches (`int`, *optional*):
         Maximum number of patches to extract.
+    patch_size (`dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`):
+        The patch size to use for the image. According to the Pix2Struct paper and code, the patch size is 16x16.
+    is_vqa (`bool`, *optional*, defaults to `False`):
+        Whether or not the image processor is for the VQA task. If `True` and `header_text` is passed in, text is
+        rendered onto the input images.
     header_text (`Union[list[str], str]`, *optional*):
         Text to render as a header. Only has an effect if `image_processor.is_vqa` is `True`.
     """

     max_patches: int
+    patch_size: dict[str, int]
+    is_vqa: bool
     header_text: Optional[Union[list[str], str]]

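For reference, here is a minimal, self-contained sketch of the `from_dict` flow introduced above, using toy stand-ins rather than the real `transformers` classes (all names below are illustrative): kwargs declared in `valid_kwargs.__annotations__` override the serialized dict before instantiation, and anything absorbed as an instance attribute is dropped from the unused kwargs.

```python
from typing import Optional, TypedDict


class ToyKwargs(TypedDict, total=False):
    """Stand-in for `ImagesKwargs`; only these keys may override the config."""

    size: Optional[dict]
    crop_size: Optional[dict]


class ToyImageProcessor:
    """Stand-in for an `ImageProcessingMixin` subclass (illustrative only)."""

    valid_kwargs = ToyKwargs

    def __init__(self, **kwargs):
        for key, value in kwargs.items():
            setattr(self, key, value)

    @classmethod
    def from_dict(cls, image_processor_dict: dict, **kwargs):
        image_processor_dict = image_processor_dict.copy()
        # Declared kwargs override the serialized config before instantiation.
        image_processor_dict.update(
            {k: v for k, v in kwargs.items() if k in cls.valid_kwargs.__annotations__}
        )
        image_processor = cls(**image_processor_dict)
        # Kwargs that ended up as attributes are consumed; the rest stay "unused".
        for key in list(kwargs):
            if hasattr(image_processor, key):
                kwargs.pop(key)
        return image_processor, kwargs


processor, unused = ToyImageProcessor.from_dict(
    {"size": {"height": 4, "width": 4}},
    size={"height": 8, "width": 8},  # overrides the dict, subsuming the old `size`/`crop_size` special case
    foo=1,  # neither a declared kwarg nor an attribute -> reported back as unused
)
assert processor.size == {"height": 8, "width": 8}
assert unused == {"foo": 1}
```

One merge step plus one cleanup loop now covers every declared kwarg, which is why the hand-written `size`/`crop_size` carve-outs can be deleted.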
diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py
index a70018c6cf2e..ecba3cac53fe 100644
--- a/src/transformers/processing_utils.py
+++ b/src/transformers/processing_utils.py
@@ -219,6 +219,9 @@ class methods and docstrings.
             - `'np'`: Return NumPy `np.ndarray` objects.
         disable_grouping (`bool`, *optional*):
             Whether to group images by shapes when processing or not, only relevant for fast image processing.
+        image_seq_length (`int`, *optional*):
+            The number of image tokens to be used for each image in the input.
+            Added for backward compatibility, but this should be set as a processor attribute in future models.
     """

     do_convert_rgb: Optional[bool]
@@ -239,6 +242,7 @@ class methods and docstrings.
     device: Annotated[Optional[str], device_validator()]
     return_tensors: Annotated[Optional[Union[str, TensorType]], tensor_type_validator()]
     disable_grouping: Optional[bool]
+    image_seq_length: Optional[int]


 class VideosKwargs(TypedDict, total=False):
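Finally, a hedged sketch of what the `image_seq_length` plumbing is meant to enable (assumes a working torch/torchvision install; the behavior is inferred from the hunks above, not taken from a repository test): because the key is now declared in `ImagesKwargs` and defaulted on `BaseImageProcessorFast`, `from_dict` merges it into the init dict, and the class-level default makes `hasattr` succeed, so the kwarg is treated as consumed rather than unused.

```python
from transformers.image_processing_utils_fast import BaseImageProcessorFast

image_processor, unused = BaseImageProcessorFast.from_dict(
    {"do_resize": False},
    image_seq_length=576,  # merged because it appears in `valid_kwargs.__annotations__`
    return_unused_kwargs=True,
)
print(image_processor.image_seq_length)  # expected: 576
print(unused)  # expected: {} -- consumed instead of leaking through as unused
```

Processors that previously stored `image_seq_length` as an ad-hoc attribute should keep working, while future models are expected to declare it as a processor attribute instead, as the docstring addition notes.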