From 09ceb8eabc51934314112bc199a8c32dc1b8097a Mon Sep 17 00:00:00 2001 From: Pavel Iakubovskii Date: Fri, 10 May 2024 13:04:07 +0000 Subject: [PATCH 1/6] Add resize and pad strategy --- src/transformers/image_processing_utils.py | 8 +- .../models/detr/image_processing_detr.py | 105 ++++++++++++++++-- .../models/detr/test_image_processing_detr.py | 45 ++++++++ 3 files changed, 148 insertions(+), 10 deletions(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index e040cdd31aa0..efd1e04a6210 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -662,7 +662,13 @@ def center_crop( ) -VALID_SIZE_DICT_KEYS = ({"height", "width"}, {"shortest_edge"}, {"shortest_edge", "longest_edge"}, {"longest_edge"}) +VALID_SIZE_DICT_KEYS = ( + {"height", "width"}, + {"shortest_edge"}, + {"shortest_edge", "longest_edge"}, + {"longest_edge"}, + {"max_height", "max_width"}, +) def is_valid_size_dict(size_dict): diff --git a/src/transformers/models/detr/image_processing_detr.py b/src/transformers/models/detr/image_processing_detr.py index e0e59cbc7c40..50bb1975a536 100644 --- a/src/transformers/models/detr/image_processing_detr.py +++ b/src/transformers/models/detr/image_processing_detr.py @@ -116,6 +116,64 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, in return (oh, ow) +def _get_image_size_for_max_height_width( + image_size: Tuple[int, int], max_height: int, max_width: int +) -> Tuple[int, int]: + """ + Computes the output image size given the input image and the maximum allowed height and width. Keep aspect ratio. + Important, even if image_height < max_height and image_width < max_width, the image will be resized + to at least one of the edges be equal to max_height or max_width. + + For example: + - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50) + - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400) + + Args: + image_size (`Tuple[int, int]`): + The input image size. + max_height (`int`): + The maximum allowed height. + max_width (`int`): + The maximum allowed width. + """ + height, width = image_size + height_ratio = max_height / height + width_ratio = max_width / width + ratio = min(height_ratio, width_ratio) + new_height = int(height * ratio) + new_width = int(width * ratio) + return new_height, new_width + + +def get_image_size_for_max_height_width( + input_image: np.ndarray, + max_height: int, + max_width: int, + input_data_format: Optional[Union[str, ChannelDimension]] = None, +) -> Tuple[int, int]: + """ + Computes the output image size given the input image and the maximum allowed height and width. Keep aspect ratio. + Important, even if image_height < max_height and image_width < max_width, the image will be resized + to at least one of the edges be equal to max_height or max_width. + + For example: + - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50) + - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400) + + Args: + input_image (`np.ndarray`): + The image to resize. + max_height (`int`): + The maximum allowed height. + max_width (`int`): + The maximum allowed width. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred from the input image. 
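+        Example (an illustrative doctest; the output follows the min-scale rule described above):
+
+        ```python
+        >>> import numpy as np
+        >>> image = np.zeros((100, 200, 3), dtype=np.uint8)  # (height, width, channels)
+        >>> get_image_size_for_max_height_width(image, max_height=50, max_width=50)
+        (25, 50)
+        ```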
+ """ + image_size = get_image_size(input_image, input_data_format) + return _get_image_size_for_max_height_width(image_size, max_height, max_width) + + def get_resize_output_image_size( input_image: np.ndarray, size: Union[int, Tuple[int, int], List[int]], @@ -753,7 +811,15 @@ class DetrImageProcessor(BaseImageProcessor): overridden by the `do_resize` parameter in the `preprocess` method. size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`): Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter - in the `preprocess` method. + in the `preprocess` method. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): Resampling filter to use if resizing the image. do_rescale (`bool`, *optional*, defaults to `True`): @@ -777,8 +843,10 @@ class DetrImageProcessor(BaseImageProcessor): Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. do_pad (`bool`, *optional*, defaults to `True`): Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess` - method. If `True` will pad the images in the batch to the largest height and width in the batch. - Padding will be applied to the bottom and right of the image with zeros. + method. If `True`, padding will be applied to the bottom and right of the image with zeros. + If `max_height` and `max_width` are provided in the `size` parameter, the image will be padded to the + `max_height` and `max_width` dimensions. Otherwise, the image will be padded to the maximum height and width + of the batch. """ model_input_names = ["pixel_values", "pixel_mask"] @@ -954,18 +1022,27 @@ def resize( max_size = None size = get_size_dict(size, max_size=max_size, default_to_square=False) if "shortest_edge" in size and "longest_edge" in size: - size = get_resize_output_image_size( + new_size = get_resize_output_image_size( image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format ) + elif "max_height" in size and "max_width" in size: + new_size = get_image_size_for_max_height_width( + image, size["max_height"], size["max_width"], input_data_format=input_data_format + ) elif "height" in size and "width" in size: - size = (size["height"], size["width"]) + new_size = (size["height"], size["width"]) else: raise ValueError( "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got" f" {size.keys()}." 
) image = resize( - image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs + image, + size=new_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, ) return image @@ -1104,6 +1181,7 @@ def pad( data_format: Optional[ChannelDimension] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, update_bboxes: bool = True, + padded_size: Optional[Tuple[int, int]] = None, ) -> BatchFeature: """ Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width @@ -1133,8 +1211,12 @@ def pad( Whether to update the bounding boxes in the annotations to match the padded images. If the bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)` format, the bounding boxes will not be updated. + padded_size (`List[int, int]`, *optional*): + The size to pad the images to. If not provided, the images will be padded to the largest height and + width in the batch. """ - pad_size = get_max_height_width(images, input_data_format=input_data_format) + if padded_size is None: + padded_size = get_max_height_width(images, input_data_format=input_data_format) annotation_list = annotations if annotations is not None else [None] * len(images) padded_images = [] @@ -1142,7 +1224,7 @@ def pad( for image, annotation in zip(images, annotation_list): padded_image, padded_annotation = self._pad_image( image, - pad_size, + padded_size, annotation, constant_values=constant_values, data_format=data_format, @@ -1156,7 +1238,7 @@ def pad( if return_pixel_mask: masks = [ - make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format) + make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format) for image in images ] data["pixel_mask"] = masks @@ -1397,6 +1479,10 @@ def preprocess( if do_pad: # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...} + if "max_height" in size and "max_width" in size: + padded_size = (size["max_height"], size["max_width"]) + else: + padded_size = None encoded_inputs = self.pad( images, annotations=annotations, @@ -1405,6 +1491,7 @@ def preprocess( input_data_format=input_data_format, update_bboxes=do_convert_annotations, return_tensors=return_tensors, + padded_size=padded_size, ) else: images = [ diff --git a/tests/models/detr/test_image_processing_detr.py b/tests/models/detr/test_image_processing_detr.py index c79c1d7b0196..8e5ec579a9f6 100644 --- a/tests/models/detr/test_image_processing_detr.py +++ b/tests/models/detr/test_image_processing_detr.py @@ -547,3 +547,48 @@ def test_batched_coco_panoptic_annotations(self): ).T self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1)) self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) + + def test_max_width_max_height_resizing_and_pad_strategy(self): + image_1 = torch.ones([200, 100, 3], dtype=torch.uint8) + + # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50 + image_processor = DetrImageProcessor( + size={"max_height": 100, "max_width": 100}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50])) + + # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100 + image_processor = DetrImageProcessor( + size={"max_height": 300, "max_width": 100}, + do_pad=False, 
+ ) + inputs = image_processor(images=[image_1], return_tensors="pt") + + # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 + image_processor = DetrImageProcessor( + size={"max_height": 100, "max_width": 100}, + do_pad=True, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100])) + + # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100 + image_processor = DetrImageProcessor( + size={"max_height": 300, "max_width": 100}, + do_pad=True, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 300, 100])) + + ### Check for batch + image_2 = torch.ones([100, 150, 3], dtype=torch.uint8) + + # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100 + image_processor = DetrImageProcessor( + size={"max_height": 150, "max_width": 100}, + do_pad=True, + ) + inputs = image_processor(images=[image_1, image_2], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100])) From fbbd97fe99926110276ac16eae77912f6c179a65 Mon Sep 17 00:00:00 2001 From: Pavel Iakubovskii Date: Wed, 15 May 2024 11:36:42 +0000 Subject: [PATCH 2/6] Merge get_size functions --- .../models/detr/image_processing_detr.py | 37 ++++--------------- 1 file changed, 7 insertions(+), 30 deletions(-) diff --git a/src/transformers/models/detr/image_processing_detr.py b/src/transformers/models/detr/image_processing_detr.py index 50bb1975a536..c6d85ebe1ba6 100644 --- a/src/transformers/models/detr/image_processing_detr.py +++ b/src/transformers/models/detr/image_processing_detr.py @@ -116,35 +116,6 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, in return (oh, ow) -def _get_image_size_for_max_height_width( - image_size: Tuple[int, int], max_height: int, max_width: int -) -> Tuple[int, int]: - """ - Computes the output image size given the input image and the maximum allowed height and width. Keep aspect ratio. - Important, even if image_height < max_height and image_width < max_width, the image will be resized - to at least one of the edges be equal to max_height or max_width. - - For example: - - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50) - - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400) - - Args: - image_size (`Tuple[int, int]`): - The input image size. - max_height (`int`): - The maximum allowed height. - max_width (`int`): - The maximum allowed width. - """ - height, width = image_size - height_ratio = max_height / height - width_ratio = max_width / width - ratio = min(height_ratio, width_ratio) - new_height = int(height * ratio) - new_width = int(width * ratio) - return new_height, new_width - - def get_image_size_for_max_height_width( input_image: np.ndarray, max_height: int, @@ -171,7 +142,13 @@ def get_image_size_for_max_height_width( The channel dimension format of the input image. If not provided, it will be inferred from the input image. 
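+        Example (illustrative; `get_image_size` infers the channel format, so channels-first input works the same way):
+
+        ```python
+        >>> import numpy as np
+        >>> image = np.zeros((3, 100, 200), dtype=np.uint8)  # (channels, height, width)
+        >>> get_image_size_for_max_height_width(image, max_height=200, max_width=500)
+        (200, 400)
+        ```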
""" image_size = get_image_size(input_image, input_data_format) - return _get_image_size_for_max_height_width(image_size, max_height, max_width) + height, width = image_size + height_scale = max_height / height + width_scale = max_width / width + min_scale = min(height_scale, width_scale) + new_height = int(height * min_scale) + new_width = int(width * min_scale) + return new_height, new_width def get_resize_output_image_size( From 95bcfd66defbbe5d6e1e70b65963363205a24e93 Mon Sep 17 00:00:00 2001 From: Pavel Iakubovskii Date: Wed, 15 May 2024 15:24:16 +0000 Subject: [PATCH 3/6] Add pad_size + tests to object detection models --- .../image_processing_conditional_detr.py | 79 +++++++++++++++-- .../image_processing_deformable_detr.py | 88 ++++++++++++++++--- .../models/deta/image_processing_deta.py | 82 ++++++++++++++--- .../models/detr/image_processing_detr.py | 40 +++++---- .../image_processing_grounding_dino.py | 88 ++++++++++++++++--- .../models/yolos/image_processing_yolos.py | 87 +++++++++++++++--- .../test_image_processing_conditional_detr.py | 49 +++++++++++ .../test_image_processing_deformable_detr.py | 49 +++++++++++ .../models/deta/test_image_processing_deta.py | 49 +++++++++++ .../models/detr/test_image_processing_detr.py | 7 +- .../test_image_processing_grounding_dino.py | 49 +++++++++++ .../yolos/test_image_processing_yolos.py | 49 +++++++++++ 12 files changed, 651 insertions(+), 65 deletions(-) diff --git a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py index e88bfc8fe230..1cd153cc58d0 100644 --- a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py +++ b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py @@ -147,6 +147,42 @@ def get_resize_output_image_size( return get_size_with_aspect_ratio(image_size, size, max_size) +# Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width +def get_image_size_for_max_height_width( + input_image: np.ndarray, + max_height: int, + max_width: int, + input_data_format: Optional[Union[str, ChannelDimension]] = None, +) -> Tuple[int, int]: + """ + Computes the output image size given the input image and the maximum allowed height and width. Keep aspect ratio. + Important, even if image_height < max_height and image_width < max_width, the image will be resized + to at least one of the edges be equal to max_height or max_width. + + For example: + - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50) + - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400) + + Args: + input_image (`np.ndarray`): + The image to resize. + max_height (`int`): + The maximum allowed height. + max_width (`int`): + The maximum allowed width. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred from the input image. 
+ """ + image_size = get_image_size(input_image, input_data_format) + height, width = image_size + height_scale = max_height / height + width_scale = max_width / width + min_scale = min(height_scale, width_scale) + new_height = int(height * min_scale) + new_width = int(width * min_scale) + return new_height, new_width + + # Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn def get_numpy_to_framework_fn(arr) -> Callable: """ @@ -813,6 +849,7 @@ def __init__( image_std: Union[float, List[float]] = None, do_convert_annotations: Optional[bool] = None, do_pad: bool = True, + pad_size: Optional[Dict[str, int]] = None, **kwargs, ) -> None: if "pad_and_return_pixel_mask" in kwargs: @@ -846,6 +883,7 @@ def __init__( self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD self.do_pad = do_pad + self.pad_size = pad_size self._valid_processor_keys = [ "images", "annotations", @@ -861,6 +899,7 @@ def __init__( "image_mean", "image_std", "do_pad", + "pad_size", "format", "return_tensors", "data_format", @@ -978,18 +1017,27 @@ def resize( max_size = None size = get_size_dict(size, max_size=max_size, default_to_square=False) if "shortest_edge" in size and "longest_edge" in size: - size = get_resize_output_image_size( + new_size = get_resize_output_image_size( image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format ) + elif "max_height" in size and "max_width" in size: + new_size = get_image_size_for_max_height_width( + image, size["max_height"], size["max_width"], input_data_format=input_data_format + ) elif "height" in size and "width" in size: - size = (size["height"], size["width"]) + new_size = (size["height"], size["width"]) else: raise ValueError( "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got" f" {size.keys()}." ) image = resize( - image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs + image, + size=new_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, ) return image @@ -1133,6 +1181,7 @@ def pad( data_format: Optional[ChannelDimension] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, update_bboxes: bool = True, + pad_size: Optional[Dict[str, int]] = None, ) -> BatchFeature: """ Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width @@ -1162,8 +1211,15 @@ def pad( Whether to update the bounding boxes in the annotations to match the padded images. If the bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)` format, the bounding boxes will not be updated. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any + image in `images`. If not provided, the images will be padded to the largest height and width in the batch. 
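+        Example (a minimal sketch; shapes assume channels-first NumPy inputs and the default `return_pixel_mask=True`):
+
+        ```python
+        >>> import numpy as np
+        >>> from transformers import ConditionalDetrImageProcessor
+        >>> processor = ConditionalDetrImageProcessor()
+        >>> images = [np.zeros((3, 100, 50), dtype=np.uint8), np.zeros((3, 80, 120), dtype=np.uint8)]
+        >>> out = processor.pad(images, pad_size={"height": 128, "width": 128}, return_tensors="np")
+        >>> out["pixel_values"].shape
+        (2, 3, 128, 128)
+        ```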
""" - pad_size = get_max_height_width(images, input_data_format=input_data_format) + pad_size = pad_size if pad_size is not None else self.pad_size + if pad_size is not None: + padded_size = (pad_size["height"], pad_size["width"]) + else: + padded_size = get_max_height_width(images, input_data_format=input_data_format) annotation_list = annotations if annotations is not None else [None] * len(images) padded_images = [] @@ -1171,7 +1227,7 @@ def pad( for image, annotation in zip(images, annotation_list): padded_image, padded_annotation = self._pad_image( image, - pad_size, + padded_size, annotation, constant_values=constant_values, data_format=data_format, @@ -1185,7 +1241,7 @@ def pad( if return_pixel_mask: masks = [ - make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format) + make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format) for image in images ] data["pixel_mask"] = masks @@ -1220,6 +1276,7 @@ def preprocess( return_tensors: Optional[Union[TensorType, str]] = None, data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, + pad_size: Optional[Dict[str, int]] = None, **kwargs, ) -> BatchFeature: """ @@ -1265,8 +1322,9 @@ def preprocess( image_std (`float` or `List[float]`, *optional*, defaults to self.image_std): Standard deviation to use when normalizing the image. do_pad (`bool`, *optional*, defaults to self.do_pad): - Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch - and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros. + Whether to pad the image. If `True`, padding will be applied to the bottom and right of + the image with zeros. If `pad_size` is provided, the image will be padded to the specified + dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. format (`str` or `AnnotationFormat`, *optional*, defaults to self.format): Format of the annotations. return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors): @@ -1282,6 +1340,9 @@ def preprocess( - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any + image in `images`. If not provided, the images will be padded to the largest height and width in the batch. 
""" if "pad_and_return_pixel_mask" in kwargs: logger.warning_once( @@ -1311,6 +1372,7 @@ def preprocess( self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations ) do_pad = self.do_pad if do_pad is None else do_pad + pad_size = self.pad_size if pad_size is None else pad_size format = self.format if format is None else format images = make_list_of_images(images) @@ -1435,6 +1497,7 @@ def preprocess( input_data_format=input_data_format, update_bboxes=do_convert_annotations, return_tensors=return_tensors, + pad_size=pad_size, ) else: images = [ diff --git a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py index 5525eeeb8c58..875814596d6d 100644 --- a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py +++ b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py @@ -145,6 +145,43 @@ def get_resize_output_image_size( return get_size_with_aspect_ratio(image_size, size, max_size) + +# Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width +def get_image_size_for_max_height_width( + input_image: np.ndarray, + max_height: int, + max_width: int, + input_data_format: Optional[Union[str, ChannelDimension]] = None, +) -> Tuple[int, int]: + """ + Computes the output image size given the input image and the maximum allowed height and width. Keep aspect ratio. + Important, even if image_height < max_height and image_width < max_width, the image will be resized + to at least one of the edges be equal to max_height or max_width. + + For example: + - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50) + - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400) + + Args: + input_image (`np.ndarray`): + The image to resize. + max_height (`int`): + The maximum allowed height. + max_width (`int`): + The maximum allowed width. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred from the input image. + """ + image_size = get_image_size(input_image, input_data_format) + height, width = image_size + height_scale = max_height / height + width_scale = max_width / width + min_scale = min(height_scale, width_scale) + new_height = int(height * min_scale) + new_width = int(width * min_scale) + return new_height, new_width + + # Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn def get_numpy_to_framework_fn(arr) -> Callable: """ @@ -791,8 +828,12 @@ class DeformableDetrImageProcessor(BaseImageProcessor): Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. do_pad (`bool`, *optional*, defaults to `True`): Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess` - method. If `True` will pad the images in the batch to the largest height and width in the batch. - Padding will be applied to the bottom and right of the image with zeros. + method. If `True`, padding will be applied to the bottom and right of the image with zeros. + If `pad_size` is provided, the image will be padded to the specified dimensions. + Otherwise, the image will be padded to the maximum height and width of the batch. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any + image in `images`. 
If not provided, the images will be padded to the largest height and width in the batch. """ model_input_names = ["pixel_values", "pixel_mask"] @@ -811,6 +852,7 @@ def __init__( image_std: Union[float, List[float]] = None, do_convert_annotations: Optional[bool] = None, do_pad: bool = True, + pad_size: Optional[Dict[str, int]] = None, **kwargs, ) -> None: if "pad_and_return_pixel_mask" in kwargs: @@ -844,6 +886,7 @@ def __init__( self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD self.do_pad = do_pad + self.pad_size = pad_size self._valid_processor_keys = [ "images", "annotations", @@ -859,6 +902,7 @@ def __init__( "image_mean", "image_std", "do_pad", + "pad_size", "format", "return_tensors", "data_format", @@ -976,18 +1020,27 @@ def resize( max_size = None size = get_size_dict(size, max_size=max_size, default_to_square=False) if "shortest_edge" in size and "longest_edge" in size: - size = get_resize_output_image_size( + new_size = get_resize_output_image_size( image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format ) + elif "max_height" in size and "max_width" in size: + new_size = get_image_size_for_max_height_width( + image, size["max_height"], size["max_width"], input_data_format=input_data_format + ) elif "height" in size and "width" in size: - size = (size["height"], size["width"]) + new_size = (size["height"], size["width"]) else: raise ValueError( "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got" f" {size.keys()}." ) image = resize( - image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs + image, + size=new_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, ) return image @@ -1131,6 +1184,7 @@ def pad( data_format: Optional[ChannelDimension] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, update_bboxes: bool = True, + pad_size: Optional[Dict[str, int]] = None, ) -> BatchFeature: """ Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width @@ -1160,8 +1214,15 @@ def pad( Whether to update the bounding boxes in the annotations to match the padded images. If the bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)` format, the bounding boxes will not be updated. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any + image in `images`. If not provided, the images will be padded to the largest height and width in the batch. 
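+        Example (sketch; the returned `pixel_mask` marks real pixels with 1 and padded pixels with 0):
+
+        ```python
+        >>> import numpy as np
+        >>> from transformers import DeformableDetrImageProcessor
+        >>> processor = DeformableDetrImageProcessor()
+        >>> out = processor.pad(
+        ...     [np.zeros((3, 60, 80), dtype=np.uint8)], pad_size={"height": 100, "width": 100}, return_tensors="np"
+        ... )
+        >>> out["pixel_values"].shape, out["pixel_mask"].shape
+        ((1, 3, 100, 100), (1, 100, 100))
+        ```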
""" - pad_size = get_max_height_width(images, input_data_format=input_data_format) + pad_size = pad_size if pad_size is not None else self.pad_size + if pad_size is not None: + padded_size = (pad_size["height"], pad_size["width"]) + else: + padded_size = get_max_height_width(images, input_data_format=input_data_format) annotation_list = annotations if annotations is not None else [None] * len(images) padded_images = [] @@ -1169,7 +1230,7 @@ def pad( for image, annotation in zip(images, annotation_list): padded_image, padded_annotation = self._pad_image( image, - pad_size, + padded_size, annotation, constant_values=constant_values, data_format=data_format, @@ -1183,7 +1244,7 @@ def pad( if return_pixel_mask: masks = [ - make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format) + make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format) for image in images ] data["pixel_mask"] = masks @@ -1218,6 +1279,7 @@ def preprocess( return_tensors: Optional[Union[TensorType, str]] = None, data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, + pad_size: Optional[Dict[str, int]] = None, **kwargs, ) -> BatchFeature: """ @@ -1263,8 +1325,9 @@ def preprocess( image_std (`float` or `List[float]`, *optional*, defaults to self.image_std): Standard deviation to use when normalizing the image. do_pad (`bool`, *optional*, defaults to self.do_pad): - Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch - and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros. + Whether to pad the image. If `True`, padding will be applied to the bottom and right of + the image with zeros. If `pad_size` is provided, the image will be padded to the specified + dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. format (`str` or `AnnotationFormat`, *optional*, defaults to self.format): Format of the annotations. return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors): @@ -1280,6 +1343,9 @@ def preprocess( - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any + image in `images`. If not provided, the images will be padded to the largest height and width in the batch. 
""" if "pad_and_return_pixel_mask" in kwargs: logger.warning_once( @@ -1309,6 +1375,7 @@ def preprocess( self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations ) do_pad = self.do_pad if do_pad is None else do_pad + pad_size = self.pad_size if pad_size is None else pad_size format = self.format if format is None else format images = make_list_of_images(images) @@ -1433,6 +1500,7 @@ def preprocess( input_data_format=input_data_format, update_bboxes=do_convert_annotations, return_tensors=return_tensors, + pad_size=pad_size, ) else: images = [ diff --git a/src/transformers/models/deta/image_processing_deta.py b/src/transformers/models/deta/image_processing_deta.py index 45c5c6cb285a..6797b2a91c8f 100644 --- a/src/transformers/models/deta/image_processing_deta.py +++ b/src/transformers/models/deta/image_processing_deta.py @@ -139,6 +139,43 @@ def get_resize_output_image_size( return get_size_with_aspect_ratio(image_size, size, max_size) + +# Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width +def get_image_size_for_max_height_width( + input_image: np.ndarray, + max_height: int, + max_width: int, + input_data_format: Optional[Union[str, ChannelDimension]] = None, +) -> Tuple[int, int]: + """ + Computes the output image size given the input image and the maximum allowed height and width. Keep aspect ratio. + Important, even if image_height < max_height and image_width < max_width, the image will be resized + to at least one of the edges be equal to max_height or max_width. + + For example: + - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50) + - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400) + + Args: + input_image (`np.ndarray`): + The image to resize. + max_height (`int`): + The maximum allowed height. + max_width (`int`): + The maximum allowed width. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred from the input image. + """ + image_size = get_image_size(input_image, input_data_format) + height, width = image_size + height_scale = max_height / height + width_scale = max_width / width + min_scale = min(height_scale, width_scale) + new_height = int(height * min_scale) + new_width = int(width * min_scale) + return new_height, new_width + + # Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn def get_numpy_to_framework_fn(arr) -> Callable: """ @@ -500,8 +537,12 @@ class DetaImageProcessor(BaseImageProcessor): Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. do_pad (`bool`, *optional*, defaults to `True`): Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess` - method. If `True` will pad the images in the batch to the largest height and width in the batch. - Padding will be applied to the bottom and right of the image with zeros. + method. If `True`, padding will be applied to the bottom and right of the image with zeros. + If `pad_size` is provided, the image will be padded to the specified dimensions. + Otherwise, the image will be padded to the maximum height and width of the batch. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any + image in `images`. If not provided, the images will be padded to the largest height and width in the batch. 
""" model_input_names = ["pixel_values", "pixel_mask"] @@ -519,6 +560,7 @@ def __init__( image_std: Union[float, List[float]] = None, do_convert_annotations: bool = True, do_pad: bool = True, + pad_size: Optional[Dict[str, int]] = None, **kwargs, ) -> None: if "pad_and_return_pixel_mask" in kwargs: @@ -542,6 +584,7 @@ def __init__( self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD self.do_pad = do_pad + self.pad_size = pad_size # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_annotation with DETR->DETA def prepare_annotation( @@ -630,18 +673,22 @@ def resize( """ size = get_size_dict(size, default_to_square=False) if "shortest_edge" in size and "longest_edge" in size: - size = get_resize_output_image_size( + new_size = get_resize_output_image_size( image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format ) elif "height" in size and "width" in size: - size = (size["height"], size["width"]) + new_size = (size["height"], size["width"]) + elif "max_height" in size and "max_width" in size: + new_size = get_image_size_for_max_height_width( + image, size["max_height"], size["max_width"], input_data_format=input_data_format + ) else: raise ValueError( "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got" f" {size.keys()}." ) image = resize( - image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format + image, size=new_size, resample=resample, data_format=data_format, input_data_format=input_data_format ) return image @@ -785,6 +832,7 @@ def pad( data_format: Optional[ChannelDimension] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, update_bboxes: bool = True, + pad_size: Optional[Dict[str, int]] = None, ) -> BatchFeature: """ Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width @@ -814,8 +862,15 @@ def pad( Whether to update the bounding boxes in the annotations to match the padded images. If the bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)` format, the bounding boxes will not be updated. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any + image in `images`. If not provided, the images will be padded to the largest height and width in the batch. 
""" - pad_size = get_max_height_width(images, input_data_format=input_data_format) + pad_size = pad_size if pad_size is not None else self.pad_size + if pad_size is not None: + padded_size = (pad_size["height"], pad_size["width"]) + else: + padded_size = get_max_height_width(images, input_data_format=input_data_format) annotation_list = annotations if annotations is not None else [None] * len(images) padded_images = [] @@ -823,7 +878,7 @@ def pad( for image, annotation in zip(images, annotation_list): padded_image, padded_annotation = self._pad_image( image, - pad_size, + padded_size, annotation, constant_values=constant_values, data_format=data_format, @@ -837,7 +892,7 @@ def pad( if return_pixel_mask: masks = [ - make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format) + make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format) for image in images ] data["pixel_mask"] = masks @@ -871,6 +926,7 @@ def preprocess( return_tensors: Optional[Union[TensorType, str]] = None, data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, + pad_size: Optional[Dict[str, int]] = None, **kwargs, ) -> BatchFeature: """ @@ -916,8 +972,9 @@ def preprocess( boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)` and in relative coordinates. do_pad (`bool`, *optional*, defaults to self.do_pad): - Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch - and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros. + Whether to pad the image. If `True`, padding will be applied to the bottom and right of + the image with zeros. If `pad_size` is provided, the image will be padded to the specified + dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. format (`str` or `AnnotationFormat`, *optional*, defaults to self.format): Format of the annotations. return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors): @@ -933,6 +990,9 @@ def preprocess( - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any + image in `images`. If not provided, the images will be padded to the largest height and width in the batch. """ if "pad_and_return_pixel_mask" in kwargs: logger.warning_once( @@ -954,6 +1014,7 @@ def preprocess( self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations ) do_pad = self.do_pad if do_pad is None else do_pad + pad_size = self.pad_size if pad_size is None else pad_size format = self.format if format is None else format # Here, the pad() method pads to the maximum of (width, height). It does not need to be validated. 
@@ -1076,6 +1137,7 @@ def preprocess( input_data_format=input_data_format, return_tensors=return_tensors, update_bboxes=do_convert_annotations, + pad_size=pad_size, ) else: images = [ diff --git a/src/transformers/models/detr/image_processing_detr.py b/src/transformers/models/detr/image_processing_detr.py index c6d85ebe1ba6..c8b6115d6282 100644 --- a/src/transformers/models/detr/image_processing_detr.py +++ b/src/transformers/models/detr/image_processing_detr.py @@ -821,9 +821,11 @@ class DetrImageProcessor(BaseImageProcessor): do_pad (`bool`, *optional*, defaults to `True`): Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess` method. If `True`, padding will be applied to the bottom and right of the image with zeros. - If `max_height` and `max_width` are provided in the `size` parameter, the image will be padded to the - `max_height` and `max_width` dimensions. Otherwise, the image will be padded to the maximum height and width - of the batch. + If `pad_size` is provided, the image will be padded to the specified dimensions. + Otherwise, the image will be padded to the maximum height and width of the batch. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any + image in `images`. If not provided, the images will be padded to the largest height and width in the batch. """ model_input_names = ["pixel_values", "pixel_mask"] @@ -841,6 +843,7 @@ def __init__( image_std: Union[float, List[float]] = None, do_convert_annotations: Optional[bool] = None, do_pad: bool = True, + pad_size: Optional[Dict[str, int]] = None, **kwargs, ) -> None: if "pad_and_return_pixel_mask" in kwargs: @@ -874,6 +877,7 @@ def __init__( self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD self.do_pad = do_pad + self.pad_size = pad_size self._valid_processor_keys = [ "images", "annotations", @@ -889,6 +893,7 @@ def __init__( "image_mean", "image_std", "do_pad", + "pad_size", "format", "return_tensors", "data_format", @@ -1158,7 +1163,7 @@ def pad( data_format: Optional[ChannelDimension] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, update_bboxes: bool = True, - padded_size: Optional[Tuple[int, int]] = None, + pad_size: Optional[Dict[str, int]] = None, ) -> BatchFeature: """ Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width @@ -1188,11 +1193,14 @@ def pad( Whether to update the bounding boxes in the annotations to match the padded images. If the bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)` format, the bounding boxes will not be updated. - padded_size (`List[int, int]`, *optional*): - The size to pad the images to. If not provided, the images will be padded to the largest height and - width in the batch. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any + image in `images`. If not provided, the images will be padded to the largest height and width in the batch. 
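+        Example (sketch; `pad` only pads, it does not resize, rescale or normalize):
+
+        ```python
+        >>> import numpy as np
+        >>> from transformers import DetrImageProcessor
+        >>> processor = DetrImageProcessor()
+        >>> out = processor.pad(
+        ...     [np.zeros((3, 100, 50), dtype=np.uint8)], pad_size={"height": 100, "width": 100}, return_tensors="np"
+        ... )
+        >>> out["pixel_values"].shape
+        (1, 3, 100, 100)
+        ```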
""" - if padded_size is None: + pad_size = pad_size if pad_size is not None else self.pad_size + if pad_size is not None: + padded_size = (pad_size["height"], pad_size["width"]) + else: padded_size = get_max_height_width(images, input_data_format=input_data_format) annotation_list = annotations if annotations is not None else [None] * len(images) @@ -1249,6 +1257,7 @@ def preprocess( return_tensors: Optional[Union[TensorType, str]] = None, data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, + pad_size: Optional[Dict[str, int]] = None, **kwargs, ) -> BatchFeature: """ @@ -1294,8 +1303,9 @@ def preprocess( image_std (`float` or `List[float]`, *optional*, defaults to self.image_std): Standard deviation to use when normalizing the image. do_pad (`bool`, *optional*, defaults to self.do_pad): - Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch - and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros. + Whether to pad the image. If `True`, padding will be applied to the bottom and right of + the image with zeros. If `pad_size` is provided, the image will be padded to the specified + dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. format (`str` or `AnnotationFormat`, *optional*, defaults to self.format): Format of the annotations. return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors): @@ -1311,6 +1321,9 @@ def preprocess( - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any + image in `images`. If not provided, the images will be padded to the largest height and width in the batch. 
""" if "pad_and_return_pixel_mask" in kwargs: logger.warning_once( @@ -1340,6 +1353,7 @@ def preprocess( self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations ) do_pad = self.do_pad if do_pad is None else do_pad + pad_size = self.pad_size if pad_size is None else pad_size format = self.format if format is None else format images = make_list_of_images(images) @@ -1456,10 +1470,6 @@ def preprocess( if do_pad: # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...} - if "max_height" in size and "max_width" in size: - padded_size = (size["max_height"], size["max_width"]) - else: - padded_size = None encoded_inputs = self.pad( images, annotations=annotations, @@ -1468,7 +1478,7 @@ def preprocess( input_data_format=input_data_format, update_bboxes=do_convert_annotations, return_tensors=return_tensors, - padded_size=padded_size, + pad_size=pad_size, ) else: images = [ diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py index 8b39d6801ca0..29ee6ce394b3 100644 --- a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py @@ -152,6 +152,42 @@ def get_resize_output_image_size( return get_size_with_aspect_ratio(image_size, size, max_size) +# Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width +def get_image_size_for_max_height_width( + input_image: np.ndarray, + max_height: int, + max_width: int, + input_data_format: Optional[Union[str, ChannelDimension]] = None, +) -> Tuple[int, int]: + """ + Computes the output image size given the input image and the maximum allowed height and width. Keep aspect ratio. + Important, even if image_height < max_height and image_width < max_width, the image will be resized + to at least one of the edges be equal to max_height or max_width. + + For example: + - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50) + - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400) + + Args: + input_image (`np.ndarray`): + The image to resize. + max_height (`int`): + The maximum allowed height. + max_width (`int`): + The maximum allowed width. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred from the input image. + """ + image_size = get_image_size(input_image, input_data_format) + height, width = image_size + height_scale = max_height / height + width_scale = max_width / width + min_scale = min(height_scale, width_scale) + new_height = int(height * min_scale) + new_width = int(width * min_scale) + return new_height, new_width + + # Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn def get_numpy_to_framework_fn(arr) -> Callable: """ @@ -798,8 +834,13 @@ class GroundingDinoImageProcessor(BaseImageProcessor): bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. do_pad (`bool`, *optional*, defaults to `True`): - Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be - overridden by the `do_pad` parameter in the `preprocess` method. + Controls whether to pad the image. 
Can be overridden by the `do_pad` parameter in the `preprocess` + method. If `True`, padding will be applied to the bottom and right of the image with zeros. + If `pad_size` is provided, the image will be padded to the specified dimensions. + Otherwise, the image will be padded to the maximum height and width of the batch. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any + image in `images`. If not provided, the images will be padded to the largest height and width in the batch. """ model_input_names = ["pixel_values", "pixel_mask"] @@ -818,6 +859,7 @@ def __init__( image_std: Union[float, List[float]] = None, do_convert_annotations: Optional[bool] = None, do_pad: bool = True, + pad_size: Optional[Dict[str, int]] = None, **kwargs, ) -> None: if "pad_and_return_pixel_mask" in kwargs: @@ -851,6 +893,7 @@ def __init__( self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD self.do_pad = do_pad + self.pad_size = pad_size self._valid_processor_keys = [ "images", "annotations", @@ -866,6 +909,7 @@ def __init__( "image_mean", "image_std", "do_pad", + "pad_size", "format", "return_tensors", "data_format", @@ -983,18 +1027,27 @@ def resize( max_size = None size = get_size_dict(size, max_size=max_size, default_to_square=False) if "shortest_edge" in size and "longest_edge" in size: - size = get_resize_output_image_size( + new_size = get_resize_output_image_size( image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format ) + elif "max_height" in size and "max_width" in size: + new_size = get_image_size_for_max_height_width( + image, size["max_height"], size["max_width"], input_data_format=input_data_format + ) elif "height" in size and "width" in size: - size = (size["height"], size["width"]) + new_size = (size["height"], size["width"]) else: raise ValueError( "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got" f" {size.keys()}." ) image = resize( - image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs + image, + size=new_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, ) return image @@ -1138,6 +1191,7 @@ def pad( data_format: Optional[ChannelDimension] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, update_bboxes: bool = True, + pad_size: Optional[Dict[str, int]] = None, ) -> BatchFeature: """ Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width @@ -1167,8 +1221,15 @@ def pad( Whether to update the bounding boxes in the annotations to match the padded images. If the bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)` format, the bounding boxes will not be updated. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any + image in `images`. If not provided, the images will be padded to the largest height and width in the batch. 
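+        Example (illustrative; the contract matches the other detection processors):
+
+        ```python
+        >>> import numpy as np
+        >>> from transformers import GroundingDinoImageProcessor
+        >>> processor = GroundingDinoImageProcessor()
+        >>> out = processor.pad(
+        ...     [np.zeros((3, 50, 60), dtype=np.uint8)], pad_size={"height": 64, "width": 64}, return_tensors="np"
+        ... )
+        >>> out["pixel_values"].shape
+        (1, 3, 64, 64)
+        ```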
""" - pad_size = get_max_height_width(images, input_data_format=input_data_format) + pad_size = pad_size if pad_size is not None else self.pad_size + if pad_size is not None: + padded_size = (pad_size["height"], pad_size["width"]) + else: + padded_size = get_max_height_width(images, input_data_format=input_data_format) annotation_list = annotations if annotations is not None else [None] * len(images) padded_images = [] @@ -1176,7 +1237,7 @@ def pad( for image, annotation in zip(images, annotation_list): padded_image, padded_annotation = self._pad_image( image, - pad_size, + padded_size, annotation, constant_values=constant_values, data_format=data_format, @@ -1190,7 +1251,7 @@ def pad( if return_pixel_mask: masks = [ - make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format) + make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format) for image in images ] data["pixel_mask"] = masks @@ -1225,6 +1286,7 @@ def preprocess( return_tensors: Optional[Union[TensorType, str]] = None, data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, + pad_size: Optional[Dict[str, int]] = None, **kwargs, ) -> BatchFeature: """ @@ -1270,8 +1332,9 @@ def preprocess( image_std (`float` or `List[float]`, *optional*, defaults to self.image_std): Standard deviation to use when normalizing the image. do_pad (`bool`, *optional*, defaults to self.do_pad): - Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch - and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros. + Whether to pad the image. If `True`, padding will be applied to the bottom and right of + the image with zeros. If `pad_size` is provided, the image will be padded to the specified + dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. format (`str` or `AnnotationFormat`, *optional*, defaults to self.format): Format of the annotations. return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors): @@ -1287,6 +1350,9 @@ def preprocess( - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any + image in `images`. If not provided, the images will be padded to the largest height and width in the batch. 
""" if "pad_and_return_pixel_mask" in kwargs: logger.warning_once( @@ -1316,6 +1382,7 @@ def preprocess( self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations ) do_pad = self.do_pad if do_pad is None else do_pad + pad_size = self.pad_size if pad_size is None else pad_size format = self.format if format is None else format images = make_list_of_images(images) @@ -1440,6 +1507,7 @@ def preprocess( input_data_format=input_data_format, update_bboxes=do_convert_annotations, return_tensors=return_tensors, + pad_size=pad_size, ) else: images = [ diff --git a/src/transformers/models/yolos/image_processing_yolos.py b/src/transformers/models/yolos/image_processing_yolos.py index b74819c7a1c9..b4169173b673 100644 --- a/src/transformers/models/yolos/image_processing_yolos.py +++ b/src/transformers/models/yolos/image_processing_yolos.py @@ -133,6 +133,42 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, in return (height, width) +# Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width +def get_image_size_for_max_height_width( + input_image: np.ndarray, + max_height: int, + max_width: int, + input_data_format: Optional[Union[str, ChannelDimension]] = None, +) -> Tuple[int, int]: + """ + Computes the output image size given the input image and the maximum allowed height and width. Keep aspect ratio. + Important, even if image_height < max_height and image_width < max_width, the image will be resized + to at least one of the edges be equal to max_height or max_width. + + For example: + - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50) + - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400) + + Args: + input_image (`np.ndarray`): + The image to resize. + max_height (`int`): + The maximum allowed height. + max_width (`int`): + The maximum allowed width. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred from the input image. + """ + image_size = get_image_size(input_image, input_data_format) + height, width = image_size + height_scale = max_height / height + width_scale = max_width / width + min_scale = min(height_scale, width_scale) + new_height = int(height * min_scale) + new_width = int(width * min_scale) + return new_height, new_width + + # Copied from transformers.models.detr.image_processing_detr.get_resize_output_image_size def get_resize_output_image_size( input_image: np.ndarray, @@ -699,8 +735,12 @@ class YolosImageProcessor(BaseImageProcessor): for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method. do_pad (`bool`, *optional*, defaults to `True`): Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess` - method. If `True` will pad the images in the batch to the largest height and width in the batch. - Padding will be applied to the bottom and right of the image with zeros. + method. If `True`, padding will be applied to the bottom and right of the image with zeros. + If `pad_size` is provided, the image will be padded to the specified dimensions. + Otherwise, the image will be padded to the maximum height and width of the batch. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any + image in `images`. 
If not provided, the images will be padded to the largest height and width in the batch. """ model_input_names = ["pixel_values", "pixel_mask"] @@ -718,6 +758,7 @@ def __init__( image_std: Union[float, List[float]] = None, do_convert_annotations: Optional[bool] = None, do_pad: bool = True, + pad_size: Optional[Dict[str, int]] = None, **kwargs, ) -> None: if "pad_and_return_pixel_mask" in kwargs: @@ -751,6 +792,7 @@ def __init__( self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD self.do_pad = do_pad + self.pad_size = pad_size self._valid_processor_keys = [ "images", "annotations", @@ -766,6 +808,7 @@ def __init__( "image_std", "do_convert_annotations", "do_pad", + "pad_size", "format", "return_tensors", "data_format", @@ -883,18 +926,27 @@ def resize( max_size = None size = get_size_dict(size, max_size=max_size, default_to_square=False) if "shortest_edge" in size and "longest_edge" in size: - size = get_resize_output_image_size( + new_size = get_resize_output_image_size( image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format ) + elif "max_height" in size and "max_width" in size: + new_size = get_image_size_for_max_height_width( + image, size["max_height"], size["max_width"], input_data_format=input_data_format + ) elif "height" in size and "width" in size: - size = (size["height"], size["width"]) + new_size = (size["height"], size["width"]) else: raise ValueError( "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got" f" {size.keys()}." ) image = resize( - image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs + image, + size=new_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, ) return image @@ -1037,6 +1089,7 @@ def pad( data_format: Optional[ChannelDimension] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, update_bboxes: bool = True, + pad_size: Optional[Dict[str, int]] = None, ) -> BatchFeature: """ Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width @@ -1067,8 +1120,15 @@ def pad( Whether to update the bounding boxes in the annotations to match the padded images. If the bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)` format, the bounding boxes will not be updated. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any + image in `images`. If not provided, the images will be padded to the largest height and width in the batch. 
""" - pad_size = get_max_height_width(images, input_data_format=input_data_format) + pad_size = pad_size if pad_size is not None else self.pad_size + if pad_size is not None: + padded_size = (pad_size["height"], pad_size["width"]) + else: + padded_size = get_max_height_width(images, input_data_format=input_data_format) annotation_list = annotations if annotations is not None else [None] * len(images) padded_images = [] @@ -1076,7 +1136,7 @@ def pad( for image, annotation in zip(images, annotation_list): padded_image, padded_annotation = self._pad_image( image, - pad_size, + padded_size, annotation, constant_values=constant_values, data_format=data_format, @@ -1090,7 +1150,7 @@ def pad( if return_pixel_mask: masks = [ - make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format) + make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format) for image in images ] data["pixel_mask"] = masks @@ -1124,6 +1184,7 @@ def preprocess( return_tensors: Optional[Union[TensorType, str]] = None, data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, + pad_size: Optional[Dict[str, int]] = None, **kwargs, ) -> BatchFeature: """ @@ -1169,8 +1230,9 @@ def preprocess( boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)` and in relative coordinates. do_pad (`bool`, *optional*, defaults to self.do_pad): - Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch - and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros. + Whether to pad the image. If `True`, padding will be applied to the bottom and right of + the image with zeros. If `pad_size` is provided, the image will be padded to the specified + dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. format (`str` or `AnnotationFormat`, *optional*, defaults to self.format): Format of the annotations. return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors): @@ -1183,6 +1245,9 @@ def preprocess( - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any + image in `images`. If not provided, the images will be padded to the largest height and width in the batch. 
""" if "pad_and_return_pixel_mask" in kwargs: logger.warning_once( @@ -1212,6 +1277,7 @@ def preprocess( self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations ) do_pad = self.do_pad if do_pad is None else do_pad + pad_size = self.pad_size if pad_size is None else pad_size format = self.format if format is None else format validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys) @@ -1335,6 +1401,7 @@ def preprocess( input_data_format=input_data_format, update_bboxes=do_convert_annotations, return_tensors=return_tensors, + pad_size=pad_size, ) else: images = [ diff --git a/tests/models/conditional_detr/test_image_processing_conditional_detr.py b/tests/models/conditional_detr/test_image_processing_conditional_detr.py index e340f4247d47..5dc3913f9eeb 100644 --- a/tests/models/conditional_detr/test_image_processing_conditional_detr.py +++ b/tests/models/conditional_detr/test_image_processing_conditional_detr.py @@ -490,3 +490,52 @@ def test_batched_coco_panoptic_annotations(self): ).T self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1)) self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) + + # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->ConditionalDetr + def test_max_width_max_height_resizing_and_pad_strategy(self): + image_1 = torch.ones([200, 100, 3], dtype=torch.uint8) + + # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50 + image_processor = ConditionalDetrImageProcessor( + size={"max_height": 100, "max_width": 100}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50])) + + # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100 + image_processor = ConditionalDetrImageProcessor( + size={"max_height": 300, "max_width": 100}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + + # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 + image_processor = ConditionalDetrImageProcessor( + size={"max_height": 100, "max_width": 100}, + do_pad=True, + pad_size={"height": 100, "width": 100} + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100])) + + # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100 + image_processor = ConditionalDetrImageProcessor( + size={"max_height": 300, "max_width": 100}, + do_pad=True, + pad_size={"height": 301, "width": 101}, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101])) + + ### Check for batch + image_2 = torch.ones([100, 150, 3], dtype=torch.uint8) + + # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100 + image_processor = ConditionalDetrImageProcessor( + size={"max_height": 150, "max_width": 100}, + do_pad=True, + pad_size={"height": 150, "width": 100}, + ) + inputs = image_processor(images=[image_1, image_2], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100])) diff --git a/tests/models/deformable_detr/test_image_processing_deformable_detr.py b/tests/models/deformable_detr/test_image_processing_deformable_detr.py index 
50df72496ffc..887c40cd2e10 100644 --- a/tests/models/deformable_detr/test_image_processing_deformable_detr.py +++ b/tests/models/deformable_detr/test_image_processing_deformable_detr.py @@ -492,3 +492,52 @@ def test_batched_coco_panoptic_annotations(self): ).T self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1)) self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) + + # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->DeformableDetr + def test_max_width_max_height_resizing_and_pad_strategy(self): + image_1 = torch.ones([200, 100, 3], dtype=torch.uint8) + + # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50 + image_processor = DeformableDetrImageProcessor( + size={"max_height": 100, "max_width": 100}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50])) + + # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100 + image_processor = DeformableDetrImageProcessor( + size={"max_height": 300, "max_width": 100}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + + # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 + image_processor = DeformableDetrImageProcessor( + size={"max_height": 100, "max_width": 100}, + do_pad=True, + pad_size={"height": 100, "width": 100} + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100])) + + # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100 + image_processor = DeformableDetrImageProcessor( + size={"max_height": 300, "max_width": 100}, + do_pad=True, + pad_size={"height": 301, "width": 101}, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101])) + + ### Check for batch + image_2 = torch.ones([100, 150, 3], dtype=torch.uint8) + + # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100 + image_processor = DeformableDetrImageProcessor( + size={"max_height": 150, "max_width": 100}, + do_pad=True, + pad_size={"height": 150, "width": 100}, + ) + inputs = image_processor(images=[image_1, image_2], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100])) diff --git a/tests/models/deta/test_image_processing_deta.py b/tests/models/deta/test_image_processing_deta.py index ad17f0b5a178..d9f90a4b9eb1 100644 --- a/tests/models/deta/test_image_processing_deta.py +++ b/tests/models/deta/test_image_processing_deta.py @@ -486,3 +486,52 @@ def test_batched_coco_panoptic_annotations(self): ).T self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1)) self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) + + # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->Deta + def test_max_width_max_height_resizing_and_pad_strategy(self): + image_1 = torch.ones([200, 100, 3], dtype=torch.uint8) + + # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50 + image_processor = DetaImageProcessor( + size={"max_height": 100, "max_width": 100}, + do_pad=False, + ) + inputs = 
image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50])) + + # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100 + image_processor = DetaImageProcessor( + size={"max_height": 300, "max_width": 100}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + + # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 + image_processor = DetaImageProcessor( + size={"max_height": 100, "max_width": 100}, + do_pad=True, + pad_size={"height": 100, "width": 100} + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100])) + + # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100 + image_processor = DetaImageProcessor( + size={"max_height": 300, "max_width": 100}, + do_pad=True, + pad_size={"height": 301, "width": 101}, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101])) + + ### Check for batch + image_2 = torch.ones([100, 150, 3], dtype=torch.uint8) + + # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100 + image_processor = DetaImageProcessor( + size={"max_height": 150, "max_width": 100}, + do_pad=True, + pad_size={"height": 150, "width": 100}, + ) + inputs = image_processor(images=[image_1, image_2], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100])) diff --git a/tests/models/detr/test_image_processing_detr.py b/tests/models/detr/test_image_processing_detr.py index 8e5ec579a9f6..82d62d1fe453 100644 --- a/tests/models/detr/test_image_processing_detr.py +++ b/tests/models/detr/test_image_processing_detr.py @@ -569,7 +569,8 @@ def test_max_width_max_height_resizing_and_pad_strategy(self): # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 image_processor = DetrImageProcessor( size={"max_height": 100, "max_width": 100}, - do_pad=True, + do_pad=True, + pad_size={"height": 100, "width": 100} ) inputs = image_processor(images=[image_1], return_tensors="pt") self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100])) @@ -578,9 +579,10 @@ def test_max_width_max_height_resizing_and_pad_strategy(self): image_processor = DetrImageProcessor( size={"max_height": 300, "max_width": 100}, do_pad=True, + pad_size={"height": 301, "width": 101}, ) inputs = image_processor(images=[image_1], return_tensors="pt") - self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 300, 100])) + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101])) ### Check for batch image_2 = torch.ones([100, 150, 3], dtype=torch.uint8) @@ -589,6 +591,7 @@ def test_max_width_max_height_resizing_and_pad_strategy(self): image_processor = DetrImageProcessor( size={"max_height": 150, "max_width": 100}, do_pad=True, + pad_size={"height": 150, "width": 100}, ) inputs = image_processor(images=[image_1, image_2], return_tensors="pt") self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100])) diff --git a/tests/models/grounding_dino/test_image_processing_grounding_dino.py b/tests/models/grounding_dino/test_image_processing_grounding_dino.py index df69784bbb45..7ec4a8c248e4 100644 --- a/tests/models/grounding_dino/test_image_processing_grounding_dino.py +++ b/tests/models/grounding_dino/test_image_processing_grounding_dino.py @@ -528,3 +528,52 @@ def 
test_batched_coco_panoptic_annotations(self): ).T self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1)) self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) + + # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->GroundingDino + def test_max_width_max_height_resizing_and_pad_strategy(self): + image_1 = torch.ones([200, 100, 3], dtype=torch.uint8) + + # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50 + image_processor = GroundingDinoImageProcessor( + size={"max_height": 100, "max_width": 100}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50])) + + # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100 + image_processor = GroundingDinoImageProcessor( + size={"max_height": 300, "max_width": 100}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + + # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 + image_processor = GroundingDinoImageProcessor( + size={"max_height": 100, "max_width": 100}, + do_pad=True, + pad_size={"height": 100, "width": 100} + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100])) + + # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100 + image_processor = GroundingDinoImageProcessor( + size={"max_height": 300, "max_width": 100}, + do_pad=True, + pad_size={"height": 301, "width": 101}, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101])) + + ### Check for batch + image_2 = torch.ones([100, 150, 3], dtype=torch.uint8) + + # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100 + image_processor = GroundingDinoImageProcessor( + size={"max_height": 150, "max_width": 100}, + do_pad=True, + pad_size={"height": 150, "width": 100}, + ) + inputs = image_processor(images=[image_1, image_2], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100])) diff --git a/tests/models/yolos/test_image_processing_yolos.py b/tests/models/yolos/test_image_processing_yolos.py index f7465779b594..0cabb2ccd309 100644 --- a/tests/models/yolos/test_image_processing_yolos.py +++ b/tests/models/yolos/test_image_processing_yolos.py @@ -546,3 +546,52 @@ def test_batched_coco_panoptic_annotations(self): ).T self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1)) self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) + + # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->Yolos + def test_max_width_max_height_resizing_and_pad_strategy(self): + image_1 = torch.ones([200, 100, 3], dtype=torch.uint8) + + # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50 + image_processor = YolosImageProcessor( + size={"max_height": 100, "max_width": 100}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50])) + + # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100 + 
image_processor = YolosImageProcessor( + size={"max_height": 300, "max_width": 100}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + + # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 + image_processor = YolosImageProcessor( + size={"max_height": 100, "max_width": 100}, + do_pad=True, + pad_size={"height": 100, "width": 100} + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100])) + + # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100 + image_processor = YolosImageProcessor( + size={"max_height": 300, "max_width": 100}, + do_pad=True, + pad_size={"height": 301, "width": 101}, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101])) + + ### Check for batch + image_2 = torch.ones([100, 150, 3], dtype=torch.uint8) + + # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100 + image_processor = YolosImageProcessor( + size={"max_height": 150, "max_width": 100}, + do_pad=True, + pad_size={"height": 150, "width": 100}, + ) + inputs = image_processor(images=[image_1, image_2], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100])) From 47f418418566e309f6602ecee09ad0b878db880d Mon Sep 17 00:00:00 2001 From: Pavel Iakubovskii Date: Wed, 15 May 2024 15:27:14 +0000 Subject: [PATCH 4/6] Fixup --- .../image_processing_conditional_detr.py | 4 ++-- .../image_processing_deformable_detr.py | 7 +++---- src/transformers/models/deta/image_processing_deta.py | 9 ++++----- src/transformers/models/detr/image_processing_detr.py | 10 +++++----- .../grounding_dino/image_processing_grounding_dino.py | 6 +++--- .../models/yolos/image_processing_yolos.py | 10 +++++----- .../test_image_processing_conditional_detr.py | 4 +--- .../test_image_processing_deformable_detr.py | 4 +--- tests/models/deta/test_image_processing_deta.py | 4 +--- tests/models/detr/test_image_processing_detr.py | 4 +--- .../test_image_processing_grounding_dino.py | 4 +--- tests/models/yolos/test_image_processing_yolos.py | 4 +--- 12 files changed, 28 insertions(+), 42 deletions(-) diff --git a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py index 1cd153cc58d0..e8ad90cb2b71 100644 --- a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py +++ b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py @@ -1322,8 +1322,8 @@ def preprocess( image_std (`float` or `List[float]`, *optional*, defaults to self.image_std): Standard deviation to use when normalizing the image. do_pad (`bool`, *optional*, defaults to self.do_pad): - Whether to pad the image. If `True`, padding will be applied to the bottom and right of - the image with zeros. If `pad_size` is provided, the image will be padded to the specified + Whether to pad the image. If `True`, padding will be applied to the bottom and right of + the image with zeros. If `pad_size` is provided, the image will be padded to the specified dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. format (`str` or `AnnotationFormat`, *optional*, defaults to self.format): Format of the annotations. 
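For context on the behavior these hunks document, here is a minimal usage sketch. It is illustrative only and not part of any hunk in this series; it assumes torch and transformers are installed and uses only the constructor arguments exercised by the tests added earlier (size with max_height/max_width, do_pad, pad_size):

    import torch
    from transformers import DetrImageProcessor

    image = torch.ones([200, 100, 3], dtype=torch.uint8)  # (height, width, channels)

    processor = DetrImageProcessor(
        size={"max_height": 100, "max_width": 100},  # resize 200x100 -> 100x50, aspect ratio kept
        do_pad=True,
        pad_size={"height": 120, "width": 100},      # pad bottom/right with zeros -> 120x100
    )
    inputs = processor(images=[image], return_tensors="pt")
    print(inputs["pixel_values"].shape)  # torch.Size([1, 3, 120, 100])
    print(inputs["pixel_mask"].shape)    # torch.Size([1, 120, 100]); 1 on image pixels, 0 on padding

With do_pad=True and no pad_size, the batch is instead padded to its own largest height and width, so a single-image batch comes out at the resized resolution.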
diff --git a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py index 875814596d6d..ea0d7c0a649d 100644 --- a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py +++ b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py @@ -145,7 +145,6 @@ def get_resize_output_image_size( return get_size_with_aspect_ratio(image_size, size, max_size) - # Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width def get_image_size_for_max_height_width( input_image: np.ndarray, @@ -832,7 +831,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor): If `pad_size` is provided, the image will be padded to the specified dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. Must be larger than any + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image in `images`. If not provided, the images will be padded to the largest height and width in the batch. """ @@ -1325,8 +1324,8 @@ def preprocess( image_std (`float` or `List[float]`, *optional*, defaults to self.image_std): Standard deviation to use when normalizing the image. do_pad (`bool`, *optional*, defaults to self.do_pad): - Whether to pad the image. If `True`, padding will be applied to the bottom and right of - the image with zeros. If `pad_size` is provided, the image will be padded to the specified + Whether to pad the image. If `True`, padding will be applied to the bottom and right of + the image with zeros. If `pad_size` is provided, the image will be padded to the specified dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. format (`str` or `AnnotationFormat`, *optional*, defaults to self.format): Format of the annotations. diff --git a/src/transformers/models/deta/image_processing_deta.py b/src/transformers/models/deta/image_processing_deta.py index 6797b2a91c8f..8d9be776e49f 100644 --- a/src/transformers/models/deta/image_processing_deta.py +++ b/src/transformers/models/deta/image_processing_deta.py @@ -139,7 +139,6 @@ def get_resize_output_image_size( return get_size_with_aspect_ratio(image_size, size, max_size) - # Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width def get_image_size_for_max_height_width( input_image: np.ndarray, @@ -541,7 +540,7 @@ class DetaImageProcessor(BaseImageProcessor): If `pad_size` is provided, the image will be padded to the specified dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. Must be larger than any + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image in `images`. If not provided, the images will be padded to the largest height and width in the batch. """ @@ -972,8 +971,8 @@ def preprocess( boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)` and in relative coordinates. do_pad (`bool`, *optional*, defaults to self.do_pad): - Whether to pad the image. If `True`, padding will be applied to the bottom and right of - the image with zeros. 
If `pad_size` is provided, the image will be padded to the specified + Whether to pad the image. If `True`, padding will be applied to the bottom and right of + the image with zeros. If `pad_size` is provided, the image will be padded to the specified dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. format (`str` or `AnnotationFormat`, *optional*, defaults to self.format): Format of the annotations. @@ -991,7 +990,7 @@ def preprocess( - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. Must be larger than any + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image in `images`. If not provided, the images will be padded to the largest height and width in the batch. """ if "pad_and_return_pixel_mask" in kwargs: diff --git a/src/transformers/models/detr/image_processing_detr.py b/src/transformers/models/detr/image_processing_detr.py index c8b6115d6282..bb1c562387f2 100644 --- a/src/transformers/models/detr/image_processing_detr.py +++ b/src/transformers/models/detr/image_processing_detr.py @@ -824,7 +824,7 @@ class DetrImageProcessor(BaseImageProcessor): If `pad_size` is provided, the image will be padded to the specified dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. Must be larger than any + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image in `images`. If not provided, the images will be padded to the largest height and width in the batch. """ @@ -1194,7 +1194,7 @@ def pad( bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)` format, the bounding boxes will not be updated. pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. Must be larger than any + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image in `images`. If not provided, the images will be padded to the largest height and width in the batch. """ pad_size = pad_size if pad_size is not None else self.pad_size @@ -1303,8 +1303,8 @@ def preprocess( image_std (`float` or `List[float]`, *optional*, defaults to self.image_std): Standard deviation to use when normalizing the image. do_pad (`bool`, *optional*, defaults to self.do_pad): - Whether to pad the image. If `True`, padding will be applied to the bottom and right of - the image with zeros. If `pad_size` is provided, the image will be padded to the specified + Whether to pad the image. If `True`, padding will be applied to the bottom and right of + the image with zeros. If `pad_size` is provided, the image will be padded to the specified dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. format (`str` or `AnnotationFormat`, *optional*, defaults to self.format): Format of the annotations. @@ -1322,7 +1322,7 @@ def preprocess( - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. 
Must be larger than any + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image in `images`. If not provided, the images will be padded to the largest height and width in the batch. """ if "pad_and_return_pixel_mask" in kwargs: diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py index 29ee6ce394b3..25614da53c06 100644 --- a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py @@ -839,7 +839,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor): If `pad_size` is provided, the image will be padded to the specified dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. Must be larger than any + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image in `images`. If not provided, the images will be padded to the largest height and width in the batch. """ @@ -1332,8 +1332,8 @@ def preprocess( image_std (`float` or `List[float]`, *optional*, defaults to self.image_std): Standard deviation to use when normalizing the image. do_pad (`bool`, *optional*, defaults to self.do_pad): - Whether to pad the image. If `True`, padding will be applied to the bottom and right of - the image with zeros. If `pad_size` is provided, the image will be padded to the specified + Whether to pad the image. If `True`, padding will be applied to the bottom and right of + the image with zeros. If `pad_size` is provided, the image will be padded to the specified dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. format (`str` or `AnnotationFormat`, *optional*, defaults to self.format): Format of the annotations. diff --git a/src/transformers/models/yolos/image_processing_yolos.py b/src/transformers/models/yolos/image_processing_yolos.py index b4169173b673..b015c54a1e22 100644 --- a/src/transformers/models/yolos/image_processing_yolos.py +++ b/src/transformers/models/yolos/image_processing_yolos.py @@ -739,7 +739,7 @@ class YolosImageProcessor(BaseImageProcessor): If `pad_size` is provided, the image will be padded to the specified dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. Must be larger than any + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image in `images`. If not provided, the images will be padded to the largest height and width in the batch. """ @@ -1121,7 +1121,7 @@ def pad( bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)` format, the bounding boxes will not be updated. pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. Must be larger than any + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image in `images`. If not provided, the images will be padded to the largest height and width in the batch. 
""" pad_size = pad_size if pad_size is not None else self.pad_size @@ -1230,8 +1230,8 @@ def preprocess( boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)` and in relative coordinates. do_pad (`bool`, *optional*, defaults to self.do_pad): - Whether to pad the image. If `True`, padding will be applied to the bottom and right of - the image with zeros. If `pad_size` is provided, the image will be padded to the specified + Whether to pad the image. If `True`, padding will be applied to the bottom and right of + the image with zeros. If `pad_size` is provided, the image will be padded to the specified dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. format (`str` or `AnnotationFormat`, *optional*, defaults to self.format): Format of the annotations. @@ -1246,7 +1246,7 @@ def preprocess( - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. Must be larger than any + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image in `images`. If not provided, the images will be padded to the largest height and width in the batch. """ if "pad_and_return_pixel_mask" in kwargs: diff --git a/tests/models/conditional_detr/test_image_processing_conditional_detr.py b/tests/models/conditional_detr/test_image_processing_conditional_detr.py index 5dc3913f9eeb..61dcdc873dc3 100644 --- a/tests/models/conditional_detr/test_image_processing_conditional_detr.py +++ b/tests/models/conditional_detr/test_image_processing_conditional_detr.py @@ -512,9 +512,7 @@ def test_max_width_max_height_resizing_and_pad_strategy(self): # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 image_processor = ConditionalDetrImageProcessor( - size={"max_height": 100, "max_width": 100}, - do_pad=True, - pad_size={"height": 100, "width": 100} + size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100} ) inputs = image_processor(images=[image_1], return_tensors="pt") self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100])) diff --git a/tests/models/deformable_detr/test_image_processing_deformable_detr.py b/tests/models/deformable_detr/test_image_processing_deformable_detr.py index 887c40cd2e10..49139c753938 100644 --- a/tests/models/deformable_detr/test_image_processing_deformable_detr.py +++ b/tests/models/deformable_detr/test_image_processing_deformable_detr.py @@ -514,9 +514,7 @@ def test_max_width_max_height_resizing_and_pad_strategy(self): # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 image_processor = DeformableDetrImageProcessor( - size={"max_height": 100, "max_width": 100}, - do_pad=True, - pad_size={"height": 100, "width": 100} + size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100} ) inputs = image_processor(images=[image_1], return_tensors="pt") self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100])) diff --git a/tests/models/deta/test_image_processing_deta.py b/tests/models/deta/test_image_processing_deta.py index d9f90a4b9eb1..3ea5885b0e09 100644 --- a/tests/models/deta/test_image_processing_deta.py +++ b/tests/models/deta/test_image_processing_deta.py @@ -508,9 +508,7 @@ def 
test_max_width_max_height_resizing_and_pad_strategy(self): # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 image_processor = DetaImageProcessor( - size={"max_height": 100, "max_width": 100}, - do_pad=True, - pad_size={"height": 100, "width": 100} + size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100} ) inputs = image_processor(images=[image_1], return_tensors="pt") self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100])) diff --git a/tests/models/detr/test_image_processing_detr.py b/tests/models/detr/test_image_processing_detr.py index 82d62d1fe453..ede06be6c521 100644 --- a/tests/models/detr/test_image_processing_detr.py +++ b/tests/models/detr/test_image_processing_detr.py @@ -568,9 +568,7 @@ def test_max_width_max_height_resizing_and_pad_strategy(self): # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 image_processor = DetrImageProcessor( - size={"max_height": 100, "max_width": 100}, - do_pad=True, - pad_size={"height": 100, "width": 100} + size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100} ) inputs = image_processor(images=[image_1], return_tensors="pt") self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100])) diff --git a/tests/models/grounding_dino/test_image_processing_grounding_dino.py b/tests/models/grounding_dino/test_image_processing_grounding_dino.py index 7ec4a8c248e4..5cd09ce23816 100644 --- a/tests/models/grounding_dino/test_image_processing_grounding_dino.py +++ b/tests/models/grounding_dino/test_image_processing_grounding_dino.py @@ -550,9 +550,7 @@ def test_max_width_max_height_resizing_and_pad_strategy(self): # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 image_processor = GroundingDinoImageProcessor( - size={"max_height": 100, "max_width": 100}, - do_pad=True, - pad_size={"height": 100, "width": 100} + size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100} ) inputs = image_processor(images=[image_1], return_tensors="pt") self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100])) diff --git a/tests/models/yolos/test_image_processing_yolos.py b/tests/models/yolos/test_image_processing_yolos.py index 0cabb2ccd309..f04015ac0c9b 100644 --- a/tests/models/yolos/test_image_processing_yolos.py +++ b/tests/models/yolos/test_image_processing_yolos.py @@ -568,9 +568,7 @@ def test_max_width_max_height_resizing_and_pad_strategy(self): # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 image_processor = YolosImageProcessor( - size={"max_height": 100, "max_width": 100}, - do_pad=True, - pad_size={"height": 100, "width": 100} + size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100} ) inputs = image_processor(images=[image_1], return_tensors="pt") self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100])) From 787377951409e67fb52b7ba9f6d92270f3a583cb Mon Sep 17 00:00:00 2001 From: Pavel Iakubovskii Date: Wed, 15 May 2024 17:15:23 +0000 Subject: [PATCH 5/6] Update docstrings --- .../image_processing_conditional_detr.py | 52 +++++++++++++++---- .../image_processing_deformable_detr.py | 48 +++++++++++++---- .../models/deta/image_processing_deta.py | 47 +++++++++++++---- .../models/detr/image_processing_detr.py | 36 +++++++++---- .../image_processing_grounding_dino.py | 48 +++++++++++++---- .../models/yolos/image_processing_yolos.py | 48 +++++++++++++---- 6 
files changed, 216 insertions(+), 63 deletions(-) diff --git a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py index e8ad90cb2b71..9cf170ee241b 100644 --- a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py +++ b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py @@ -804,8 +804,16 @@ class ConditionalDetrImageProcessor(BaseImageProcessor): Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the `do_resize` parameter in the `preprocess` method. size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`): - Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in - the `preprocess` method. + Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter + in the `preprocess` method. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): Resampling filter to use if resizing the image. do_rescale (`bool`, *optional*, defaults to `True`): @@ -829,8 +837,13 @@ class ConditionalDetrImageProcessor(BaseImageProcessor): Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. do_pad (`bool`, *optional*, defaults to `True`): Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess` - method. If `True` will pad the images in the batch to the largest height and width in the batch. - Padding will be applied to the bottom and right of the image with zeros. + method. If `True`, padding will be applied to the bottom and right of the image with zeros. + If `pad_size` is provided, the image will be padded to the specified dimensions. + Otherwise, the image will be padded to the maximum height and width of the batch. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. """ model_input_names = ["pixel_values", "pixel_mask"] @@ -997,8 +1010,15 @@ def resize( image (`np.ndarray`): Image to resize. size (`Dict[str, int]`): - Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or - `height` and `width`. + Size of the image's `(height, width)` dimensions after resizing. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): Resampling filter to use if resizing the image. data_format (`str` or `ChannelDimension`, *optional*): @@ -1212,8 +1232,9 @@ def pad( bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)` format, the bounding boxes will not be updated. pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. Must be larger than any - image in `images`. If not provided, the images will be padded to the largest height and width in the batch. + The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. """ pad_size = pad_size if pad_size is not None else self.pad_size if pad_size is not None: @@ -1304,7 +1325,15 @@ def preprocess( do_resize (`bool`, *optional*, defaults to self.do_resize): Whether to resize the image. size (`Dict[str, int]`, *optional*, defaults to self.size): - Size of the image after resizing. + Size of the image's `(height, width)` dimensions after resizing. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. resample (`PILImageResampling`, *optional*, defaults to self.resample): Resampling filter to use when resizing the image. do_rescale (`bool`, *optional*, defaults to self.do_rescale): @@ -1341,8 +1370,9 @@ def preprocess( - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. Must be larger than any - image in `images`. If not provided, the images will be padded to the largest height and width in the batch. + The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. """ if "pad_and_return_pixel_mask" in kwargs: logger.warning_once( diff --git a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py index ea0d7c0a649d..d0394282f64c 100644 --- a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py +++ b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py @@ -802,8 +802,16 @@ class DeformableDetrImageProcessor(BaseImageProcessor): Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the `do_resize` parameter in the `preprocess` method. size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`): - Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in - the `preprocess` method. + Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter + in the `preprocess` method. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): Resampling filter to use if resizing the image. do_rescale (`bool`, *optional*, defaults to `True`): @@ -831,8 +839,9 @@ class DeformableDetrImageProcessor(BaseImageProcessor): If `pad_size` is provided, the image will be padded to the specified dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. Must be larger than any - image in `images`. If not provided, the images will be padded to the largest height and width in the batch. + The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. """ model_input_names = ["pixel_values", "pixel_mask"] @@ -999,8 +1008,15 @@ def resize( image (`np.ndarray`): Image to resize. size (`Dict[str, int]`): - Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or - `height` and `width`. + Size of the image's `(height, width)` dimensions after resizing. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): Resampling filter to use if resizing the image. data_format (`str` or `ChannelDimension`, *optional*): @@ -1214,8 +1230,9 @@ def pad( bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)` format, the bounding boxes will not be updated. pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. Must be larger than any - image in `images`. If not provided, the images will be padded to the largest height and width in the batch. + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. """ pad_size = pad_size if pad_size is not None else self.pad_size if pad_size is not None: @@ -1306,7 +1323,15 @@ def preprocess( do_resize (`bool`, *optional*, defaults to self.do_resize): Whether to resize the image. size (`Dict[str, int]`, *optional*, defaults to self.size): - Size of the image after resizing. + Size of the image's `(height, width)` dimensions after resizing. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. resample (`PILImageResampling`, *optional*, defaults to self.resample): Resampling filter to use when resizing the image. do_rescale (`bool`, *optional*, defaults to self.do_rescale): @@ -1343,8 +1368,9 @@ def preprocess( - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. Must be larger than any - image in `images`. If not provided, the images will be padded to the largest height and width in the batch. + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. """ if "pad_and_return_pixel_mask" in kwargs: logger.warning_once( diff --git a/src/transformers/models/deta/image_processing_deta.py b/src/transformers/models/deta/image_processing_deta.py index 8d9be776e49f..cfbf2aa390c7 100644 --- a/src/transformers/models/deta/image_processing_deta.py +++ b/src/transformers/models/deta/image_processing_deta.py @@ -511,8 +511,16 @@ class DetaImageProcessor(BaseImageProcessor): Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the `do_resize` parameter in the `preprocess` method. size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`): - Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in - the `preprocess` method. 
+ Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter + in the `preprocess` method. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): Resampling filter to use if resizing the image. do_rescale (`bool`, *optional*, defaults to `True`): @@ -540,8 +548,9 @@ class DetaImageProcessor(BaseImageProcessor): If `pad_size` is provided, the image will be padded to the specified dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. Must be larger than any - image in `images`. If not provided, the images will be padded to the largest height and width in the batch. + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. """ model_input_names = ["pixel_values", "pixel_mask"] @@ -660,7 +669,15 @@ def resize( image (`np.ndarray`): Image to resize. size (`Dict[str, int]`): - The desired output size. Can contain keys `shortest_edge` and `longest_edge` or `height` and `width`. + Size of the image's `(height, width)` dimensions after resizing. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): Resampling filter to use if resizing the image. data_format (`ChannelDimension`, *optional*): @@ -862,8 +879,9 @@ def pad( bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)` format, the bounding boxes will not be updated. pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. Must be larger than any - image in `images`. If not provided, the images will be padded to the largest height and width in the batch. + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. 
""" pad_size = pad_size if pad_size is not None else self.pad_size if pad_size is not None: @@ -953,7 +971,15 @@ def preprocess( do_resize (`bool`, *optional*, defaults to self.do_resize): Whether to resize the image. size (`Dict[str, int]`, *optional*, defaults to self.size): - Size of the image after resizing. + Size of the image's `(height, width)` dimensions after resizing. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. resample (`PILImageResampling`, *optional*, defaults to self.resample): Resampling filter to use when resizing the image. do_rescale (`bool`, *optional*, defaults to self.do_rescale): @@ -990,8 +1016,9 @@ def preprocess( - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. Must be larger than any - image in `images`. If not provided, the images will be padded to the largest height and width in the batch. + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. """ if "pad_and_return_pixel_mask" in kwargs: logger.warning_once( diff --git a/src/transformers/models/detr/image_processing_detr.py b/src/transformers/models/detr/image_processing_detr.py index bb1c562387f2..3f17bfb3a974 100644 --- a/src/transformers/models/detr/image_processing_detr.py +++ b/src/transformers/models/detr/image_processing_detr.py @@ -824,8 +824,9 @@ class DetrImageProcessor(BaseImageProcessor): If `pad_size` is provided, the image will be padded to the specified dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. Must be larger than any - image in `images`. If not provided, the images will be padded to the largest height and width in the batch. + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. """ model_input_names = ["pixel_values", "pixel_mask"] @@ -984,8 +985,15 @@ def resize( image (`np.ndarray`): Image to resize. size (`Dict[str, int]`): - Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or - `height` and `width`. + Size of the image's `(height, width)` dimensions after resizing. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. 
+                - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to the maximum size that
+                  preserves the aspect ratio, keeping the shortest edge less than or equal to `shortest_edge` and
+                  the longest edge less than or equal to `longest_edge`.
+                - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size that
+                  preserves the aspect ratio, keeping the height less than or equal to `max_height` and the width
+                  less than or equal to `max_width`.
             resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
                 Resampling filter to use if resizing the image.
             data_format (`str` or `ChannelDimension`, *optional*):
@@ -1194,8 +1202,9 @@ def pad(
                 bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
                 format, the bounding boxes will not be updated.
             pad_size (`Dict[str, int]`, *optional*):
-                The size `{"height": int, "width" int}` to pad the images to. Must be larger than any
-                image in `images`. If not provided, the images will be padded to the largest height and width in the batch.
+                The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
+                provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+                height and width in the batch.
         """
         pad_size = pad_size if pad_size is not None else self.pad_size
         if pad_size is not None:
@@ -1285,7 +1294,15 @@ def preprocess(
             do_resize (`bool`, *optional*, defaults to self.do_resize):
                 Whether to resize the image.
             size (`Dict[str, int]`, *optional*, defaults to self.size):
-                Size of the image after resizing.
+                Size of the image's `(height, width)` dimensions after resizing. Available options are:
+                - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+                  The aspect ratio is NOT preserved.
+                - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to the maximum size that
+                  preserves the aspect ratio, keeping the shortest edge less than or equal to `shortest_edge` and
+                  the longest edge less than or equal to `longest_edge`.
+                - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size that
+                  preserves the aspect ratio, keeping the height less than or equal to `max_height` and the width
+                  less than or equal to `max_width`.
             resample (`PILImageResampling`, *optional*, defaults to self.resample):
                 Resampling filter to use when resizing the image.
             do_rescale (`bool`, *optional*, defaults to self.do_rescale):
@@ -1322,8 +1339,9 @@ def preprocess(
                 - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                 - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
             pad_size (`Dict[str, int]`, *optional*):
-                The size `{"height": int, "width" int}` to pad the images to. Must be larger than any
-                image in `images`. If not provided, the images will be padded to the largest height and width in the batch.
+                The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
+                provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+                height and width in the batch.
""" if "pad_and_return_pixel_mask" in kwargs: logger.warning_once( diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py index 25614da53c06..2c9c4c4bc693 100644 --- a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py @@ -809,8 +809,16 @@ class GroundingDinoImageProcessor(BaseImageProcessor): Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the `do_resize` parameter in the `preprocess` method. size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`): - Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in - the `preprocess` method. + Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter + in the `preprocess` method. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): Resampling filter to use if resizing the image. do_rescale (`bool`, *optional*, defaults to `True`): @@ -839,8 +847,9 @@ class GroundingDinoImageProcessor(BaseImageProcessor): If `pad_size` is provided, the image will be padded to the specified dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. Must be larger than any - image in `images`. If not provided, the images will be padded to the largest height and width in the batch. + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. """ model_input_names = ["pixel_values", "pixel_mask"] @@ -1007,8 +1016,15 @@ def resize( image (`np.ndarray`): Image to resize. size (`Dict[str, int]`): - Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or - `height` and `width`. + Size of the image's `(height, width)` dimensions after resizing. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. 
             resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
                 Resampling filter to use if resizing the image.
             data_format (`str` or `ChannelDimension`, *optional*):
@@ -1222,8 +1238,9 @@ def pad(
                 bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
                 format, the bounding boxes will not be updated.
             pad_size (`Dict[str, int]`, *optional*):
-                The size `{"height": int, "width" int}` to pad the images to. Must be larger than any
-                image in `images`. If not provided, the images will be padded to the largest height and width in the batch.
+                The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
+                provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+                height and width in the batch.
         """
         pad_size = pad_size if pad_size is not None else self.pad_size
         if pad_size is not None:
@@ -1314,7 +1331,15 @@ def preprocess(
             do_resize (`bool`, *optional*, defaults to self.do_resize):
                 Whether to resize the image.
             size (`Dict[str, int]`, *optional*, defaults to self.size):
-                Size of the image after resizing.
+                Size of the image's `(height, width)` dimensions after resizing. Available options are:
+                - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+                  The aspect ratio is NOT preserved.
+                - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to the maximum size that
+                  preserves the aspect ratio, keeping the shortest edge less than or equal to `shortest_edge` and
+                  the longest edge less than or equal to `longest_edge`.
+                - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size that
+                  preserves the aspect ratio, keeping the height less than or equal to `max_height` and the width
+                  less than or equal to `max_width`.
             resample (`PILImageResampling`, *optional*, defaults to self.resample):
                 Resampling filter to use when resizing the image.
             do_rescale (`bool`, *optional*, defaults to self.do_rescale):
@@ -1351,8 +1376,9 @@ def preprocess(
                 - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                 - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
             pad_size (`Dict[str, int]`, *optional*):
-                The size `{"height": int, "width" int}` to pad the images to. Must be larger than any
-                image in `images`. If not provided, the images will be padded to the largest height and width in the batch.
+                The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
+                provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+                height and width in the batch.
         """
         if "pad_and_return_pixel_mask" in kwargs:
             logger.warning_once(
diff --git a/src/transformers/models/yolos/image_processing_yolos.py b/src/transformers/models/yolos/image_processing_yolos.py
index b015c54a1e22..0548fa1a8ea4 100644
--- a/src/transformers/models/yolos/image_processing_yolos.py
+++ b/src/transformers/models/yolos/image_processing_yolos.py
@@ -714,8 +714,16 @@ class YolosImageProcessor(BaseImageProcessor):
         Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be
         overridden by the `do_resize` parameter in the `preprocess` method.
     size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
-        Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in
-        the `preprocess` method.
+        Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
+        in the `preprocess` method. Available options are:
+        - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+          The aspect ratio is NOT preserved.
+        - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to the maximum size that
+          preserves the aspect ratio, keeping the shortest edge less than or equal to `shortest_edge` and the
+          longest edge less than or equal to `longest_edge`.
+        - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size that preserves
+          the aspect ratio, keeping the height less than or equal to `max_height` and the width less than or
+          equal to `max_width`.
     resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
         Resampling filter to use if resizing the image.
     do_rescale (`bool`, *optional*, defaults to `True`):
@@ -739,8 +747,9 @@ class YolosImageProcessor(BaseImageProcessor):
         If `pad_size` is provided, the image will be padded to the specified dimensions. Otherwise, the image
         will be padded to the maximum height and width of the batch.
     pad_size (`Dict[str, int]`, *optional*):
-        The size `{"height": int, "width" int}` to pad the images to. Must be larger than any
-        image in `images`. If not provided, the images will be padded to the largest height and width in the batch.
+        The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
+        provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+        height and width in the batch.
     """

     model_input_names = ["pixel_values", "pixel_mask"]

@@ -906,8 +915,15 @@ def resize(
             image (`np.ndarray`):
                 Image to resize.
             size (`Dict[str, int]`):
-                Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or
-                `height` and `width`.
+                Size of the image's `(height, width)` dimensions after resizing. Available options are:
+                - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+                  The aspect ratio is NOT preserved.
+                - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to the maximum size that
+                  preserves the aspect ratio, keeping the shortest edge less than or equal to `shortest_edge` and
+                  the longest edge less than or equal to `longest_edge`.
+                - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size that
+                  preserves the aspect ratio, keeping the height less than or equal to `max_height` and the width
+                  less than or equal to `max_width`.
             resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
                 Resampling filter to use if resizing the image.
             data_format (`str` or `ChannelDimension`, *optional*):
@@ -1121,8 +1137,9 @@ def pad(
                 bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
                 format, the bounding boxes will not be updated.
             pad_size (`Dict[str, int]`, *optional*):
-                The size `{"height": int, "width" int}` to pad the images to. Must be larger than any
-                image in `images`. If not provided, the images will be padded to the largest height and width in the batch.
+                The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
+                provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+                height and width in the batch.
""" pad_size = pad_size if pad_size is not None else self.pad_size if pad_size is not None: @@ -1212,7 +1229,15 @@ def preprocess( do_resize (`bool`, *optional*, defaults to self.do_resize): Whether to resize the image. size (`Dict[str, int]`, *optional*, defaults to self.size): - Size of the image after resizing. + Size of the image's `(height, width)` dimensions after resizing. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. resample (`PILImageResampling`, *optional*, defaults to self.resample): Resampling filter to use when resizing the image. do_rescale (`bool`, *optional*, defaults to self.do_rescale): @@ -1246,8 +1271,9 @@ def preprocess( - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. Must be larger than any - image in `images`. If not provided, the images will be padded to the largest height and width in the batch. + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. """ if "pad_and_return_pixel_mask" in kwargs: logger.warning_once( From bd018084723d8e9f564fbc99a8c2d656740ed2f8 Mon Sep 17 00:00:00 2001 From: Pavel Iakubovskii Date: Wed, 15 May 2024 17:28:15 +0000 Subject: [PATCH 6/6] Fixup --- .../conditional_detr/image_processing_conditional_detr.py | 6 +++--- .../deformable_detr/image_processing_deformable_detr.py | 6 +++--- src/transformers/models/deta/image_processing_deta.py | 6 +++--- src/transformers/models/detr/image_processing_detr.py | 6 +++--- .../grounding_dino/image_processing_grounding_dino.py | 6 +++--- src/transformers/models/yolos/image_processing_yolos.py | 6 +++--- 6 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py index 9cf170ee241b..174d6280504d 100644 --- a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py +++ b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py @@ -841,7 +841,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor): If `pad_size` is provided, the image will be padded to the specified dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest height and width in the batch. 
""" @@ -1232,7 +1232,7 @@ def pad( bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)` format, the bounding boxes will not be updated. pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest height and width in the batch. """ @@ -1370,7 +1370,7 @@ def preprocess( - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest height and width in the batch. """ diff --git a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py index d0394282f64c..f9440714c9e2 100644 --- a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py +++ b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py @@ -839,7 +839,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor): If `pad_size` is provided, the image will be padded to the specified dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest height and width in the batch. """ @@ -1230,7 +1230,7 @@ def pad( bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)` format, the bounding boxes will not be updated. pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest height and width in the batch. """ @@ -1368,7 +1368,7 @@ def preprocess( - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest height and width in the batch. 
""" diff --git a/src/transformers/models/deta/image_processing_deta.py b/src/transformers/models/deta/image_processing_deta.py index cfbf2aa390c7..c4a314ea7c06 100644 --- a/src/transformers/models/deta/image_processing_deta.py +++ b/src/transformers/models/deta/image_processing_deta.py @@ -548,7 +548,7 @@ class DetaImageProcessor(BaseImageProcessor): If `pad_size` is provided, the image will be padded to the specified dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest height and width in the batch. """ @@ -879,7 +879,7 @@ def pad( bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)` format, the bounding boxes will not be updated. pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest height and width in the batch. """ @@ -1016,7 +1016,7 @@ def preprocess( - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest height and width in the batch. """ diff --git a/src/transformers/models/detr/image_processing_detr.py b/src/transformers/models/detr/image_processing_detr.py index 3f17bfb3a974..76d00e900374 100644 --- a/src/transformers/models/detr/image_processing_detr.py +++ b/src/transformers/models/detr/image_processing_detr.py @@ -824,7 +824,7 @@ class DetrImageProcessor(BaseImageProcessor): If `pad_size` is provided, the image will be padded to the specified dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest height and width in the batch. """ @@ -1202,7 +1202,7 @@ def pad( bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)` format, the bounding boxes will not be updated. pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest height and width in the batch. 
""" @@ -1339,7 +1339,7 @@ def preprocess( - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest height and width in the batch. """ diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py index 2c9c4c4bc693..32ca0b05d7d0 100644 --- a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py @@ -847,7 +847,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor): If `pad_size` is provided, the image will be padded to the specified dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest height and width in the batch. """ @@ -1238,7 +1238,7 @@ def pad( bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)` format, the bounding boxes will not be updated. pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest height and width in the batch. """ @@ -1376,7 +1376,7 @@ def preprocess( - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest height and width in the batch. """ diff --git a/src/transformers/models/yolos/image_processing_yolos.py b/src/transformers/models/yolos/image_processing_yolos.py index 0548fa1a8ea4..1b0969f5c89d 100644 --- a/src/transformers/models/yolos/image_processing_yolos.py +++ b/src/transformers/models/yolos/image_processing_yolos.py @@ -747,7 +747,7 @@ class YolosImageProcessor(BaseImageProcessor): If `pad_size` is provided, the image will be padded to the specified dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size provided for preprocessing. 
         provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
         height and width in the batch.
     """
@@ -1137,7 +1137,7 @@ def pad(
                 bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
                 format, the bounding boxes will not be updated.
             pad_size (`Dict[str, int]`, *optional*):
-                The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size 
+                The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
                 provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
                 height and width in the batch.
         """
@@ -1271,7 +1271,7 @@ def preprocess(
                 - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                 - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
             pad_size (`Dict[str, int]`, *optional*):
-                The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size 
+                The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
                 provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
                 height and width in the batch.
         """
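
The resize rules documented in the docstrings above can be summarized in a few lines of plain Python. This is an illustrative sketch, not the library's implementation: the helper names are invented here, and truncation to int is assumed for the final rounding.

from typing import Tuple


def size_for_max_height_width(height: int, width: int, max_height: int, max_width: int) -> Tuple[int, int]:
    # Scale by the smaller of the two ratios so both constraints hold and the
    # aspect ratio is preserved; one edge lands exactly on its limit.
    ratio = min(max_height / height, max_width / width)
    return int(height * ratio), int(width * ratio)


def size_for_shortest_longest_edge(height: int, width: int, shortest_edge: int, longest_edge: int) -> Tuple[int, int]:
    # Scale the shortest side to shortest_edge, then shrink further if the
    # longest side would exceed longest_edge.
    ratio = shortest_edge / min(height, width)
    if max(height, width) * ratio > longest_edge:
        ratio = longest_edge / max(height, width)
    return int(height * ratio), int(width * ratio)


# A 480x640 image under {"max_height": 240, "max_width": 240}: min(0.5, 0.375) = 0.375 -> (180, 240).
assert size_for_max_height_width(480, 640, max_height=240, max_width=240) == (180, 240)
# The default {"shortest_edge": 800, "longest_edge": 1333} on the same image -> (800, 1066).
assert size_for_shortest_longest_edge(480, 640, shortest_edge=800, longest_edge=1333) == (800, 1066)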
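
A usage sketch of the two new knobs together, assuming the post-patch API exactly as documented in this series; the shapes in the comments follow from the documented behavior rather than from a verified run.

import numpy as np
from transformers import DetrImageProcessor

processor = DetrImageProcessor(
    size={"max_height": 480, "max_width": 640},  # fit each image inside 480x640, preserving aspect ratio
    pad_size={"height": 480, "width": 640},      # then pad bottom/right so every image is exactly 480x640
)

images = [
    np.zeros((3, 360, 640), dtype=np.uint8),  # already at the width limit, only padded vertically
    np.zeros((3, 480, 480), dtype=np.uint8),  # at the height limit, only padded horizontally
]
inputs = processor(images=images, return_tensors="np")

# With pad_size fixed, the batch shape no longer depends on the largest image in the batch.
print(inputs["pixel_values"].shape)  # expected: (2, 3, 480, 640)
print(inputs["pixel_mask"].shape)    # expected: (2, 480, 640)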
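
For contrast, the documented default when `pad_size` is omitted: with `do_pad=True` the batch is padded to the largest height and the largest width found in the batch. Again a sketch under the same assumptions.

import numpy as np
from transformers import DetrImageProcessor

processor = DetrImageProcessor(do_resize=False, do_pad=True)  # isolate the padding step

images = [
    np.zeros((3, 300, 500), dtype=np.uint8),
    np.zeros((3, 400, 200), dtype=np.uint8),
]
inputs = processor(images=images, return_tensors="np")

# Largest height is 400 and largest width is 500, so both images are padded to 400x500;
# pixel_mask marks the valid (non-padded) region of each image.
print(inputs["pixel_values"].shape)  # expected: (2, 3, 400, 500)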