amyeroberts · amyeroberts · Aug 17, 2022 · Jul 27, 2022 · Jul 27, 2022 · Jul 27, 2022
diff --git a/docs/source/en/internal/image_processing_utils.mdx b/docs/source/en/internal/image_processing_utils.mdx
@@ -19,10 +19,13 @@ Most of those are only useful if you are studying the code of the image processo
 
 ## Image Transformations
 
-[[autodoc]] image_transforms.to_pil_image
+[[autodoc]] image_transforms.rescale
 
 [[autodoc]] image_transforms.resize
 
+[[autodoc]] image_transforms.to_pil_image
+
+
 
 ## ImageProcessorMixin
 

diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
@@ -635,7 +635,7 @@
     ]
 else:
     _import_structure["image_processing_utils"] = ["ImageProcessorMixin"]
-    _import_structure["image_transforms"] = ["resize", "to_pil_image"]
+    _import_structure["image_transforms"] = ["rescale", "resize", "to_pil_image"]
     _import_structure["image_utils"] = ["ImageFeatureExtractionMixin"]
     _import_structure["models.beit"].append("BeitFeatureExtractor")
     _import_structure["models.clip"].append("CLIPFeatureExtractor")
@@ -3372,7 +3372,7 @@
         from .utils.dummy_vision_objects import *
     else:
         from .image_processing_utils import ImageProcessorMixin
-        from .image_transforms import resize, to_pil_image
+        from .image_transforms import rescale, resize, to_pil_image
         from .image_utils import ImageFeatureExtractionMixin
         from .models.beit import BeitFeatureExtractor
         from .models.clip import CLIPFeatureExtractor, CLIPProcessor

diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py
@@ -13,13 +13,43 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from .feature_extraction_utils import BatchFeature as BaseBatchFeature
 from .feature_extraction_utils import FeatureExtractionMixin
 from .utils import logging
 
 
 logger = logging.get_logger(__name__)
 
 
+# TODO: Move BatchFeature to be imported by both feature_extraction_utils and image_processing_utils
+# We override the class string here, but logic is the same.
+class BatchFeature(BaseBatchFeature):
+    r"""
+    Holds the output of the image processor specific `__call__` methods.
+
+    This class is derived from a python dictionary and can be used as a dictionary.
+
+    Args:
+        data (`dict`):
+            Dictionary of lists/arrays/tensors returned by the __call__/pad methods ('pixel_values', 'attention_mask',
+            etc.).
+        tensor_type (`Union[None, str, TensorType]`, *optional*):
+            You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at
+            initialization.
+    """
+
+
 # We use aliasing whilst we phase out the old API. Once feature extractors for vision models
 # are deprecated, ImageProcessor mixin will be implemented. Any shared logic will be abstracted out.
 ImageProcessorMixin = FeatureExtractionMixin
+
+
+class BaseImageProcessor(ImageProcessorMixin):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def __call__(self, images, **kwargs) -> BatchFeature:
+        return self.preprocess(images, **kwargs)
+
+    def preprocess(self, images, **kwargs) -> BatchFeature:
+        raise NotImplementedError("Each image processor must implement its own preprocess method")
diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py
@@ -64,15 +64,42 @@ def to_channel_dimension_format(image: np.ndarray, channel_dim: Union[ChannelDim
     raise ValueError("Unsupported channel dimension format: {}".format(channel_dim))
 
 
+def rescale(
+    image: np.ndarray, scale: Union[float, int] = 255, data_format: Optional[ChannelDimension] = None, dtype=np.float32
+) -> np.ndarray:
+    """
+    Rescales `image` by `scale`.
+
+    Args:
+        image (`np.ndarray`):
+            The image to rescale.
+        scale (`float` or `int`, *optional*, defaults to 255):
+            The scale to use for rescaling the image.
+        data_format (`ChannelDimension`, *optional*):
+            The channel dimension format of the image. If not provided, it will be the same as the input image.
+        dtype (`np.dtype`, *optional*, defaults to `np.float32`):
+            The dtype of the output image. Defaults to `np.float32`. Used for backwards compatibility with feature
+            extractors.
+
+    Returns:
+        image: A rescaled np.ndarray image.
+    """
+    rescaled_image = image * scale
+    if data_format is not None:
+        rescaled_image = to_channel_dimension_format(rescaled_image, data_format)
+    rescaled_image = rescaled_image.astype(dtype)
+    return rescaled_image
+
+
 def to_pil_image(
-    image: Union[np.ndarray, PIL.Image.Image, "torch.Tensor", "tf.Tensor"], rescale=None
+    image: Union[np.ndarray, PIL.Image.Image, "torch.Tensor", "tf.Tensor", "jnp.Tensor"], do_rescale=None
 ) -> PIL.Image.Image:
     """
     Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if
     needed.
 
     Args:
-        image (`PIL.Image.Image` or `numpy.ndarray` or `torch.Tensor`):
+        image (`PIL.Image.Image`, `numpy.ndarray`, `torch.Tensor`, `tf.Tensor`):
             The image to convert to the PIL Image format.
         rescale (`bool`, *optional*):
             Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will default
@@ -87,15 +114,15 @@ def to_pil_image(
         image = np.array(image)
 
     if not isinstance(image, np.ndarray):
-        raise ValueError("Input image must be of type PIL.Image.Image, numpy.ndarray or torch.Tensor")
+        raise ValueError("Input image type not supported: {}".format(type(image)))
 
     # If the channel as been moved to first dim, we put it back at the end.
     image = to_channel_dimension_format(image, ChannelDimension.LAST)
 
     # PIL.Image can only store uint8 values, so we rescale the image to be between 0 and 255 if needed.
-    rescale = isinstance(image.flat[0], float) if rescale is None else rescale
-    if rescale:
-        rescale = image * 255
+    do_rescale = isinstance(image.flat[0], float) if do_rescale is None else do_rescale
+    if do_rescale:
+        image = rescale(image, 255)
     image = image.astype(np.uint8)
     return PIL.Image.fromarray(image)
 
@@ -186,8 +213,7 @@ def resize(
         data_format (`ChannelDimension`, *optional*, defaults to `None`):
             The channel dimension format of the output image. If `None`, will use the inferred format from the input.
         return_numpy (`bool`, *optional*, defaults to `True`):
-            Whether or not to return the resized image as a numpy array. If False a PIL.Image.Image object is
-            returned.
+            Whether or not to return the resized image as a numpy array. If False a PIL.Image.Image object is returned.
 
     Returns:
         image: A resized np.ndarray.

diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py
@@ -29,14 +29,19 @@
     IMAGENET_STANDARD_MEAN,
     IMAGENET_STANDARD_STD,
 )
-from .utils.generic import ExplicitEnum, _is_jax, _is_tensorflow, _is_torch
+from .utils.generic import ExplicitEnum, _is_jax, _is_tensorflow, _is_torch, to_numpy
 
 
 ImageInput = Union[
     PIL.Image.Image, np.ndarray, "torch.Tensor", List[PIL.Image.Image], List[np.ndarray], List["torch.Tensor"]  # noqa
 ]
 
 
+class ChannelDimension(ExplicitEnum):
+    FIRST = "channels_first"
+    LAST = "channels_last"
+
+
 def is_torch_tensor(obj):
     return _is_torch(obj) if is_torch_available() else False
 
@@ -49,9 +54,29 @@ def is_jax_tensor(obj):
     return _is_jax(obj) if is_flax_available() else False
 
 
-class ChannelDimension(ExplicitEnum):
-    FIRST = "channels_first"
-    LAST = "channels_last"
+def is_valid_image(img):
+    return (
+        isinstance(img, (PIL.Image.Image, np.ndarray))
+        or is_torch_tensor(img)
+        or is_tf_tensor(img)
+        or is_jax_tensor(img)
+    )
+
+
+def valid_images(imgs):
+    return all(is_valid_image(img) for img in imgs)
+
+
+def is_batched(img):
+    if isinstance(img, (list, tuple)):
+        return is_valid_image(img[0])
+    return False
+
+
+def to_numpy_array(img) -> np.ndarray:
+    if isinstance(img, PIL.Image.Image):
+        return np.array(img)
+    return to_numpy(img)
 
 
 def infer_channel_dimension_format(image: np.ndarray) -> ChannelDimension:

diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py
@@ -0,0 +1,180 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for GLPN."""
+
+from typing import List, Optional, Union
+
+import numpy as np
+import PIL.Image
+
+from transformers.utils.generic import TensorType
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature
+from ...image_transforms import rescale, resize, to_channel_dimension_format
+from ...image_utils import ChannelDimension, get_image_size, is_batched, to_numpy_array, valid_images
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class GLPNImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a GLPN image processor.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Set the class default for the `do_resize` parameter. Controls whether to resize the image's (height, width)
+            dimensions, rounding them down to the closest multiple of `size_divisor`.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Set the class default for the `do_rescale` parameter. Controls whether or not to apply the scaling factor
+            (to make pixel values floats between 0. and 1.).
+        size_divisor (`int`, *optional*, defaults to 32):
+            Set the class default for the `size_divisor` parameter. When `do_resize` is `True`, images are resized so
+            their height and width are rounded down to the closest multiple of `size_divisor`.
+        resample (`PIL.Image.Resampling`, *optional*, defaults to `PIL.Image.Resampling.BILINEAR`):
+            Set the class default for `resample`. Defines the resampling filter to use if resizing the image.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self, do_resize=True, do_rescale=True, size_divisor=32, resample=PIL.Image.Resampling.BILINEAR, **kwargs
+    ) -> None:
+        self.do_resize = do_resize
+        self.do_rescale = do_rescale
+        self.size_divisor = size_divisor
+        self.resample = resample
+        super().__init__(**kwargs)
+
+    def resize(
+        self,
+        image: np.ndarray,
+        size_divisor: int,
+        resample: PIL.Image.Resampling,
+        data_format: Optional[ChannelDimension] = None,
+        **kwargs
+    ) -> np.ndarray:
+        """
+        Resize the image, rounding the (height, width) dimensions down to the closest multiple of size_divisor.
+
+        If the image is of dimension (3, 260, 170) and size_divisor is 32, the image will be resized to (3, 256, 160).
+
+        Args:
+            image (`np.ndarray`):
+                The image to resize.
+            size_divisor (`int`):
+                The image is resized so its height and width are rounded down to the closest multiple of
+                `size_divisor`.
+            resample (`PIL.Image.Resampling`):
+                Resampling filter to use when resizing the image.
+            data_format (`ChannelDimension`, *optional*):
+                The channel dimension format for the output image. If `None`, the channel dimension format of the input
+                image is used. Can be one of:
+                - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+        """
+        height, width = get_image_size(image)
+        # Rounds the height and width down to the closest multiple of size_divisor
+        new_h = height // size_divisor * size_divisor
+        new_w = width // size_divisor * size_divisor
+        image = resize(image, (new_h, new_w), resample=resample, data_format=data_format, **kwargs)
+        return image
+
+    def rescale(
+        self, image: np.ndarray, scale: Union[int, float], data_format: Optional[ChannelDimension] = None, **kwargs
+    ) -> np.ndarray:
+        """
+        Rescale the image by the given scaling factor `scale`.
+
+        Args:
+            image (`np.ndarray`):
+                The image to rescale.
+            scale (`int` or `float`):
+                The scaling factor to rescale pixel values by.
+            data_format (`ChannelDimension`, *optional*):
+                The channel dimension format for the output image. If `None`, the channel dimension format of the input
+                image is used. Can be one of:
+                - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+        """
+        return rescale(image=image, scale=scale, data_format=data_format, **kwargs)
+
+    def preprocess(
+        self,
+        images: Union["PIL.Image.Image", TensorType, List["PIL.Image.Image"], List[TensorType]],
+        do_resize: bool = None,
+        do_rescale: bool = None,
+        size_divisor: int = None,
+        resample: PIL.Image.Resampling = None,
+        return_tensors: Optional[Union[TensorType, str]] = None,
+        data_format: ChannelDimension = ChannelDimension.FIRST,
+        **kwargs
+    ) -> BatchFeature:
+        """
+        Preprocess the given images.
+
+        Args:
+            images (`PIL.Image.Image` or `TensorType` or `List[np.ndarray]` or `List[TensorType]`):
+                The image or images to preprocess.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the input such that the (height, width) dimensions are a multiple of `size_divisor`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.).
+            size_divisor (`int`, *optional*, defaults to `self.size_divisor`):
+                When `do_resize` is `True`, images are resized so their height and width are rounded down to the
+                closest multiple of `size_divisor`.
+            resample (`int`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. This can be one of the enum `PIL.Image.Resampling`,
+                Only has an effect if `do_resize` is set to `True`.
+            return_tensors (`str`, *optional*, defaults to `None`):
+                The type of tensors to return. Can be one of:
+                    - `None`: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                    - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                    - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        size_divisor = size_divisor if size_divisor is not None else self.size_divisor
+        resample = resample if resample is not None else self.resample
+
+        if do_resize and size_divisor is None:
+            raise ValueError("size_divisor is required for resizing")
+
+        if not is_batched(images):
+            images = [images]
+
+        if not valid_images(images):
+            raise ValueError("Invalid image(s)")
+
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(img) for img in images]
+
+        if do_resize:
+            images = [self.resize(image, size_divisor=size_divisor, resample=resample) for image in images]
+
+        if do_rescale:
+            images = [self.rescale(image, scale=1 / 255) for image in images]
+
+        images = [to_channel_dimension_format(image, data_format) for image in images]
+
+        data = {"pixel_values": images}
+        return BatchFeature(data=data, tensor_type=return_tensors)
diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py
@@ -10,6 +10,10 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["vision"])
 
 
+def rescale(*args, **kwargs):
+    requires_backends(rescale, ["vision"])
+
+
 def resize(*args, **kwargs):
     requires_backends(resize, ["vision"])