From 54aed8b374dd0766abe0dc762d4bbe543afdcb4e Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Wed, 27 Jul 2022 16:12:56 +0100 Subject: [PATCH 01/31] Base processor skeleton --- src/transformers/image_processing_utils.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index e053f4f486a7..908216cd4634 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -424,3 +424,14 @@ def register_for_auto_class(cls, auto_class="AutoImageProcessor"): ImageProcessorMixin.push_to_hub.__doc__ = ImageProcessorMixin.push_to_hub.__doc__.format( object="image processor", object_class="AutoImageProcessor", object_files="image processor file" ) + + +class BaseImageProcessor(ImageProcessorMixin): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def __call__(self, *args, **kwargs): + return self.preprocess(*args, **kwargs) + + def preprocess(self, *args, **kwargs): + raise NotImplementedError("Each image processor must implement its own preprocess method") From ba55c8996ae83e3f066ac9cd9e557e9ad66e1a74 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Wed, 27 Jul 2022 17:30:59 +0100 Subject: [PATCH 02/31] BatchFeature for packaging image processor outputs --- src/transformers/image_processing_utils.py | 164 ++++++++++++++++++++- 1 file changed, 159 insertions(+), 5 deletions(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 908216cd4634..32477c7f4f77 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -16,30 +16,184 @@ import copy import os import json -from typing import Any, Dict, Tuple, Union +from collections import UserDict +from typing import Any, Dict, Optional, Tuple, Union import numpy as np from requests import HTTPError from .dynamic_module_utils import custom_object_save from .utils import ( - 
IMAGE_PROCESSOR_NAME, - PushToHubMixin, - logging, HUGGINGFACE_CO_RESOLVE_ENDPOINT, + IMAGE_PROCESSOR_NAME, EntryNotFoundError, + PushToHubMixin, RepositoryNotFoundError, RevisionNotFoundError, + TensorType, cached_path, copy_func, hf_bucket_url, + is_flax_available, is_remote_url, - is_offline_mode + is_torch_available, + is_tf_available, + is_offline_mode, + logging, + torch_required, ) +from .utils.generic import _is_jax, _is_numpy, _is_torch_device logger = logging.get_logger(__name__) +class BatchFeature(UserDict): + r""" + Holds the output of the image processor specific `__call__` methods. + + This class is derived from a python dictionary and can be used as a dictionary. + + Args: + data (`dict`): + Dictionary of lists/arrays/tensors returned by the __call__/pad methods ('input_values', 'attention_mask', + etc.). + tensor_type (`Union[None, str, TensorType]`, *optional*): + You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at + initialization. + """ + + def __init__(self, data: Optional[Dict[str, Any]] = None, tensor_type: Union[None, str, TensorType] = None): + super().__init__(data) + self.convert_to_tensors(tensor_type=tensor_type) + + # Copied from transformers.tokenization_utils_base.BatchEncoding.__getitem__ + def __getitem__(self, item: str) -> Any: + """ + If the key is a string, returns the value of the dict associated to `key` ('input_values', 'attention_mask', + etc.). 
+ """ + if isinstance(item, str): + return self.data[item] + else: + raise KeyError("Indexing with integers is not available when using Python based feature extractors") + + # Copied from transformers.tokenization_utils_base.BatchEncoding.__getattr__ + def __getattr__(self, item: str): + try: + return self.data[item] + except KeyError: + raise AttributeError + + # Copied from transformers.feature_extraction_utils.BatchFeature.__getstate__ + def __getstate__(self): + return {"data": self.data} + + # Copied from transformers.feature_extraction_utils.BatchFeature.__setstate__ + def __setstate__(self, state): + if "data" in state: + self.data = state["data"] + + # Copied from transformers.tokenization_utils_base.BatchEncoding.keys + def keys(self): + return self.data.keys() + + # Copied from transformers.tokenization_utils_base.BatchEncoding.values + def values(self): + return self.data.values() + + # Copied from transformers.tokenization_utils_base.BatchEncoding.items + def items(self): + return self.data.items() + + # Copied from transformers.feature_extraction_utils.BatchFeature.convert_to_tensors + def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None): + """ + Convert the inner content to tensors. + + Args: + tensor_type (`str` or [`~utils.TensorType`], *optional*): + The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.TensorType`]. If + `None`, no modification is done. + """ + if tensor_type is None: + return self + + # Convert to TensorType + if not isinstance(tensor_type, TensorType): + tensor_type = TensorType(tensor_type) + + # Get a function reference for the correct framework + if tensor_type == TensorType.TENSORFLOW: + if not is_tf_available(): + raise ImportError( + "Unable to convert output to TensorFlow tensors format, TensorFlow is not installed." 
+ ) + import tensorflow as tf + + as_tensor = tf.constant + is_tensor = tf.is_tensor + elif tensor_type == TensorType.PYTORCH: + if not is_torch_available(): + raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.") + import torch + + def as_tensor(value): + if isinstance(value, (list, tuple)) and len(value) > 0 and isinstance(value[0], np.ndarray): + value = np.array(value) + return torch.tensor(value) + + is_tensor = torch.is_tensor + elif tensor_type == TensorType.JAX: + if not is_flax_available(): + raise ImportError("Unable to convert output to JAX tensors format, JAX is not installed.") + import jax.numpy as jnp # noqa: F811 + + as_tensor = jnp.array + is_tensor = _is_jax + else: + as_tensor = np.asarray + is_tensor = _is_numpy + + # Do the tensor conversion in batch + for key, value in self.items(): + try: + if not is_tensor(value): + tensor = as_tensor(value) + + self[key] = tensor + except: # noqa E722 + if key == "overflowing_values": + raise ValueError("Unable to create tensor returning overflowing values of different lengths. ") + raise ValueError( + "Unable to create tensor, you should probably activate padding " + "with 'padding=True' to have batched tensors with the same length." + ) + + return self + + @torch_required + # Copied from transformers.tokenization_utils_base.BatchEncoding.to with BatchEncoding->BatchFeature + def to(self, device: Union[str, "torch.device"]) -> "BatchFeature": + """ + Send all values to device by calling `v.to(device)` (PyTorch only). + + Args: + device (`str` or `torch.device`): The device to put the tensors on. + + Returns: + [`BatchFeature`]: The same instance after modification. 
+ """ + + # This check catches things like APEX blindly calling "to" on all inputs to a module + # Otherwise it passes the casts down and casts the LongTensor containing the token idxs + # into a HalfTensor + if isinstance(device, str) or _is_torch_device(device) or isinstance(device, int): + self.data = {k: v.to(device=device) for k, v in self.data.items()} + else: + logger.warning(f"Attempting to cast a BatchFeature to type {str(device)}. This is not supported.") + return self + + class ImageProcessorMixin(PushToHubMixin): """ Image processor mixin used to provide saving/loading functionality From 4b430d4de00b1ddb21882eaddce5e2f5c61842f7 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Wed, 27 Jul 2022 19:38:50 +0100 Subject: [PATCH 03/31] Initial image processor for GLPN --- src/transformers/image_processing_utils.py | 24 +++--- src/transformers/image_transforms.py | 26 +++++-- src/transformers/image_utils.py | 61 +++++++++++++-- .../models/glpn/image_processing_glpn.py | 76 +++++++++++++++++++ 4 files changed, 165 insertions(+), 22 deletions(-) create mode 100644 src/transformers/models/glpn/image_processing_glpn.py diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 32477c7f4f77..2f1377b3773f 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -23,6 +23,7 @@ from requests import HTTPError from .dynamic_module_utils import custom_object_save +from .image_utils import ImageType from .utils import ( HUGGINGFACE_CO_RESOLVE_ENDPOINT, IMAGE_PROCESSOR_NAME, @@ -105,25 +106,24 @@ def values(self): def items(self): return self.data.items() - # Copied from transformers.feature_extraction_utils.BatchFeature.convert_to_tensors - def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None): + def convert_to_tensors(self, tensor_type: Optional[Union[str, ImageType]] = None): """ Convert the inner content to tensors. 
Args: - tensor_type (`str` or [`~utils.TensorType`], *optional*): - The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.TensorType`]. If + tensor_type (`str` or [`~utils.ImageType`], *optional*): + The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.ImageType`]. If `None`, no modification is done. """ if tensor_type is None: return self # Convert to TensorType - if not isinstance(tensor_type, TensorType): - tensor_type = TensorType(tensor_type) + if not isinstance(tensor_type, ImageType): + tensor_type = ImageType(tensor_type) # Get a function reference for the correct framework - if tensor_type == TensorType.TENSORFLOW: + if tensor_type == ImageType.TENSORFLOW: if not is_tf_available(): raise ImportError( "Unable to convert output to TensorFlow tensors format, TensorFlow is not installed." @@ -132,7 +132,7 @@ def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = Non as_tensor = tf.constant is_tensor = tf.is_tensor - elif tensor_type == TensorType.PYTORCH: + elif tensor_type == ImageType.PYTORCH: if not is_torch_available(): raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.") import torch @@ -143,7 +143,7 @@ def as_tensor(value): return torch.tensor(value) is_tensor = torch.is_tensor - elif tensor_type == TensorType.JAX: + elif tensor_type == ImageType.JAX: if not is_flax_available(): raise ImportError("Unable to convert output to JAX tensors format, JAX is not installed.") import jax.numpy as jnp # noqa: F811 @@ -584,8 +584,8 @@ class BaseImageProcessor(ImageProcessorMixin): def __init__(self, **kwargs): super().__init__(**kwargs) - def __call__(self, *args, **kwargs): - return self.preprocess(*args, **kwargs) + def __call__(self, images, **kwargs) -> BatchFeature: + return self.preprocess(images, **kwargs) - def preprocess(self, *args, **kwargs): + def preprocess(self, images, **kwargs) -> BatchFeature: raise 
NotImplementedError("Each image processor must implement its own preprocess method") diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 7585aff76432..55d788d47531 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -28,10 +28,26 @@ ) +def rescale(image: np.ndarray, scale: Union[float, int] = 255) -> np.ndarray: + """ + Rescales `image` by `scale`. + + Args: + image (`np.ndarray``): + The image to rescale. + scale (`float`, `int`): + The scale to use for rescaling the image. + + Returns: + image: A rescaled np.ndarray image. + """ + return image * scale + + def to_pil_image( image: Union[np.ndarray, PIL.Image.Image, "torch.Tensor", "tf.Tensor"], channel_dim: Optional[ChannelDimension] = None, - rescale=None + do_rescale: Optional[bool] = None ) -> PIL.Image.Image: """ Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if @@ -61,9 +77,9 @@ def to_pil_image( image = image.transpose((1, 2, 0)) # PIL.Image can only store uint8 values, so we rescale the image to be between 0 and 255 if needed. - rescale = isinstance(image.flat[0], float) if rescale is None else rescale - if rescale: - rescale = image * 255 + do_rescale = isinstance(image.flat[0], float) if do_rescale is None else do_rescale + if do_rescale: + image = rescale(image, 255) image = image.astype(np.uint8) return PIL.Image.fromarray(image) @@ -107,7 +123,7 @@ def get_resize_output_image_size( return (new_short, new_long) if width <= height else (new_long, new_short) -def resize(image, size: Tuple[int, int], resample=PIL.Image.BILINEAR): +def resize(image, size: Tuple[int, int], resample=PIL.Image.Resampling.BILINEAR): """ Resizes `image`. Enforces conversion of input to PIL.Image. 
diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index e23321be4780..e5afb3122bcd 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -23,8 +23,8 @@ import requests -from .utils import is_torch_available, is_tf_available, is_jax_available -from .utils.generic import _is_torch, _is_tensorflow, _is_jax +from .utils import TensorType, is_torch_available, is_tf_available, is_jax_available +from .utils.generic import ExplicitEnum, _is_torch, _is_tensorflow, _is_jax, _is_numpy, to_numpy IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406] @@ -36,6 +36,21 @@ PIL.Image.Image, np.ndarray, "torch.Tensor", List[PIL.Image.Image], List[np.ndarray], List["torch.Tensor"] # noqa ] +class ChannelDimension(enum.Enum): + FIRST = 1 + LAST = 3 + + +class ImageType(ExplicitEnum): + """ + Possible image data formats that can be fed into an image processor + """ + PYTORCH = "pt" + TENSORFLOW = "tf" + NUMPY = "np" + JAX = "jax" + PIL = "pillow" + def is_torch_tensor(obj): return _is_torch(obj) if is_torch_available() else False @@ -49,9 +64,45 @@ def is_jax_tensor(obj): return _is_jax(obj) if is_jax_available() else False -class ChannelDimension(enum.Enum): - FIRST = 1 - LAST = 3 +def is_valid_image(img): + return ( + isinstance(img, (PIL.Image.Image, np.ndarray)) + or is_torch_tensor(img) + or is_tf_tensor(img) + or is_jax_tensor(img) + ) + + +def valid_images(imgs): + return all(is_valid_image(img) for img in imgs) + + +def is_batched(img): + if isinstance(img, (list, tuple)): + return is_valid_image(img[0]) + return False + + +def get_image_type(obj) -> TensorType: + if is_torch_tensor(obj): + return TensorType.TORCH + elif is_tf_tensor(obj): + return TensorType.TF + elif is_jax_tensor(obj): + return TensorType.JAX + elif _is_numpy(obj): + return TensorType.NUMPY + elif isinstance(obj, PIL.Image.Image): + return TensorType.PIL + else: + raise ValueError("Could not infer tensor type") + + +def to_numpy_array(img) -> np.ndarray: + 
input_type = get_image_type(img) + if input_type == ImageType.PIL: + return np.array(img) + return to_numpy(img) def infer_channel_dimension(image: np.ndarray) -> ChannelDimension: diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py new file mode 100644 index 000000000000..9942845e5e45 --- /dev/null +++ b/src/transformers/models/glpn/image_processing_glpn.py @@ -0,0 +1,76 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Image processor class for GLPN.""" + +from tkinter import Image +from typing import Union + +from numpy import np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature +from ...image_transforms import resize, rescale +from ...image_utils import ImageType, is_batched, to_numpy_array, valid_images, get_image_size +from ...utils import logging + +logger = logging.get_logger(__name__) + + +class GLPNImageProcessor(BaseImageProcessor): + def __init__(self, do_resize=True, do_rescale=True, size_divisor=32, resample=Image.Resampling.BILINEAR, **kwargs) -> None: + self.do_resize = do_resize + self.do_rescale = do_rescale + self.size_divisor = size_divisor + self.resample = resample + super().__init__(**kwargs) + + def resize(self, image: np.ndarray, size_divisor: Union[int, float], resample: Image.Resampling, **kwargs) -> np.ndarray: + height, width = get_image_size(image) + new_h = height // size_divisor * size_divisor + new_w = width // size_divisor * size_divisor + image = resize(image, (new_h, new_w), resample=resample, **kwargs) + return image + + def rescale(self, image: np.ndarray, scale: Union[int, float], **kwargs) -> np.ndarray: + return rescale(image, scale, **kwargs) + + def preprocess(self, images, do_resize=None, do_rescale=None, size_divisor=None, resample=None, return_tensors=None, **kwargs) -> BatchFeature: + do_resize = do_resize if do_resize is not None else self.do_resize + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + size_divisor = size_divisor if size_divisor is not None else self.size_divisor + resample = resample if resample is not None else self.resample + + # If a return type isn't specified, default to numpy arrays. 
+ return_tensors = ImageType.NUMPY if return_tensors is None else return_tensors + + if do_resize and size_divisor is None: + raise ValueError("size_divisor is required for resizing") + + if not is_batched(images): + images = [images] + + if not valid_images(images): + raise ValueError("Invalid image(s)") + + # All transformations expect numpy arrays. + images = [to_numpy_array(img) for img in images] + + if do_resize: + images = [self.resize(image, size_divisor=size_divisor, resample=resample) for image in images] + + if do_rescale: + images = [self.rescale(image) for image in images] + + data = {"pixel_values": images} + return BatchFeature(**data, return_tensors=return_tensors) From b1c8b59fbe165d8a3f542c280fe65486eef0c2ad Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Wed, 27 Jul 2022 20:02:51 +0100 Subject: [PATCH 04/31] REmove accidental import --- src/transformers/models/glpn/image_processing_glpn.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py index 9942845e5e45..bdc33e80ce9f 100644 --- a/src/transformers/models/glpn/image_processing_glpn.py +++ b/src/transformers/models/glpn/image_processing_glpn.py @@ -14,9 +14,9 @@ # limitations under the License. 
"""Image processor class for GLPN.""" -from tkinter import Image from typing import Union +import PIL.Image from numpy import np from ...image_processing_utils import BaseImageProcessor, BatchFeature @@ -28,14 +28,14 @@ class GLPNImageProcessor(BaseImageProcessor): - def __init__(self, do_resize=True, do_rescale=True, size_divisor=32, resample=Image.Resampling.BILINEAR, **kwargs) -> None: + def __init__(self, do_resize=True, do_rescale=True, size_divisor=32, resample=PIL.Image.Resampling.BILINEAR, **kwargs) -> None: self.do_resize = do_resize self.do_rescale = do_rescale self.size_divisor = size_divisor self.resample = resample super().__init__(**kwargs) - def resize(self, image: np.ndarray, size_divisor: Union[int, float], resample: Image.Resampling, **kwargs) -> np.ndarray: + def resize(self, image: np.ndarray, size_divisor: Union[int, float], resample: PIL.Image.Resampling, **kwargs) -> np.ndarray: height, width = get_image_size(image) new_h = height // size_divisor * size_divisor new_w = width // size_divisor * size_divisor From b9ce4a00399e5d177c1db07bab8ba47595abd12f Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 14:59:34 +0100 Subject: [PATCH 05/31] Import BatchFeature from feature_extraction_utils --- src/transformers/image_processing_utils.py | 173 +++------------------ src/transformers/image_transforms.py | 29 ++-- src/transformers/image_utils.py | 6 +- 3 files changed, 42 insertions(+), 166 deletions(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 32477c7f4f77..0e8b02c56b52 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -14,15 +14,16 @@ # limitations under the License. 
import copy -import os import json -from collections import UserDict -from typing import Any, Dict, Optional, Tuple, Union +import os +from typing import Any, Dict, Tuple, Union import numpy as np + from requests import HTTPError from .dynamic_module_utils import custom_object_save +from .feature_extraction_utils import BatchFeature as BaseBatchFeature from .utils import ( HUGGINGFACE_CO_RESOLVE_ENDPOINT, IMAGE_PROCESSOR_NAME, @@ -30,24 +31,21 @@ PushToHubMixin, RepositoryNotFoundError, RevisionNotFoundError, - TensorType, cached_path, copy_func, hf_bucket_url, - is_flax_available, - is_remote_url, - is_torch_available, - is_tf_available, is_offline_mode, + is_remote_url, logging, - torch_required, ) -from .utils.generic import _is_jax, _is_numpy, _is_torch_device + logger = logging.get_logger(__name__) -class BatchFeature(UserDict): +# TODO: Move BatchFeature to be imported by both feature_extraction_utils and image_processing_utils +# We override the class string here, but logic is the same. +class BatchFeature(BaseBatchFeature): r""" Holds the output of the image processor specific `__call__` methods. @@ -55,144 +53,13 @@ class BatchFeature(UserDict): Args: data (`dict`): - Dictionary of lists/arrays/tensors returned by the __call__/pad methods ('input_values', 'attention_mask', + Dictionary of lists/arrays/tensors returned by the __call__/pad methods ('pixel_values', 'attention_mask', etc.). tensor_type (`Union[None, str, TensorType]`, *optional*): You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at initialization. 
""" - def __init__(self, data: Optional[Dict[str, Any]] = None, tensor_type: Union[None, str, TensorType] = None): - super().__init__(data) - self.convert_to_tensors(tensor_type=tensor_type) - - # Copied from transformers.tokenization_utils_base.BatchEncoding.__getitem__ - def __getitem__(self, item: str) -> Any: - """ - If the key is a string, returns the value of the dict associated to `key` ('input_values', 'attention_mask', - etc.). - """ - if isinstance(item, str): - return self.data[item] - else: - raise KeyError("Indexing with integers is not available when using Python based feature extractors") - - # Copied from transformers.tokenization_utils_base.BatchEncoding.__getattr__ - def __getattr__(self, item: str): - try: - return self.data[item] - except KeyError: - raise AttributeError - - # Copied from transformers.feature_extraction_utils.BatchFeature.__getstate__ - def __getstate__(self): - return {"data": self.data} - - # Copied from transformers.feature_extraction_utils.BatchFeature.__setstate__ - def __setstate__(self, state): - if "data" in state: - self.data = state["data"] - - # Copied from transformers.tokenization_utils_base.BatchEncoding.keys - def keys(self): - return self.data.keys() - - # Copied from transformers.tokenization_utils_base.BatchEncoding.values - def values(self): - return self.data.values() - - # Copied from transformers.tokenization_utils_base.BatchEncoding.items - def items(self): - return self.data.items() - - # Copied from transformers.feature_extraction_utils.BatchFeature.convert_to_tensors - def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None): - """ - Convert the inner content to tensors. - - Args: - tensor_type (`str` or [`~utils.TensorType`], *optional*): - The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.TensorType`]. If - `None`, no modification is done. 
- """ - if tensor_type is None: - return self - - # Convert to TensorType - if not isinstance(tensor_type, TensorType): - tensor_type = TensorType(tensor_type) - - # Get a function reference for the correct framework - if tensor_type == TensorType.TENSORFLOW: - if not is_tf_available(): - raise ImportError( - "Unable to convert output to TensorFlow tensors format, TensorFlow is not installed." - ) - import tensorflow as tf - - as_tensor = tf.constant - is_tensor = tf.is_tensor - elif tensor_type == TensorType.PYTORCH: - if not is_torch_available(): - raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.") - import torch - - def as_tensor(value): - if isinstance(value, (list, tuple)) and len(value) > 0 and isinstance(value[0], np.ndarray): - value = np.array(value) - return torch.tensor(value) - - is_tensor = torch.is_tensor - elif tensor_type == TensorType.JAX: - if not is_flax_available(): - raise ImportError("Unable to convert output to JAX tensors format, JAX is not installed.") - import jax.numpy as jnp # noqa: F811 - - as_tensor = jnp.array - is_tensor = _is_jax - else: - as_tensor = np.asarray - is_tensor = _is_numpy - - # Do the tensor conversion in batch - for key, value in self.items(): - try: - if not is_tensor(value): - tensor = as_tensor(value) - - self[key] = tensor - except: # noqa E722 - if key == "overflowing_values": - raise ValueError("Unable to create tensor returning overflowing values of different lengths. ") - raise ValueError( - "Unable to create tensor, you should probably activate padding " - "with 'padding=True' to have batched tensors with the same length." - ) - - return self - - @torch_required - # Copied from transformers.tokenization_utils_base.BatchEncoding.to with BatchEncoding->BatchFeature - def to(self, device: Union[str, "torch.device"]) -> "BatchFeature": - """ - Send all values to device by calling `v.to(device)` (PyTorch only). 
- - Args: - device (`str` or `torch.device`): The device to put the tensors on. - - Returns: - [`BatchFeature`]: The same instance after modification. - """ - - # This check catches things like APEX blindly calling "to" on all inputs to a module - # Otherwise it passes the casts down and casts the LongTensor containing the token idxs - # into a HalfTensor - if isinstance(device, str) or _is_torch_device(device) or isinstance(device, int): - self.data = {k: v.to(device=device) for k, v in self.data.items()} - else: - logger.warning(f"Attempting to cast a BatchFeature to type {str(device)}. This is not supported.") - return self - class ImageProcessorMixin(PushToHubMixin): """ @@ -218,12 +85,10 @@ def _set_processor_class(self, processor_class: str): self._processor_class = processor_class @classmethod - def from_pretrained( - cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs - ): + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs): r""" - Instantiate a type of [`~image_processing_utils.ImageProcessorMixin`] from a image processor, *e.g.* a - derived class of [`BaseImageProcessor`]. + Instantiate a type of [`~image_processing_utils.ImageProcessorMixin`] from a image processor, *e.g.* a derived + class of [`BaseImageProcessor`]. Args: pretrained_model_name_or_path (`str` or `os.PathLike`): @@ -241,8 +106,8 @@ def from_pretrained( Path to a directory in which a downloaded pretrained model image processor should be cached if the standard cache should not be used. force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force to (re-)download the image processor files and override the cached versions - if they exist. + Whether or not to force to (re-)download the image processor files and override the cached versions if + they exist. resume_download (`bool`, *optional*, defaults to `False`): Whether or not to delete incompletely received file. 
Attempts to resume the download if such a file exists. @@ -496,16 +361,16 @@ def to_dict(self) -> Dict[str, Any]: @classmethod def from_json_file(cls, json_file: Union[str, os.PathLike]): """ - Instantiates an image processor of type [`~image_processing_utils.ImageProcessorMixin`] from the path to - a JSON file of parameters. + Instantiates an image processor of type [`~image_processing_utils.ImageProcessorMixin`] from the path to a JSON + file of parameters. Args: json_file (`str` or `os.PathLike`): Path to the JSON file containing the parameters. Returns: - A image processor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The image_processor - object instantiated from that JSON file. + A image processor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The image_processor object + instantiated from that JSON file. """ with open(json_file, "r", encoding="utf-8") as reader: text = reader.read() diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 7585aff76432..7551a431b4ba 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -13,32 +13,43 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Optional, Tuple, List, Union +from typing import TYPE_CHECKING, List, Optional, Tuple, Union -import PIL import numpy as np +import PIL + +from transformers.utils.import_utils import is_flax_available, is_tf_available, is_torch_available from .image_utils import ( ChannelDimension, get_image_size, infer_channel_dimension, - is_torch_tensor, + is_jax_tensor, is_tf_tensor, - is_jax_tensor + is_torch_tensor, ) +if TYPE_CHECKING: + if is_torch_available(): + import torch + if is_tf_available(): + import tensorflow as tf + if is_flax_available(): + import jax.numpy as jnp + + def to_pil_image( - image: Union[np.ndarray, PIL.Image.Image, "torch.Tensor", "tf.Tensor"], + image: Union[np.ndarray, PIL.Image.Image, "torch.Tensor", "tf.Tensor", "jnp.ndarray"], channel_dim: Optional[ChannelDimension] = None, - rescale=None + rescale=None, ) -> PIL.Image.Image: """ Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if needed. Args: - image (`PIL.Image.Image` or `numpy.ndarray` or `torch.Tensor`): + image (`PIL.Image.Image`, `numpy.ndarray`, `torch.Tensor`, `tf.Tensor`): The image to convert to the PIL Image format. rescale (`bool`, *optional*): Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will default @@ -53,7 +64,7 @@ def to_pil_image( image = np.array(image) if not isinstance(image, np.ndarray): - raise ValueError("Input image must be of type PIL.Image.Image, numpy.ndarray or torch.Tensor") + raise ValueError("Input image type not supported: {}".format(type(image))) # If the channel as been moved to first dim, we put it back at the end. 
channel_dim = infer_channel_dimension(image) if channel_dim is None else channel_dim @@ -72,7 +83,7 @@ def get_resize_output_image_size( input_image: np.ndarray, size: Union[int, Tuple[int, int], List[int]], default_to_square: bool = True, - max_size: int = None + max_size: int = None, ) -> np.ndarray: if isinstance(size, (tuple, list)): if len(size) == 2: diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index e23321be4780..15bcf9954261 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -23,8 +23,8 @@ import requests -from .utils import is_torch_available, is_tf_available, is_jax_available -from .utils.generic import _is_torch, _is_tensorflow, _is_jax +from .utils import is_flax_available, is_tf_available, is_torch_available +from .utils.generic import _is_jax, _is_tensorflow, _is_torch IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406] @@ -46,7 +46,7 @@ def is_tf_tensor(obj): def is_jax_tensor(obj): - return _is_jax(obj) if is_jax_available() else False + return _is_jax(obj) if is_flax_available() else False class ChannelDimension(enum.Enum): From 6b678fb4397a5dc2ddffcf93271dd7c534a964ca Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 13:18:18 +0100 Subject: [PATCH 06/31] Fixup and docs --- .../en/internal/image_processing_utils.mdx | 24 +++++++++ src/transformers/image_transforms.py | 50 ++++++++++++++++--- src/transformers/image_utils.py | 6 +-- 3 files changed, 70 insertions(+), 10 deletions(-) create mode 100644 docs/source/en/internal/image_processing_utils.mdx diff --git a/docs/source/en/internal/image_processing_utils.mdx b/docs/source/en/internal/image_processing_utils.mdx new file mode 100644 index 000000000000..ae4f826517aa --- /dev/null +++ b/docs/source/en/internal/image_processing_utils.mdx @@ -0,0 +1,24 @@ + + +# Utilities for Image Processors + +This page lists all the utility functions used by the image processors, mainly the functional +transformations used to process 
the images. + +Most of those are only useful if you are studying the code of the image processors in the library. + +## Image Transformations + +[[autodoc]] image_transforms.to_pil_image + +[[autodoc]] image_transforms.resize diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 7585aff76432..9112506e8ebf 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -13,25 +13,34 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional, Tuple, List, Union +from typing import TYPE_CHECKING, List, Optional, Tuple, Union -import PIL import numpy as np +import PIL + +from transformers.utils.import_utils import is_tf_available, is_torch_available from .image_utils import ( ChannelDimension, get_image_size, infer_channel_dimension, - is_torch_tensor, + is_jax_tensor, is_tf_tensor, - is_jax_tensor + is_torch_tensor, ) +if TYPE_CHECKING: + if is_torch_available(): + import torch + if is_tf_available(): + import tensorflow as tf + + def to_pil_image( image: Union[np.ndarray, PIL.Image.Image, "torch.Tensor", "tf.Tensor"], channel_dim: Optional[ChannelDimension] = None, - rescale=None + rescale=None, ) -> PIL.Image.Image: """ Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if @@ -70,10 +79,37 @@ def to_pil_image( def get_resize_output_image_size( input_image: np.ndarray, - size: Union[int, Tuple[int, int], List[int]], + size: Union[int, Tuple[int, int], List[int], Tuple[int]], default_to_square: bool = True, - max_size: int = None + max_size: int = None, ) -> np.ndarray: + """ + Find the target (height, width) dimension of the output image after resizing given the input image and the desired + size. + + Args: + input_image (`np.ndarray`): + The image to resize. + size (`int` or `Tuple[int, int]` or List[int] or Tuple[int]): + The size to use for resizing the image. 
If `size` is a sequence like (h, w), output size will be matched to + this. + + If `size` is an int and `default_to_square` is `True`, then image will be resized to (size, size). If + `size` is an int and `default_to_square` is `False`, then smaller edge of the image will be matched to this + number. i.e, if height > width, then image will be rescaled to (size * height / width, size). + resample (`int`, *optional*, defaults to `PIL.Image.BILINEAR`): + The filter to user for resampling. + default_to_square (`bool`, *optional*, defaults to `True`): + How to convert `size` when it is a single int. If set to `True`, the `size` will be converted to a square + (`size`,`size`). If set to `False`, will replicate + [`torchvision.transforms.Resize`](https://pytorch.org/vision/stable/transforms.html#torchvision.transforms.Resize) + with support for resizing only the smallest edge and providing an optional `max_size`. + max_size (`int`, *optional*, defaults to `None`): + The maximum allowed for the longer edge of the resized image: if the longer edge of the image is greater + than `max_size` after being resized according to `size`, then the image is resized again so that the longer + edge is equal to `max_size`. As a result, `size` might be overruled, i.e the smaller edge may be shorter + than `size`. Only used if `default_to_square` is `False`. 
+ """ if isinstance(size, (tuple, list)): if len(size) == 2: return size diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index e23321be4780..15bcf9954261 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -23,8 +23,8 @@ import requests -from .utils import is_torch_available, is_tf_available, is_jax_available -from .utils.generic import _is_torch, _is_tensorflow, _is_jax +from .utils import is_flax_available, is_tf_available, is_torch_available +from .utils.generic import _is_jax, _is_tensorflow, _is_torch IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406] @@ -46,7 +46,7 @@ def is_tf_tensor(obj): def is_jax_tensor(obj): - return _is_jax(obj) if is_jax_available() else False + return _is_jax(obj) if is_flax_available() else False class ChannelDimension(enum.Enum): From db9343777571f0ee77307646755cab377ea6d388 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 13:24:56 +0100 Subject: [PATCH 07/31] Fixup and docs --- .../en/internal/image_processing_utils.mdx | 5 +++ src/transformers/__init__.py | 2 ++ src/transformers/image_processing_utils.py | 32 +++++++++---------- .../utils/dummy_vision_objects.py | 7 ++++ 4 files changed, 30 insertions(+), 16 deletions(-) diff --git a/docs/source/en/internal/image_processing_utils.mdx b/docs/source/en/internal/image_processing_utils.mdx index ae4f826517aa..4d5831a12fd6 100644 --- a/docs/source/en/internal/image_processing_utils.mdx +++ b/docs/source/en/internal/image_processing_utils.mdx @@ -22,3 +22,8 @@ Most of those are only useful if you are studying the code of the image processo [[autodoc]] image_transforms.to_pil_image [[autodoc]] image_transforms.resize + + +## ImageProcessorMixin + +[[autodoc]] image_processing_utils.ImageProcessorMixin diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index dcacc328e8a8..a5c3e0d90575 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -632,6 +632,7 @@ name for 
name in dir(dummy_vision_objects) if not name.startswith("_") ] else: + _import_structure["image_processing_utils"] = ["ImageProcessorMixin"] _import_structure["image_transforms"] = ["resize", "to_pil_image"] _import_structure["image_utils"] = ["ImageFeatureExtractionMixin"] _import_structure["models.beit"].append("BeitFeatureExtractor") @@ -3340,6 +3341,7 @@ except OptionalDependencyNotAvailable: from .utils.dummy_vision_objects import * else: + from .image_processing_utils import ImageProcessorMixin from .image_transforms import resize, to_pil_image from .image_utils import ImageFeatureExtractionMixin from .models.beit import BeitFeatureExtractor diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 908216cd4634..0ec4a429e1a5 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -14,29 +14,31 @@ # limitations under the License. import copy -import os import json +import os from typing import Any, Dict, Tuple, Union import numpy as np + from requests import HTTPError from .dynamic_module_utils import custom_object_save from .utils import ( - IMAGE_PROCESSOR_NAME, - PushToHubMixin, - logging, HUGGINGFACE_CO_RESOLVE_ENDPOINT, + IMAGE_PROCESSOR_NAME, EntryNotFoundError, + PushToHubMixin, RepositoryNotFoundError, RevisionNotFoundError, cached_path, copy_func, hf_bucket_url, + is_offline_mode, is_remote_url, - is_offline_mode + logging, ) + logger = logging.get_logger(__name__) @@ -64,12 +66,10 @@ def _set_processor_class(self, processor_class: str): self._processor_class = processor_class @classmethod - def from_pretrained( - cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs - ): + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs): r""" - Instantiate a type of [`~image_processing_utils.ImageProcessorMixin`] from a image processor, *e.g.* a - derived class of [`BaseImageProcessor`]. 
+ Instantiate a type of [`~image_processing_utils.ImageProcessorMixin`] from a image processor, *e.g.* a derived + class of [`BaseImageProcessor`]. Args: pretrained_model_name_or_path (`str` or `os.PathLike`): @@ -87,8 +87,8 @@ def from_pretrained( Path to a directory in which a downloaded pretrained model image processor should be cached if the standard cache should not be used. force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force to (re-)download the image processor files and override the cached versions - if they exist. + Whether or not to force to (re-)download the image processor files and override the cached versions if + they exist. resume_download (`bool`, *optional*, defaults to `False`): Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists. @@ -342,16 +342,16 @@ def to_dict(self) -> Dict[str, Any]: @classmethod def from_json_file(cls, json_file: Union[str, os.PathLike]): """ - Instantiates an image processor of type [`~image_processing_utils.ImageProcessorMixin`] from the path to - a JSON file of parameters. + Instantiates an image processor of type [`~image_processing_utils.ImageProcessorMixin`] from the path to a JSON + file of parameters. Args: json_file (`str` or `os.PathLike`): Path to the JSON file containing the parameters. Returns: - A image processor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The image_processor - object instantiated from that JSON file. + A image processor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The image_processor object + instantiated from that JSON file. 
""" with open(json_file, "r", encoding="utf-8") as reader: text = reader.read() diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 0d9da1b7a16d..5756f67326b7 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -3,6 +3,13 @@ from ..utils import DummyObject, requires_backends +class ImageProcessorMixin(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + def resize(*args, **kwargs): requires_backends(resize, ["vision"]) From bd890d599d3929403c68cd65164200e695bf56e9 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 13:18:18 +0100 Subject: [PATCH 08/31] Fixup and docs --- .../en/internal/image_processing_utils.mdx | 24 +++++++++++++++ src/transformers/image_transforms.py | 29 ++++++++++++++++++- 2 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 docs/source/en/internal/image_processing_utils.mdx diff --git a/docs/source/en/internal/image_processing_utils.mdx b/docs/source/en/internal/image_processing_utils.mdx new file mode 100644 index 000000000000..ae4f826517aa --- /dev/null +++ b/docs/source/en/internal/image_processing_utils.mdx @@ -0,0 +1,24 @@ + + +# Utilities for Image Processors + +This page lists all the utility functions used by the image processors, mainly the functional +transformations used to process the images. + +Most of those are only useful if you are studying the code of the image processors in the library. 
+ +## Image Transformations + +[[autodoc]] image_transforms.to_pil_image + +[[autodoc]] image_transforms.resize diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 7551a431b4ba..38640029eaa0 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -81,10 +81,37 @@ def to_pil_image( def get_resize_output_image_size( input_image: np.ndarray, - size: Union[int, Tuple[int, int], List[int]], + size: Union[int, Tuple[int, int], List[int], Tuple[int]], default_to_square: bool = True, max_size: int = None, ) -> np.ndarray: + """ + Find the target (height, width) dimension of the output image after resizing given the input image and the desired + size. + + Args: + input_image (`np.ndarray`): + The image to resize. + size (`int` or `Tuple[int, int]` or List[int] or Tuple[int]): + The size to use for resizing the image. If `size` is a sequence like (h, w), output size will be matched to + this. + + If `size` is an int and `default_to_square` is `True`, then image will be resized to (size, size). If + `size` is an int and `default_to_square` is `False`, then smaller edge of the image will be matched to this + number. i.e, if height > width, then image will be rescaled to (size * height / width, size). + resample (`int`, *optional*, defaults to `PIL.Image.BILINEAR`): + The filter to user for resampling. + default_to_square (`bool`, *optional*, defaults to `True`): + How to convert `size` when it is a single int. If set to `True`, the `size` will be converted to a square + (`size`,`size`). If set to `False`, will replicate + [`torchvision.transforms.Resize`](https://pytorch.org/vision/stable/transforms.html#torchvision.transforms.Resize) + with support for resizing only the smallest edge and providing an optional `max_size`. 
+ max_size (`int`, *optional*, defaults to `None`): + The maximum allowed for the longer edge of the resized image: if the longer edge of the image is greater + than `max_size` after being resized according to `size`, then the image is resized again so that the longer + edge is equal to `max_size`. As a result, `size` might be overruled, i.e the smaller edge may be shorter + than `size`. Only used if `default_to_square` is `False`. + """ if isinstance(size, (tuple, list)): if len(size) == 2: return size From 4b27a340e451b0a402e78886f965c1c617f1fcf4 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 13:24:56 +0100 Subject: [PATCH 09/31] Fixup and docs --- docs/source/en/internal/image_processing_utils.mdx | 5 +++++ src/transformers/__init__.py | 2 ++ src/transformers/utils/dummy_vision_objects.py | 7 +++++++ 3 files changed, 14 insertions(+) diff --git a/docs/source/en/internal/image_processing_utils.mdx b/docs/source/en/internal/image_processing_utils.mdx index ae4f826517aa..4d5831a12fd6 100644 --- a/docs/source/en/internal/image_processing_utils.mdx +++ b/docs/source/en/internal/image_processing_utils.mdx @@ -22,3 +22,8 @@ Most of those are only useful if you are studying the code of the image processo [[autodoc]] image_transforms.to_pil_image [[autodoc]] image_transforms.resize + + +## ImageProcessorMixin + +[[autodoc]] image_processing_utils.ImageProcessorMixin diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index dcacc328e8a8..a5c3e0d90575 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -632,6 +632,7 @@ name for name in dir(dummy_vision_objects) if not name.startswith("_") ] else: + _import_structure["image_processing_utils"] = ["ImageProcessorMixin"] _import_structure["image_transforms"] = ["resize", "to_pil_image"] _import_structure["image_utils"] = ["ImageFeatureExtractionMixin"] _import_structure["models.beit"].append("BeitFeatureExtractor") @@ -3340,6 +3341,7 @@ except 
OptionalDependencyNotAvailable: from .utils.dummy_vision_objects import * else: + from .image_processing_utils import ImageProcessorMixin from .image_transforms import resize, to_pil_image from .image_utils import ImageFeatureExtractionMixin from .models.beit import BeitFeatureExtractor diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 0d9da1b7a16d..5756f67326b7 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -3,6 +3,13 @@ from ..utils import DummyObject, requires_backends +class ImageProcessorMixin(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + def resize(*args, **kwargs): requires_backends(resize, ["vision"]) From ff0d49ec10c4d7ecda79248567a88a4285efbbd3 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Wed, 27 Jul 2022 17:30:59 +0100 Subject: [PATCH 10/31] BatchFeature for packaging image processor outputs --- src/transformers/image_processing_utils.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 0e8b02c56b52..5370b9d35cad 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -15,8 +15,8 @@ import copy import json -import os -from typing import Any, Dict, Tuple, Union +from collections import UserDict +from typing import Any, Dict, Optional, Tuple, Union import numpy as np @@ -31,13 +31,19 @@ PushToHubMixin, RepositoryNotFoundError, RevisionNotFoundError, + TensorType, cached_path, copy_func, hf_bucket_url, - is_offline_mode, + is_flax_available, is_remote_url, + is_torch_available, + is_tf_available, + is_offline_mode, logging, + torch_required, ) +from .utils.generic import _is_jax, _is_numpy, _is_torch_device logger = logging.get_logger(__name__) From 
2c2fa9aa63f734f67fde6876fc1db5a961020e08 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 14:59:34 +0100 Subject: [PATCH 11/31] Import BatchFeature from feature_extraction_utils --- src/transformers/image_processing_utils.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 5370b9d35cad..9aa4bebf89bf 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -15,8 +15,8 @@ import copy import json -from collections import UserDict -from typing import Any, Dict, Optional, Tuple, Union +import os +from typing import Any, Dict, Tuple, Union import numpy as np @@ -31,19 +31,14 @@ PushToHubMixin, RepositoryNotFoundError, RevisionNotFoundError, - TensorType, cached_path, copy_func, hf_bucket_url, - is_flax_available, - is_remote_url, - is_torch_available, - is_tf_available, is_offline_mode, + is_remote_url, logging, - torch_required, ) -from .utils.generic import _is_jax, _is_numpy, _is_torch_device + logger = logging.get_logger(__name__) From 7faf2e69167453d8c095d62790b3cd79b7ec3987 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 14:59:34 +0100 Subject: [PATCH 12/31] Import BatchFeature from feature_extraction_utils --- src/transformers/image_processing_utils.py | 173 +++------------------ src/transformers/image_transforms.py | 41 +++-- src/transformers/image_utils.py | 6 +- 3 files changed, 40 insertions(+), 180 deletions(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 2f1377b3773f..872ce352c3de 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -14,16 +14,16 @@ # limitations under the License. 
import copy -import os import json -from collections import UserDict -from typing import Any, Dict, Optional, Tuple, Union +import os +from typing import Any, Dict, Tuple, Union import numpy as np + from requests import HTTPError from .dynamic_module_utils import custom_object_save -from .image_utils import ImageType +from .feature_extraction_utils import BatchFeature as BaseBatchFeature from .utils import ( HUGGINGFACE_CO_RESOLVE_ENDPOINT, IMAGE_PROCESSOR_NAME, @@ -31,24 +31,21 @@ PushToHubMixin, RepositoryNotFoundError, RevisionNotFoundError, - TensorType, cached_path, copy_func, hf_bucket_url, - is_flax_available, - is_remote_url, - is_torch_available, - is_tf_available, is_offline_mode, + is_remote_url, logging, - torch_required, ) -from .utils.generic import _is_jax, _is_numpy, _is_torch_device + logger = logging.get_logger(__name__) -class BatchFeature(UserDict): +# TODO: Move BatchFeature to be imported by both feature_extraction_utils and image_processing_utils +# We override the class string here, but logic is the same. +class BatchFeature(BaseBatchFeature): r""" Holds the output of the image processor specific `__call__` methods. @@ -56,143 +53,13 @@ class BatchFeature(UserDict): Args: data (`dict`): - Dictionary of lists/arrays/tensors returned by the __call__/pad methods ('input_values', 'attention_mask', + Dictionary of lists/arrays/tensors returned by the __call__/pad methods ('pixel_values', 'attention_mask', etc.). tensor_type (`Union[None, str, TensorType]`, *optional*): You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at initialization. 
""" - def __init__(self, data: Optional[Dict[str, Any]] = None, tensor_type: Union[None, str, TensorType] = None): - super().__init__(data) - self.convert_to_tensors(tensor_type=tensor_type) - - # Copied from transformers.tokenization_utils_base.BatchEncoding.__getitem__ - def __getitem__(self, item: str) -> Any: - """ - If the key is a string, returns the value of the dict associated to `key` ('input_values', 'attention_mask', - etc.). - """ - if isinstance(item, str): - return self.data[item] - else: - raise KeyError("Indexing with integers is not available when using Python based feature extractors") - - # Copied from transformers.tokenization_utils_base.BatchEncoding.__getattr__ - def __getattr__(self, item: str): - try: - return self.data[item] - except KeyError: - raise AttributeError - - # Copied from transformers.feature_extraction_utils.BatchFeature.__getstate__ - def __getstate__(self): - return {"data": self.data} - - # Copied from transformers.feature_extraction_utils.BatchFeature.__setstate__ - def __setstate__(self, state): - if "data" in state: - self.data = state["data"] - - # Copied from transformers.tokenization_utils_base.BatchEncoding.keys - def keys(self): - return self.data.keys() - - # Copied from transformers.tokenization_utils_base.BatchEncoding.values - def values(self): - return self.data.values() - - # Copied from transformers.tokenization_utils_base.BatchEncoding.items - def items(self): - return self.data.items() - - def convert_to_tensors(self, tensor_type: Optional[Union[str, ImageType]] = None): - """ - Convert the inner content to tensors. - - Args: - tensor_type (`str` or [`~utils.ImageType`], *optional*): - The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.ImageType`]. If - `None`, no modification is done. 
- """ - if tensor_type is None: - return self - - # Convert to TensorType - if not isinstance(tensor_type, ImageType): - tensor_type = ImageType(tensor_type) - - # Get a function reference for the correct framework - if tensor_type == ImageType.TENSORFLOW: - if not is_tf_available(): - raise ImportError( - "Unable to convert output to TensorFlow tensors format, TensorFlow is not installed." - ) - import tensorflow as tf - - as_tensor = tf.constant - is_tensor = tf.is_tensor - elif tensor_type == ImageType.PYTORCH: - if not is_torch_available(): - raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.") - import torch - - def as_tensor(value): - if isinstance(value, (list, tuple)) and len(value) > 0 and isinstance(value[0], np.ndarray): - value = np.array(value) - return torch.tensor(value) - - is_tensor = torch.is_tensor - elif tensor_type == ImageType.JAX: - if not is_flax_available(): - raise ImportError("Unable to convert output to JAX tensors format, JAX is not installed.") - import jax.numpy as jnp # noqa: F811 - - as_tensor = jnp.array - is_tensor = _is_jax - else: - as_tensor = np.asarray - is_tensor = _is_numpy - - # Do the tensor conversion in batch - for key, value in self.items(): - try: - if not is_tensor(value): - tensor = as_tensor(value) - - self[key] = tensor - except: # noqa E722 - if key == "overflowing_values": - raise ValueError("Unable to create tensor returning overflowing values of different lengths. ") - raise ValueError( - "Unable to create tensor, you should probably activate padding " - "with 'padding=True' to have batched tensors with the same length." - ) - - return self - - @torch_required - # Copied from transformers.tokenization_utils_base.BatchEncoding.to with BatchEncoding->BatchFeature - def to(self, device: Union[str, "torch.device"]) -> "BatchFeature": - """ - Send all values to device by calling `v.to(device)` (PyTorch only). 
- - Args: - device (`str` or `torch.device`): The device to put the tensors on. - - Returns: - [`BatchFeature`]: The same instance after modification. - """ - - # This check catches things like APEX blindly calling "to" on all inputs to a module - # Otherwise it passes the casts down and casts the LongTensor containing the token idxs - # into a HalfTensor - if isinstance(device, str) or _is_torch_device(device) or isinstance(device, int): - self.data = {k: v.to(device=device) for k, v in self.data.items()} - else: - logger.warning(f"Attempting to cast a BatchFeature to type {str(device)}. This is not supported.") - return self - class ImageProcessorMixin(PushToHubMixin): """ @@ -218,12 +85,10 @@ def _set_processor_class(self, processor_class: str): self._processor_class = processor_class @classmethod - def from_pretrained( - cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs - ): + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs): r""" - Instantiate a type of [`~image_processing_utils.ImageProcessorMixin`] from a image processor, *e.g.* a - derived class of [`BaseImageProcessor`]. + Instantiate a type of [`~image_processing_utils.ImageProcessorMixin`] from a image processor, *e.g.* a derived + class of [`BaseImageProcessor`]. Args: pretrained_model_name_or_path (`str` or `os.PathLike`): @@ -241,8 +106,8 @@ def from_pretrained( Path to a directory in which a downloaded pretrained model image processor should be cached if the standard cache should not be used. force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force to (re-)download the image processor files and override the cached versions - if they exist. + Whether or not to force to (re-)download the image processor files and override the cached versions if + they exist. resume_download (`bool`, *optional*, defaults to `False`): Whether or not to delete incompletely received file. 
Attempts to resume the download if such a file exists. @@ -496,16 +361,16 @@ def to_dict(self) -> Dict[str, Any]: @classmethod def from_json_file(cls, json_file: Union[str, os.PathLike]): """ - Instantiates an image processor of type [`~image_processing_utils.ImageProcessorMixin`] from the path to - a JSON file of parameters. + Instantiates an image processor of type [`~image_processing_utils.ImageProcessorMixin`] from the path to a JSON + file of parameters. Args: json_file (`str` or `os.PathLike`): Path to the JSON file containing the parameters. Returns: - A image processor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The image_processor - object instantiated from that JSON file. + A image processor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The image_processor object + instantiated from that JSON file. """ with open(json_file, "r", encoding="utf-8") as reader: text = reader.read() diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 55d788d47531..e7e99c09f850 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -13,48 +13,43 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional, Tuple, List, Union +from typing import TYPE_CHECKING, List, Optional, Tuple, Union -import PIL import numpy as np +import PIL + +from transformers.utils.import_utils import is_flax_available, is_tf_available, is_torch_available from .image_utils import ( ChannelDimension, get_image_size, infer_channel_dimension, - is_torch_tensor, + is_jax_tensor, is_tf_tensor, - is_jax_tensor + is_torch_tensor, ) -def rescale(image: np.ndarray, scale: Union[float, int] = 255) -> np.ndarray: - """ - Rescales `image` by `scale`. - - Args: - image (`np.ndarray``): - The image to rescale. - scale (`float`, `int`): - The scale to use for rescaling the image. - - Returns: - image: A rescaled np.ndarray image. 
- """ - return image * scale +if TYPE_CHECKING: + if is_torch_available(): + import torch + if is_tf_available(): + import tensorflow as tf + if is_flax_available(): + import jax.numpy as jnp def to_pil_image( - image: Union[np.ndarray, PIL.Image.Image, "torch.Tensor", "tf.Tensor"], + image: Union[np.ndarray, PIL.Image.Image, "torch.Tensor", "tf.Tensor", "jnp.ndarray"], channel_dim: Optional[ChannelDimension] = None, - do_rescale: Optional[bool] = None + rescale=None, ) -> PIL.Image.Image: """ Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if needed. Args: - image (`PIL.Image.Image` or `numpy.ndarray` or `torch.Tensor`): + image (`PIL.Image.Image`, `numpy.ndarray`, `torch.Tensor`, `tf.Tensor`): The image to convert to the PIL Image format. rescale (`bool`, *optional*): Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will default @@ -69,7 +64,7 @@ def to_pil_image( image = np.array(image) if not isinstance(image, np.ndarray): - raise ValueError("Input image must be of type PIL.Image.Image, numpy.ndarray or torch.Tensor") + raise ValueError("Input image type not supported: {}".format(type(image))) # If the channel as been moved to first dim, we put it back at the end. 
channel_dim = infer_channel_dimension(image) if channel_dim is None else channel_dim @@ -88,7 +83,7 @@ def get_resize_output_image_size( input_image: np.ndarray, size: Union[int, Tuple[int, int], List[int]], default_to_square: bool = True, - max_size: int = None + max_size: int = None, ) -> np.ndarray: if isinstance(size, (tuple, list)): if len(size) == 2: diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index e5afb3122bcd..8fdf7aadac4d 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -23,8 +23,8 @@ import requests -from .utils import TensorType, is_torch_available, is_tf_available, is_jax_available -from .utils.generic import ExplicitEnum, _is_torch, _is_tensorflow, _is_jax, _is_numpy, to_numpy +from .utils import is_flax_available, is_tf_available, is_torch_available +from .utils.generic import _is_jax, _is_tensorflow, _is_torch IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406] @@ -61,7 +61,7 @@ def is_tf_tensor(obj): def is_jax_tensor(obj): - return _is_jax(obj) if is_jax_available() else False + return _is_jax(obj) if is_flax_available() else False def is_valid_image(img): From ccc15fb6887f748b98c67a3ad4521cfb96a0ddf7 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 13:18:18 +0100 Subject: [PATCH 13/31] Fixup and docs --- .../en/internal/image_processing_utils.mdx | 24 +++++++++++++++ src/transformers/image_transforms.py | 29 ++++++++++++++++++- 2 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 docs/source/en/internal/image_processing_utils.mdx diff --git a/docs/source/en/internal/image_processing_utils.mdx b/docs/source/en/internal/image_processing_utils.mdx new file mode 100644 index 000000000000..ae4f826517aa --- /dev/null +++ b/docs/source/en/internal/image_processing_utils.mdx @@ -0,0 +1,24 @@ + + +# Utilities for Image Processors + +This page lists all the utility functions used by the image processors, mainly the functional +transformations used to process 
the images. + +Most of those are only useful if you are studying the code of the image processors in the library. + +## Image Transformations + +[[autodoc]] image_transforms.to_pil_image + +[[autodoc]] image_transforms.resize diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index e7e99c09f850..c7e9c0ec9e20 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -81,10 +81,37 @@ def to_pil_image( def get_resize_output_image_size( input_image: np.ndarray, - size: Union[int, Tuple[int, int], List[int]], + size: Union[int, Tuple[int, int], List[int], Tuple[int]], default_to_square: bool = True, max_size: int = None, ) -> np.ndarray: + """ + Find the target (height, width) dimension of the output image after resizing given the input image and the desired + size. + + Args: + input_image (`np.ndarray`): + The image to resize. + size (`int` or `Tuple[int, int]` or List[int] or Tuple[int]): + The size to use for resizing the image. If `size` is a sequence like (h, w), output size will be matched to + this. + + If `size` is an int and `default_to_square` is `True`, then image will be resized to (size, size). If + `size` is an int and `default_to_square` is `False`, then smaller edge of the image will be matched to this + number. i.e, if height > width, then image will be rescaled to (size * height / width, size). + resample (`int`, *optional*, defaults to `PIL.Image.BILINEAR`): + The filter to user for resampling. + default_to_square (`bool`, *optional*, defaults to `True`): + How to convert `size` when it is a single int. If set to `True`, the `size` will be converted to a square + (`size`,`size`). If set to `False`, will replicate + [`torchvision.transforms.Resize`](https://pytorch.org/vision/stable/transforms.html#torchvision.transforms.Resize) + with support for resizing only the smallest edge and providing an optional `max_size`. 
+ max_size (`int`, *optional*, defaults to `None`): + The maximum allowed for the longer edge of the resized image: if the longer edge of the image is greater + than `max_size` after being resized according to `size`, then the image is resized again so that the longer + edge is equal to `max_size`. As a result, `size` might be overruled, i.e the smaller edge may be shorter + than `size`. Only used if `default_to_square` is `False`. + """ if isinstance(size, (tuple, list)): if len(size) == 2: return size From c8f8eb6e0c918ea2f82b0f0faba141489a508c42 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 13:24:56 +0100 Subject: [PATCH 14/31] Fixup and docs --- docs/source/en/internal/image_processing_utils.mdx | 5 +++++ src/transformers/__init__.py | 2 ++ src/transformers/utils/dummy_vision_objects.py | 7 +++++++ 3 files changed, 14 insertions(+) diff --git a/docs/source/en/internal/image_processing_utils.mdx b/docs/source/en/internal/image_processing_utils.mdx index ae4f826517aa..4d5831a12fd6 100644 --- a/docs/source/en/internal/image_processing_utils.mdx +++ b/docs/source/en/internal/image_processing_utils.mdx @@ -22,3 +22,8 @@ Most of those are only useful if you are studying the code of the image processo [[autodoc]] image_transforms.to_pil_image [[autodoc]] image_transforms.resize + + +## ImageProcessorMixin + +[[autodoc]] image_processing_utils.ImageProcessorMixin diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index dcacc328e8a8..a5c3e0d90575 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -632,6 +632,7 @@ name for name in dir(dummy_vision_objects) if not name.startswith("_") ] else: + _import_structure["image_processing_utils"] = ["ImageProcessorMixin"] _import_structure["image_transforms"] = ["resize", "to_pil_image"] _import_structure["image_utils"] = ["ImageFeatureExtractionMixin"] _import_structure["models.beit"].append("BeitFeatureExtractor") @@ -3340,6 +3341,7 @@ except 
OptionalDependencyNotAvailable: from .utils.dummy_vision_objects import * else: + from .image_processing_utils import ImageProcessorMixin from .image_transforms import resize, to_pil_image from .image_utils import ImageFeatureExtractionMixin from .models.beit import BeitFeatureExtractor diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 0d9da1b7a16d..5756f67326b7 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -3,6 +3,13 @@ from ..utils import DummyObject, requires_backends +class ImageProcessorMixin(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + def resize(*args, **kwargs): requires_backends(resize, ["vision"]) From 90093f4b989869cc4191c26ba27cd428af521cae Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Wed, 27 Jul 2022 17:30:59 +0100 Subject: [PATCH 15/31] BatchFeature for packaging image processor outputs --- src/transformers/image_processing_utils.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 872ce352c3de..587753856420 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -15,8 +15,8 @@ import copy import json -import os -from typing import Any, Dict, Tuple, Union +from collections import UserDict +from typing import Any, Dict, Optional, Tuple, Union import numpy as np @@ -31,13 +31,19 @@ PushToHubMixin, RepositoryNotFoundError, RevisionNotFoundError, + TensorType, cached_path, copy_func, hf_bucket_url, - is_offline_mode, + is_flax_available, is_remote_url, + is_torch_available, + is_tf_available, + is_offline_mode, logging, + torch_required, ) +from .utils.generic import _is_jax, _is_numpy, _is_torch_device logger = logging.get_logger(__name__) From 
d89c0513ba7e52c1de91870c6179d392a0d7114b Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 14:59:34 +0100 Subject: [PATCH 16/31] Import BatchFeature from feature_extraction_utils --- src/transformers/image_processing_utils.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 587753856420..38778039aee2 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -15,8 +15,8 @@ import copy import json -from collections import UserDict -from typing import Any, Dict, Optional, Tuple, Union +import os +from typing import Any, Dict, Tuple, Union import numpy as np @@ -31,19 +31,14 @@ PushToHubMixin, RepositoryNotFoundError, RevisionNotFoundError, - TensorType, cached_path, copy_func, hf_bucket_url, - is_flax_available, - is_remote_url, - is_torch_available, - is_tf_available, is_offline_mode, + is_remote_url, logging, - torch_required, ) -from .utils.generic import _is_jax, _is_numpy, _is_torch_device + logger = logging.get_logger(__name__) From 9bc91578fd60c5a2662440946807162159c327e8 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 13:24:56 +0100 Subject: [PATCH 17/31] Fixup and docs --- src/transformers/image_processing_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 38778039aee2..872ce352c3de 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -40,7 +40,6 @@ ) - logger = logging.get_logger(__name__) From 6ec382acb30842b9a1871e907c923b87293ca63e Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Wed, 27 Jul 2022 15:55:48 +0100 Subject: [PATCH 18/31] Mixin for saving the image processor --- src/transformers/image_processing_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git 
a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 872ce352c3de..6e7c8e530d72 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -361,16 +361,16 @@ def to_dict(self) -> Dict[str, Any]: @classmethod def from_json_file(cls, json_file: Union[str, os.PathLike]): """ - Instantiates an image processor of type [`~image_processing_utils.ImageProcessorMixin`] from the path to a JSON - file of parameters. + Instantiates an image processor of type [`~image_processing_utils.ImageProcessorMixin`] from the path to + a JSON file of parameters. Args: json_file (`str` or `os.PathLike`): Path to the JSON file containing the parameters. Returns: - A image processor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The image_processor object - instantiated from that JSON file. + A image processor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The image_processor + object instantiated from that JSON file. """ with open(json_file, "r", encoding="utf-8") as reader: text = reader.read() From 56ee6ad282f0d17f3e100655ecef58b0ae0cb354 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 13:24:56 +0100 Subject: [PATCH 19/31] Fixup and docs --- src/transformers/image_processing_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 6e7c8e530d72..872ce352c3de 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -361,16 +361,16 @@ def to_dict(self) -> Dict[str, Any]: @classmethod def from_json_file(cls, json_file: Union[str, os.PathLike]): """ - Instantiates an image processor of type [`~image_processing_utils.ImageProcessorMixin`] from the path to - a JSON file of parameters. 
+ Instantiates an image processor of type [`~image_processing_utils.ImageProcessorMixin`] from the path to a JSON + file of parameters. Args: json_file (`str` or `os.PathLike`): Path to the JSON file containing the parameters. Returns: - A image processor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The image_processor - object instantiated from that JSON file. + A image processor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The image_processor object + instantiated from that JSON file. """ with open(json_file, "r", encoding="utf-8") as reader: text = reader.read() From 6b88d5f8e09d3ea7d22317eb241df67b7286697d Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 15:57:24 +0100 Subject: [PATCH 20/31] Add rescale back and remove ImageType --- .../en/internal/image_processing_utils.mdx | 5 ++- src/transformers/__init__.py | 4 +-- src/transformers/image_transforms.py | 18 ++++++++++- src/transformers/image_utils.py | 32 ++----------------- .../models/glpn/image_processing_glpn.py | 23 +++++++++---- .../utils/dummy_vision_objects.py | 4 +++ 6 files changed, 46 insertions(+), 40 deletions(-) diff --git a/docs/source/en/internal/image_processing_utils.mdx b/docs/source/en/internal/image_processing_utils.mdx index 4d5831a12fd6..8bdf0ed11099 100644 --- a/docs/source/en/internal/image_processing_utils.mdx +++ b/docs/source/en/internal/image_processing_utils.mdx @@ -19,10 +19,13 @@ Most of those are only useful if you are studying the code of the image processo ## Image Transformations -[[autodoc]] image_transforms.to_pil_image +[[autodoc]] image_transforms.rescale [[autodoc]] image_transforms.resize +[[autodoc]] image_transforms.to_pil_image + + ## ImageProcessorMixin diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index a5c3e0d90575..9b34e4cea7f7 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -633,7 +633,7 @@ ] else: _import_structure["image_processing_utils"] = 
["ImageProcessorMixin"] - _import_structure["image_transforms"] = ["resize", "to_pil_image"] + _import_structure["image_transforms"] = ["rescale", "resize", "to_pil_image"] _import_structure["image_utils"] = ["ImageFeatureExtractionMixin"] _import_structure["models.beit"].append("BeitFeatureExtractor") _import_structure["models.clip"].append("CLIPFeatureExtractor") @@ -3342,7 +3342,7 @@ from .utils.dummy_vision_objects import * else: from .image_processing_utils import ImageProcessorMixin - from .image_transforms import resize, to_pil_image + from .image_transforms import rescale, resize, to_pil_image from .image_utils import ImageFeatureExtractionMixin from .models.beit import BeitFeatureExtractor from .models.clip import CLIPFeatureExtractor, CLIPProcessor diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index c7e9c0ec9e20..b15a1372a953 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -39,10 +39,26 @@ import jax.numpy as jnp +def rescale(image: np.ndarray, scale: Union[float, int] = 255) -> np.ndarray: + """ + Rescales `image` by `scale`. + + Args: + image (`np.ndarray``): + The image to rescale. + scale (`float`, `int`): + The scale to use for rescaling the image. + + Returns: + image: A rescaled np.ndarray image. + """ + return image * scale + + def to_pil_image( image: Union[np.ndarray, PIL.Image.Image, "torch.Tensor", "tf.Tensor", "jnp.ndarray"], channel_dim: Optional[ChannelDimension] = None, - rescale=None, + do_rescale=None, ) -> PIL.Image.Image: """ Converts `image` to a PIL Image. 
Optionally rescales it and puts the channel dimension back as the last axis if diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index 8fdf7aadac4d..3bb72816ced2 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -24,7 +24,7 @@ import requests from .utils import is_flax_available, is_tf_available, is_torch_available -from .utils.generic import _is_jax, _is_tensorflow, _is_torch +from .utils.generic import _is_jax, _is_tensorflow, _is_torch, to_numpy IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406] @@ -36,22 +36,12 @@ PIL.Image.Image, np.ndarray, "torch.Tensor", List[PIL.Image.Image], List[np.ndarray], List["torch.Tensor"] # noqa ] + class ChannelDimension(enum.Enum): FIRST = 1 LAST = 3 -class ImageType(ExplicitEnum): - """ - Possible image data formats that can be fed into an image processor - """ - PYTORCH = "pt" - TENSORFLOW = "tf" - NUMPY = "np" - JAX = "jax" - PIL = "pillow" - - def is_torch_tensor(obj): return _is_torch(obj) if is_torch_available() else False @@ -83,24 +73,8 @@ def is_batched(img): return False -def get_image_type(obj) -> TensorType: - if is_torch_tensor(obj): - return TensorType.TORCH - elif is_tf_tensor(obj): - return TensorType.TF - elif is_jax_tensor(obj): - return TensorType.JAX - elif _is_numpy(obj): - return TensorType.NUMPY - elif isinstance(obj, PIL.Image.Image): - return TensorType.PIL - else: - raise ValueError("Could not infer tensor type") - - def to_numpy_array(img) -> np.ndarray: - input_type = get_image_type(img) - if input_type == ImageType.PIL: + if isinstance(img, PIL.Image.Image): return np.array(img) return to_numpy(img) diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py index bdc33e80ce9f..c07163a7e1d9 100644 --- a/src/transformers/models/glpn/image_processing_glpn.py +++ b/src/transformers/models/glpn/image_processing_glpn.py @@ -19,23 +19,30 @@ import PIL.Image from numpy import np +from 
transformers.utils.generic import TensorType + from ...image_processing_utils import BaseImageProcessor, BatchFeature -from ...image_transforms import resize, rescale -from ...image_utils import ImageType, is_batched, to_numpy_array, valid_images, get_image_size +from ...image_transforms import rescale, resize +from ...image_utils import get_image_size, is_batched, to_numpy_array, valid_images from ...utils import logging + logger = logging.get_logger(__name__) class GLPNImageProcessor(BaseImageProcessor): - def __init__(self, do_resize=True, do_rescale=True, size_divisor=32, resample=PIL.Image.Resampling.BILINEAR, **kwargs) -> None: + def __init__( + self, do_resize=True, do_rescale=True, size_divisor=32, resample=PIL.Image.Resampling.BILINEAR, **kwargs + ) -> None: self.do_resize = do_resize self.do_rescale = do_rescale self.size_divisor = size_divisor self.resample = resample super().__init__(**kwargs) - def resize(self, image: np.ndarray, size_divisor: Union[int, float], resample: PIL.Image.Resampling, **kwargs) -> np.ndarray: + def resize( + self, image: np.ndarray, size_divisor: Union[int, float], resample: PIL.Image.Resampling, **kwargs + ) -> np.ndarray: height, width = get_image_size(image) new_h = height // size_divisor * size_divisor new_w = width // size_divisor * size_divisor @@ -45,14 +52,16 @@ def resize(self, image: np.ndarray, size_divisor: Union[int, float], resample: P def rescale(self, image: np.ndarray, scale: Union[int, float], **kwargs) -> np.ndarray: return rescale(image, scale, **kwargs) - def preprocess(self, images, do_resize=None, do_rescale=None, size_divisor=None, resample=None, return_tensors=None, **kwargs) -> BatchFeature: + def preprocess( + self, images, do_resize=None, do_rescale=None, size_divisor=None, resample=None, return_tensors=None, **kwargs + ) -> BatchFeature: do_resize = do_resize if do_resize is not None else self.do_resize do_rescale = do_rescale if do_rescale is not None else self.do_rescale size_divisor = 
size_divisor if size_divisor is not None else self.size_divisor resample = resample if resample is not None else self.resample # If a return type isn't specified, default to numpy arrays. - return_tensors = ImageType.NUMPY if return_tensors is None else return_tensors + return_tensors = TensorType.NUMPY if return_tensors is None else return_tensors if do_resize and size_divisor is None: raise ValueError("size_divisor is required for resizing") @@ -70,7 +79,7 @@ def preprocess(self, images, do_resize=None, do_rescale=None, size_divisor=None, images = [self.resize(image, size_divisor=size_divisor, resample=resample) for image in images] if do_rescale: - images = [self.rescale(image) for image in images] + images = [self.rescale(image, scale=255) for image in images] data = {"pixel_values": images} return BatchFeature(**data, return_tensors=return_tensors) diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 5756f67326b7..6622564eafd6 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -10,6 +10,10 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +def rescale(*args, **kwargs): + requires_backends(rescale, ["vision"]) + + def resize(*args, **kwargs): requires_backends(resize, ["vision"]) From 67077f1de95425da52541078f0ce0eb121158889 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 16:19:42 +0100 Subject: [PATCH 21/31] fix import mistake --- src/transformers/models/glpn/image_processing_glpn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py index c07163a7e1d9..8ba6568bfcf0 100644 --- a/src/transformers/models/glpn/image_processing_glpn.py +++ b/src/transformers/models/glpn/image_processing_glpn.py @@ -17,7 +17,7 @@ from typing import Union import PIL.Image -from numpy import 
np +import numpy as np from transformers.utils.generic import TensorType From 60c56e5dc4d428b83ca1f23b4162596232ade716 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Fri, 29 Jul 2022 15:53:37 +0100 Subject: [PATCH 22/31] Data format flag for rescale --- src/transformers/image_transforms.py | 37 ++++++++++++++++------------ 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 16b297f3abfc..950a75e08141 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -39,22 +39,6 @@ import jax.numpy as jnp -def rescale(image: np.ndarray, scale: Union[float, int] = 255) -> np.ndarray: - """ - Rescales `image` by `scale`. - - Args: - image (`np.ndarray``): - The image to rescale. - scale (`float`, `int`): - The scale to use for rescaling the image. - - Returns: - image: A rescaled np.ndarray image. - """ - return image * scale - - def to_channel_dimension_format(image: np.ndarray, channel_dim: Union[ChannelDimension, str]) -> np.ndarray: """ Converts `image` to the channel dimension format specified by `channel_dim`. @@ -82,6 +66,27 @@ def to_channel_dimension_format(image: np.ndarray, channel_dim: Union[ChannelDim raise ValueError("Unsupported channel dimension format: {}".format(channel_dim)) +def rescale( + image: np.ndarray, scale: Union[float, int] = 255, data_format: Optional[ChannelDimension] = None +) -> np.ndarray: + """ + Rescales `image` by `scale`. + + Args: + image (`np.ndarray``): + The image to rescale. + scale (`float`, `int`): + The scale to use for rescaling the image. + + Returns: + image: A rescaled np.ndarray image. 
+ """ + rescaled_image = image * scale + if data_format is not None: + rescaled_image = to_channel_dimension_format(rescaled_image, data_format) + return rescaled_image + + def to_pil_image( image: Union[np.ndarray, PIL.Image.Image, "torch.Tensor", "tf.Tensor", "jnp.Tensor"], do_rescale=None ) -> PIL.Image.Image: From 9294dbcef3e4cebb4ac68efd614c47e8ee8a6638 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Fri, 29 Jul 2022 15:59:31 +0100 Subject: [PATCH 23/31] Fix typo --- src/transformers/image_transforms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 950a75e08141..1863c01d60ec 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -194,7 +194,7 @@ def resize( size: Tuple[int, int], resample=PIL.Image.Resampling.BILINEAR, data_format: Optional[ChannelDimension] = None, -) -> np.np.ndarray: +) -> np.ndarray: """ Resizes `image` to (h, w) specified by `size` using the PIL library. From 88b82e936793a2746c56bdc0c47c3c4b4c8b1590 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Fri, 29 Jul 2022 16:47:43 +0100 Subject: [PATCH 24/31] Fixes to make IP and FE outputs match --- src/transformers/image_transforms.py | 8 +++- .../models/glpn/image_processing_glpn.py | 37 ++++++++++++++----- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 1863c01d60ec..9f7e1f48520f 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -67,7 +67,7 @@ def to_channel_dimension_format(image: np.ndarray, channel_dim: Union[ChannelDim def rescale( - image: np.ndarray, scale: Union[float, int] = 255, data_format: Optional[ChannelDimension] = None + image: np.ndarray, scale: Union[float, int] = 255, data_format: Optional[ChannelDimension] = None, dtype=np.float32 ) -> np.ndarray: """ Rescales `image` by `scale`. 
@@ -77,6 +77,11 @@ def rescale( The image to rescale. scale (`float`, `int`): The scale to use for rescaling the image. + data_format (`ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + dtype (`np.dtype`, *optional*): + The dtype of the output image. Defaults to `np.float32`.Used for backwards compatibility + with feature extractors Returns: image: A rescaled np.ndarray image. @@ -84,6 +89,7 @@ def rescale( rescaled_image = image * scale if data_format is not None: rescaled_image = to_channel_dimension_format(rescaled_image, data_format) + rescaled_image = rescaled_image.astype(dtype) return rescaled_image diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py index cce1dccf6aff..6ab3d85221eb 100644 --- a/src/transformers/models/glpn/image_processing_glpn.py +++ b/src/transformers/models/glpn/image_processing_glpn.py @@ -14,7 +14,7 @@ # limitations under the License. 
"""Image processor class for GLPN.""" -from typing import Union +from typing import Optional, Union import numpy as np import PIL.Image @@ -22,8 +22,8 @@ from transformers.utils.generic import TensorType from ...image_processing_utils import BaseImageProcessor, BatchFeature -from ...image_transforms import rescale, resize -from ...image_utils import get_image_size, is_batched, to_numpy_array, valid_images +from ...image_transforms import rescale, resize, to_channel_dimension_format +from ...image_utils import ChannelDimension, get_image_size, is_batched, to_numpy_array, valid_images from ...utils import logging @@ -41,19 +41,34 @@ def __init__( super().__init__(**kwargs) def resize( - self, image: np.ndarray, size_divisor: Union[int, float], resample: PIL.Image.Resampling, **kwargs + self, + image: np.ndarray, + size_divisor: Union[int, float], + resample: PIL.Image.Resampling, + data_format: Optional[ChannelDimension] = None, + **kwargs ) -> np.ndarray: height, width = get_image_size(image) new_h = height // size_divisor * size_divisor new_w = width // size_divisor * size_divisor - image = resize(image, (new_h, new_w), resample=resample, **kwargs) + image = resize(image, (new_h, new_w), resample=resample, data_format=data_format, **kwargs) return image - def rescale(self, image: np.ndarray, scale: Union[int, float], **kwargs) -> np.ndarray: - return rescale(image, scale, **kwargs) + def rescale( + self, image: np.ndarray, scale: Union[int, float], data_format: Optional[ChannelDimension] = None, **kwargs + ) -> np.ndarray: + return rescale(image=image, scale=scale, data_format=data_format, **kwargs) def preprocess( - self, images, do_resize=None, do_rescale=None, size_divisor=None, resample=None, return_tensors=None, **kwargs + self, + images, + do_resize: bool = None, + do_rescale: bool = None, + size_divisor: int = None, + resample: PIL.Image.Resampling = None, + return_tensors: Optional[Union[TensorType, str]] = None, + data_format: ChannelDimension = 
ChannelDimension.FIRST, + **kwargs ) -> BatchFeature: do_resize = do_resize if do_resize is not None else self.do_resize do_rescale = do_rescale if do_rescale is not None else self.do_rescale @@ -79,7 +94,9 @@ def preprocess( images = [self.resize(image, size_divisor=size_divisor, resample=resample) for image in images] if do_rescale: - images = [self.rescale(image, scale=255) for image in images] + images = [self.rescale(image, scale=1/255) for image in images] + + images = [to_channel_dimension_format(image, data_format) for image in images] data = {"pixel_values": images} - return BatchFeature(**data, return_tensors=return_tensors) + return BatchFeature(data=data, tensor_type=return_tensors) From 082e4ff9631b934c7eb25ba211ed4fd351171fbb Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Tue, 2 Aug 2022 19:18:53 +0100 Subject: [PATCH 25/31] Remove default to numpy batching --- src/transformers/models/glpn/image_processing_glpn.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py index 6ab3d85221eb..eae2607334db 100644 --- a/src/transformers/models/glpn/image_processing_glpn.py +++ b/src/transformers/models/glpn/image_processing_glpn.py @@ -75,9 +75,6 @@ def preprocess( size_divisor = size_divisor if size_divisor is not None else self.size_divisor resample = resample if resample is not None else self.resample - # If a return type isn't specified, default to numpy arrays. 
- return_tensors = TensorType.NUMPY if return_tensors is None else return_tensors - if do_resize and size_divisor is None: raise ValueError("size_divisor is required for resizing") From bf7335821e3e262d2a3b56749c5442d4235dc6a1 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Wed, 3 Aug 2022 12:53:39 +0100 Subject: [PATCH 26/31] Fix up --- src/transformers/image_processing_utils.py | 3 +-- src/transformers/image_transforms.py | 4 ++-- src/transformers/models/glpn/image_processing_glpn.py | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index a15b188349e2..756a4cee7823 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -313,8 +313,7 @@ def get_image_processor_dict( @classmethod def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): """ - Instantiates a type of [`~image_processing_utils.ImageProcessorMixin`] from a Python dictionary of - parameters. + Instantiates a type of [`~image_processing_utils.ImageProcessorMixin`] from a Python dictionary of parameters. Args: image_processor_dict (`Dict[str, Any]`): diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 871665455f3c..99149682616b 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -80,8 +80,8 @@ def rescale( data_format (`ChannelDimension`, *optional*): The channel dimension format of the image. If not provided, it will be the same as the input image. dtype (`np.dtype`, *optional*): - The dtype of the output image. Defaults to `np.float32`.Used for backwards compatibility - with feature extractors + The dtype of the output image. Defaults to `np.float32`.Used for backwards compatibility with feature + extractors Returns: image: A rescaled np.ndarray image. 
diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py index eae2607334db..6c13650c2a5f 100644 --- a/src/transformers/models/glpn/image_processing_glpn.py +++ b/src/transformers/models/glpn/image_processing_glpn.py @@ -91,7 +91,7 @@ def preprocess( images = [self.resize(image, size_divisor=size_divisor, resample=resample) for image in images] if do_rescale: - images = [self.rescale(image, scale=1/255) for image in images] + images = [self.rescale(image, scale=1 / 255) for image in images] images = [to_channel_dimension_format(image, data_format) for image in images] From 34b6b2fa64ef200192f03927997bb01bcb92122f Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Thu, 4 Aug 2022 09:22:42 +0100 Subject: [PATCH 27/31] Add docstring and model_input_names --- .../models/glpn/image_processing_glpn.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py index 6c13650c2a5f..1cdb455b5052 100644 --- a/src/transformers/models/glpn/image_processing_glpn.py +++ b/src/transformers/models/glpn/image_processing_glpn.py @@ -31,6 +31,25 @@ class GLPNImageProcessor(BaseImageProcessor): + r""" + Constructs a GLPN image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the input based on a certain `size_divisor`. + size_divisor (`int` or `Tuple[int]`, *optional*, defaults to 32): + Make sure the input is divisible by this value. Only has an effect if `do_resize` is set to `True`. + resample (`int`, *optional*, defaults to `PIL.Image.Resampling.BILINEAR`): + An optional resampling filter. This can be one of `PIL.Image.Resampling.NEAREST`, + `PIL.Image.Resampling.BOX`, `PIL.Image.Resampling.BILINEAR`, `PIL.Image.Resampling.HAMMING`, + `PIL.Image.Resampling.BICUBIC` or `PIL.Image.Resampling.LANCZOS`. 
Only has an effect if `do_resize` is set + to `True`. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). + """ + + model_input_names = ["pixel_values"] + def __init__( self, do_resize=True, do_rescale=True, size_divisor=32, resample=PIL.Image.Resampling.BILINEAR, **kwargs ) -> None: From 7a4d22a880dd7ae176138f7e829e875e2be6d7b2 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Mon, 8 Aug 2022 12:31:38 +0100 Subject: [PATCH 28/31] Fix up --- src/transformers/image_processing_utils.py | 3 ++- src/transformers/image_transforms.py | 7 ++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 3b8d380d53c4..74ed9c31c397 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -13,7 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from .feature_extraction_utils import FeatureExtractionMixin, BatchFeature as BaseBatchFeature +from .feature_extraction_utils import BatchFeature as BaseBatchFeature +from .feature_extraction_utils import FeatureExtractionMixin from .utils import logging diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 73fa4e5f6e8d..d09ef526084a 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -18,7 +18,7 @@ import numpy as np import PIL -from transformers.utils.import_utils import is_flax_available, is_tf_available, is_torch_available +from transformers.utils.import_utils import is_tf_available, is_torch_available from .image_utils import ( ChannelDimension, @@ -35,8 +35,6 @@ import torch if is_tf_available(): import tensorflow as tf - if is_flax_available(): - import jax.numpy as jnp def to_channel_dimension_format(image: np.ndarray, channel_dim: Union[ChannelDimension, str]) -> np.ndarray: @@ -188,8 +186,7 @@ def resize( data_format (`ChannelDimension`, *optional*, defaults to `None`): The channel dimension format of the output image. If `None`, will use the inferred format from the input. return_numpy (`bool`, *optional*, defaults to `True`): - Whether or not to return the resized image as a numpy array. If False a PIL.Image.Image object is - returned. + Whether or not to return the resized image as a numpy array. If False a PIL.Image.Image object is returned. Returns: image: A resized np.ndarray. 
From 790c2c6d997ae263f0813ac0e499edfc140c8cb4 Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Wed, 10 Aug 2022 15:41:47 +0100 Subject: [PATCH 29/31] Apply suggestions from code review Co-authored-by: Sylvain Gugger Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/image_transforms.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 315a6634fa3e..558b253addcd 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -73,13 +73,13 @@ def rescale( Rescales `image` by `scale`. Args: - image (`np.ndarray``): + image (`np.ndarray`): The image to rescale. - scale (`float`, `int`): + scale (`float` or `int`, *optional*, defaults to 255): The scale to use for rescaling the image. data_format (`ChannelDimension`, *optional*): The channel dimension format of the image. If not provided, it will be the same as the input image. - dtype (`np.dtype`, *optional*): + dtype (`np.dtype`, *optional*, defaults to `np.float32`): The dtype of the output image. 
Defaults to `np.float32`.Used for backwards compatibility with feature extractors From 2e929cfdba42dedf29aaa8c98b1c94e6f3c0a566 Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Fri, 12 Aug 2022 11:06:01 +0100 Subject: [PATCH 30/31] Update src/transformers/image_transforms.py Co-authored-by: Alara Dirik <8944735+alaradirik@users.noreply.github.com> --- src/transformers/image_transforms.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 558b253addcd..6b2ed5522b7e 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -80,8 +80,7 @@ def rescale( data_format (`ChannelDimension`, *optional*): The channel dimension format of the image. If not provided, it will be the same as the input image. dtype (`np.dtype`, *optional*, defaults to `np.float32`): - The dtype of the output image. Defaults to `np.float32`.Used for backwards compatibility with feature - extractors + The dtype of the output image. Defaults to `np.float32`. Used for backwards compatibility with feature extractors. Returns: image: A rescaled np.ndarray image. 
From ae358735685986862d410a269780cc7986aee86d Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Wed, 17 Aug 2022 13:08:48 +0100 Subject: [PATCH 31/31] Add in docstrings --- src/transformers/image_processing_utils.py | 3 +- src/transformers/image_transforms.py | 6 +- .../models/glpn/image_processing_glpn.py | 86 ++++++++++++++++--- 3 files changed, 79 insertions(+), 16 deletions(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index c81600511a0a..721fc86f0ec5 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -13,7 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .feature_extraction_utils import FeatureExtractionMixin, BatchFeature as BaseBatchFeature +from .feature_extraction_utils import BatchFeature as BaseBatchFeature +from .feature_extraction_utils import FeatureExtractionMixin from .utils import logging diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 6b2ed5522b7e..05112d0118f8 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -80,7 +80,8 @@ def rescale( data_format (`ChannelDimension`, *optional*): The channel dimension format of the image. If not provided, it will be the same as the input image. dtype (`np.dtype`, *optional*, defaults to `np.float32`): - The dtype of the output image. Defaults to `np.float32`. Used for backwards compatibility with feature extractors. + The dtype of the output image. Defaults to `np.float32`. Used for backwards compatibility with feature + extractors. Returns: image: A rescaled np.ndarray image. @@ -214,8 +215,7 @@ def resize( data_format (`ChannelDimension`, *optional*, defaults to `None`): The channel dimension format of the output image. If `None`, will use the inferred format from the input. 
return_numpy (`bool`, *optional*, defaults to `True`): - Whether or not to return the resized image as a numpy array. If False a PIL.Image.Image object is - returned. + Whether or not to return the resized image as a numpy array. If False a PIL.Image.Image object is returned. Returns: image: A resized np.ndarray. diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py index 1cdb455b5052..15605781d11a 100644 --- a/src/transformers/models/glpn/image_processing_glpn.py +++ b/src/transformers/models/glpn/image_processing_glpn.py @@ -14,7 +14,7 @@ # limitations under the License. """Image processor class for GLPN.""" -from typing import Optional, Union +from typing import List, Optional, Union import numpy as np import PIL.Image @@ -36,16 +36,16 @@ class GLPNImageProcessor(BaseImageProcessor): Args: do_resize (`bool`, *optional*, defaults to `True`): - Whether to resize the input based on certain `size_divisor`. - size_divisor (`int` or `Tuple(int)`, *optional*, defaults to 32): - Make sure the input is divisible by this value. Only has an effect if `do_resize` is set to `True`. - resample (`int`, *optional*, defaults to `PIL.Image.Resampling.BILINEAR`): - An optional resampling filter. This can be one of `PIL.Image.Resampling.NEAREST`, - `PIL.Image.Resampling.BOX`, `PIL.Image.Resampling.BILINEAR`, `PIL.Image.Resampling.HAMMING`, - `PIL.Image.Resampling.BICUBIC` or `PIL.Image.Resampling.LANCZOS`. Only has an effect if `do_resize` is set - to `True`. + Set the class default for the `do_resize` parameter. Controls whether to resize the image's (height, width) + dimensions, rounding them down to the closest multiple of `size_divisor`. do_rescale (`bool`, *optional*, defaults to `True`): - Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). + Set the class default for the `do_rescale` parameter. 
Controls whether or not to apply the scaling factor + (to make pixel values floats between 0. and 1.). + size_divisor (`int`, *optional*, defaults to 32): + Set the class default for the `size_divisor` parameter. When `do_resize` is `True`, images are resized so + their height and width are rounded down to the closest multiple of `size_divisor`. + resample (`PIL.Image.Resampling`, *optional*, defaults to `PIL.Image.Resampling.BILINEAR`): + Set the class default for `resample`. Defines the resampling filter to use if resizing the image. """ model_input_names = ["pixel_values"] @@ -62,12 +62,32 @@ def __init__( def resize( self, image: np.ndarray, - size_divisor: Union[int, float], + size_divisor: int, resample: PIL.Image.Resampling, data_format: Optional[ChannelDimension] = None, **kwargs ) -> np.ndarray: + """ + Resize the image, rounding the (height, width) dimensions down to the closest multiple of size_divisor. + + If the image is of dimension (3, 260, 170) and size_divisor is 32, the image will be resized to (3, 256, 160). + + Args: + image (`np.ndarray`): + The image to resize. + size_divisor (`int`): + The image is resized so its height and width are rounded down to the closest multiple of + `size_divisor`. + resample (`PIL.Image.Resampling`): + Resampling filter to use when resizing the image. + data_format (`ChannelDimension`, *optional*): + The channel dimension format for the output image. If `None`, the channel dimension format of the input + image is used. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. 
+ """ height, width = get_image_size(image) + # Rounds the height and width down to the closest multiple of size_divisor new_h = height // size_divisor * size_divisor new_w = width // size_divisor * size_divisor image = resize(image, (new_h, new_w), resample=resample, data_format=data_format, **kwargs) @@ -76,11 +96,25 @@ def resize( def rescale( self, image: np.ndarray, scale: Union[int, float], data_format: Optional[ChannelDimension] = None, **kwargs ) -> np.ndarray: + """ + Rescale the image by the given scaling factor `scale`. + + Args: + image (`np.ndarray`): + The image to rescale. + scale (`int` or `float`): + The scaling factor to rescale pixel values by. + data_format (`ChannelDimension`, *optional*): + The channel dimension format for the output image. If `None`, the channel dimension format of the input + image is used. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. + """ return rescale(image=image, scale=scale, data_format=data_format, **kwargs) def preprocess( self, - images, + images: Union["PIL.Image.Image", TensorType, List["PIL.Image.Image"], List[TensorType]], do_resize: bool = None, do_rescale: bool = None, size_divisor: int = None, @@ -89,6 +123,34 @@ def preprocess( data_format: ChannelDimension = ChannelDimension.FIRST, **kwargs ) -> BatchFeature: + """ + Preprocess the given images. + + Args: + images (`PIL.Image.Image` or `TensorType` or `List[np.ndarray]` or `List[TensorType]`): + The image or images to preprocess. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the input such that the (height, width) dimensions are a multiple of `size_divisor`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). 
+ size_divisor (`int`, *optional*, defaults to `self.size_divisor`): + When `do_resize` is `True`, images are resized so their height and width are rounded down to the + closest multiple of `size_divisor`. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PIL.Image.Resampling`, + Only has an effect if `do_resize` is set to `True`. + return_tensors (`str`, *optional*, defaults to `None`): + The type of tensors to return. Can be one of: + - `None`: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. + """ do_resize = do_resize if do_resize is not None else self.do_resize do_rescale = do_rescale if do_rescale is not None else self.do_rescale size_divisor = size_divisor if size_divisor is not None else self.size_divisor