diff --git a/docs/source/en/internal/image_processing_utils.mdx b/docs/source/en/internal/image_processing_utils.mdx
index 4d5831a12fd6..8bdf0ed11099 100644
--- a/docs/source/en/internal/image_processing_utils.mdx
+++ b/docs/source/en/internal/image_processing_utils.mdx
@@ -19,10 +19,13 @@ Most of those are only useful if you are studying the code of the image processo
 
 ## Image Transformations
 
-[[autodoc]] image_transforms.to_pil_image
+[[autodoc]] image_transforms.rescale
 
 [[autodoc]] image_transforms.resize
 
+[[autodoc]] image_transforms.to_pil_image
+
+
 ## ImageProcessorMixin
 
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index dc2a94572fe8..d58bc6f1847e 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -635,7 +635,7 @@
 ]
 else:
     _import_structure["image_processing_utils"] = ["ImageProcessorMixin"]
-    _import_structure["image_transforms"] = ["resize", "to_pil_image"]
+    _import_structure["image_transforms"] = ["rescale", "resize", "to_pil_image"]
     _import_structure["image_utils"] = ["ImageFeatureExtractionMixin"]
     _import_structure["models.beit"].append("BeitFeatureExtractor")
     _import_structure["models.clip"].append("CLIPFeatureExtractor")
@@ -3372,7 +3372,7 @@
     from .utils.dummy_vision_objects import *
 else:
     from .image_processing_utils import ImageProcessorMixin
-    from .image_transforms import resize, to_pil_image
+    from .image_transforms import rescale, resize, to_pil_image
     from .image_utils import ImageFeatureExtractionMixin
     from .models.beit import BeitFeatureExtractor
     from .models.clip import CLIPFeatureExtractor, CLIPProcessor
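With the export wiring above, `rescale` becomes importable at the top level alongside the existing `resize` and `to_pil_image` transforms. A minimal sketch, assuming a vision-enabled install; the toy array and target size are illustrative:

```python
import numpy as np

from transformers import rescale, resize, to_pil_image

# A channels-first uint8 test image.
img = np.random.randint(0, 256, (3, 224, 224), dtype=np.uint8)

# Scale pixel values from [0, 255] down to [0, 1] as float32.
scaled = rescale(img, scale=1 / 255)

# Resize to an explicit (height, width).
resized = resize(img, (192, 192))

# to_pil_image moves channels last and, since the values are floats,
# rescales back to [0, 255] before building the PIL image.
pil_img = to_pil_image(scaled)
```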
diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py
index 62a6178d1f80..721fc86f0ec5 100644
--- a/src/transformers/image_processing_utils.py
+++ b/src/transformers/image_processing_utils.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from .feature_extraction_utils import BatchFeature as BaseBatchFeature
 from .feature_extraction_utils import FeatureExtractionMixin
 from .utils import logging
 
@@ -20,6 +21,35 @@
 logger = logging.get_logger(__name__)
 
 
+# TODO: Move BatchFeature to be imported by both feature_extraction_utils and image_processing_utils
+# We override the class string here, but logic is the same.
+class BatchFeature(BaseBatchFeature):
+    r"""
+    Holds the output of the image processor specific `__call__` methods.
+
+    This class is derived from a python dictionary and can be used as a dictionary.
+
+    Args:
+        data (`dict`):
+            Dictionary of lists/arrays/tensors returned by the __call__/pad methods ('pixel_values', 'attention_mask',
+            etc.).
+        tensor_type (`Union[None, str, TensorType]`, *optional*):
+            You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at
+            initialization.
+    """
+
+
 # We use aliasing whilst we phase out the old API. Once feature extractors for vision models
 # are deprecated, ImageProcessorMixin will be implemented. Any shared logic will be abstracted out.
 ImageProcessorMixin = FeatureExtractionMixin
+
+
+class BaseImageProcessor(ImageProcessorMixin):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def __call__(self, images, **kwargs) -> BatchFeature:
+        return self.preprocess(images, **kwargs)
+
+    def preprocess(self, images, **kwargs) -> BatchFeature:
+        raise NotImplementedError("Each image processor must implement its own preprocess method")
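To make the `BaseImageProcessor` contract concrete, here is a hedged sketch of a minimal subclass. `ToyImageProcessor` and its single `do_rescale` option are invented for illustration and are not part of this diff:

```python
import numpy as np

from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
from transformers.image_transforms import rescale
from transformers.image_utils import is_batched, to_numpy_array


class ToyImageProcessor(BaseImageProcessor):
    """Hypothetical processor that only rescales pixel values to [0, 1]."""

    model_input_names = ["pixel_values"]

    def __init__(self, do_rescale=True, **kwargs):
        self.do_rescale = do_rescale
        super().__init__(**kwargs)

    def preprocess(self, images, return_tensors=None, **kwargs) -> BatchFeature:
        # Wrap a single image into a batch, mirroring the GLPN processor below.
        if not is_batched(images):
            images = [images]
        images = [to_numpy_array(img) for img in images]
        if self.do_rescale:
            images = [rescale(img, scale=1 / 255) for img in images]
        return BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
```

Calling `ToyImageProcessor()(image)` then routes through `__call__` into `preprocess` and returns a `BatchFeature`.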
diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py
index 1561b1c4662b..df17362bdc59 100644
--- a/src/transformers/image_transforms.py
+++ b/src/transformers/image_transforms.py
@@ -64,15 +64,42 @@ def to_channel_dimension_format(image: np.ndarray, channel_dim: Union[ChannelDim
     raise ValueError("Unsupported channel dimension format: {}".format(channel_dim))
 
 
+def rescale(
+    image: np.ndarray, scale: Union[float, int] = 255, data_format: Optional[ChannelDimension] = None, dtype=np.float32
+) -> np.ndarray:
+    """
+    Rescales `image` by `scale`.
+
+    Args:
+        image (`np.ndarray`):
+            The image to rescale.
+        scale (`float` or `int`, *optional*, defaults to 255):
+            The scale to use for rescaling the image.
+        data_format (`ChannelDimension`, *optional*):
+            The channel dimension format of the image. If not provided, it will be the same as the input image.
+        dtype (`np.dtype`, *optional*, defaults to `np.float32`):
+            The dtype of the output image. Defaults to `np.float32`. Used for backwards compatibility with feature
+            extractors.
+
+    Returns:
+        image: A rescaled np.ndarray image.
+    """
+    rescaled_image = image * scale
+    if data_format is not None:
+        rescaled_image = to_channel_dimension_format(rescaled_image, data_format)
+    rescaled_image = rescaled_image.astype(dtype)
+    return rescaled_image
+
+
 def to_pil_image(
-    image: Union[np.ndarray, PIL.Image.Image, "torch.Tensor", "tf.Tensor"], rescale=None
+    image: Union[np.ndarray, PIL.Image.Image, "torch.Tensor", "tf.Tensor", "jnp.ndarray"], do_rescale=None
 ) -> PIL.Image.Image:
     """
     Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if
     needed.
 
     Args:
-        image (`PIL.Image.Image` or `numpy.ndarray` or `torch.Tensor`):
+        image (`PIL.Image.Image`, `numpy.ndarray`, `torch.Tensor`, `tf.Tensor` or `jnp.ndarray`):
             The image to convert to the PIL Image format.
-        rescale (`bool`, *optional*):
+        do_rescale (`bool`, *optional*):
             Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will default
@@ -87,15 +114,15 @@ def to_pil_image(
     image = np.array(image)
 
     if not isinstance(image, np.ndarray):
-        raise ValueError("Input image must be of type PIL.Image.Image, numpy.ndarray or torch.Tensor")
+        raise ValueError("Input image type not supported: {}".format(type(image)))
 
     # If the channel has been moved to first dim, we put it back at the end.
     image = to_channel_dimension_format(image, ChannelDimension.LAST)
 
     # PIL.Image can only store uint8 values, so we rescale the image to be between 0 and 255 if needed.
-    rescale = isinstance(image.flat[0], float) if rescale is None else rescale
-    if rescale:
-        rescale = image * 255
+    do_rescale = isinstance(image.flat[0], float) if do_rescale is None else do_rescale
+    if do_rescale:
+        image = rescale(image, 255)
     image = image.astype(np.uint8)
     return PIL.Image.fromarray(image)
 
@@ -186,8 +213,7 @@ def resize(
         data_format (`ChannelDimension`, *optional*, defaults to `None`):
             The channel dimension format of the output image. If `None`, will use the inferred format from the input.
         return_numpy (`bool`, *optional*, defaults to `True`):
-            Whether or not to return the resized image as a numpy array. If False a PIL.Image.Image object is
-            returned.
+            Whether or not to return the resized image as a numpy array. If `False`, a `PIL.Image.Image` object is returned.
 
     Returns:
         image: A resized np.ndarray.
diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py
index 875e74444d36..5baf33329c43 100644
--- a/src/transformers/image_utils.py
+++ b/src/transformers/image_utils.py
@@ -29,7 +29,7 @@
     IMAGENET_STANDARD_MEAN,
     IMAGENET_STANDARD_STD,
 )
-from .utils.generic import ExplicitEnum, _is_jax, _is_tensorflow, _is_torch
+from .utils.generic import ExplicitEnum, _is_jax, _is_tensorflow, _is_torch, to_numpy
 
 
 ImageInput = Union[
@@ -37,6 +37,11 @@
 ]
 
 
+class ChannelDimension(ExplicitEnum):
+    FIRST = "channels_first"
+    LAST = "channels_last"
+
+
 def is_torch_tensor(obj):
     return _is_torch(obj) if is_torch_available() else False
 
@@ -49,9 +54,29 @@ def is_jax_tensor(obj):
     return _is_jax(obj) if is_flax_available() else False
 
 
-class ChannelDimension(ExplicitEnum):
-    FIRST = "channels_first"
-    LAST = "channels_last"
+def is_valid_image(img):
+    return (
+        isinstance(img, (PIL.Image.Image, np.ndarray))
+        or is_torch_tensor(img)
+        or is_tf_tensor(img)
+        or is_jax_tensor(img)
+    )
+
+
+def valid_images(imgs):
+    return all(is_valid_image(img) for img in imgs)
+
+
+def is_batched(img):
+    if isinstance(img, (list, tuple)):
+        return is_valid_image(img[0])
+    return False
+
+
+def to_numpy_array(img) -> np.ndarray:
+    if isinstance(img, PIL.Image.Image):
+        return np.array(img)
+    return to_numpy(img)
 
 
 def infer_channel_dimension_format(image: np.ndarray) -> ChannelDimension:
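The new validation helpers compose as below — a small sketch assuming the module paths land as in this diff:

```python
import numpy as np
import PIL.Image

from transformers.image_utils import is_batched, is_valid_image, to_numpy_array, valid_images

pil_img = PIL.Image.new("RGB", (64, 48))
np_img = np.zeros((48, 64, 3), dtype=np.uint8)

print(is_valid_image(pil_img))          # True: PIL images are accepted
print(is_batched(np_img))               # False: a bare array is a single image
print(is_batched([pil_img, np_img]))    # True: a list whose first item is a valid image
print(valid_images([pil_img, np_img]))  # True: every element checks out

arr = to_numpy_array(pil_img)           # PIL -> np.ndarray of shape (48, 64, 3)
print(arr.shape)
```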
diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py
new file mode 100644
index 000000000000..15605781d11a
--- /dev/null
+++ b/src/transformers/models/glpn/image_processing_glpn.py
@@ -0,0 +1,180 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for GLPN."""
+
+from typing import List, Optional, Union
+
+import numpy as np
+import PIL.Image
+
+from transformers.utils.generic import TensorType
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature
+from ...image_transforms import rescale, resize, to_channel_dimension_format
+from ...image_utils import ChannelDimension, get_image_size, is_batched, to_numpy_array, valid_images
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class GLPNImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a GLPN image processor.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Set the class default for the `do_resize` parameter. Controls whether to resize the image's (height, width)
+            dimensions, rounding them down to the closest multiple of `size_divisor`.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Set the class default for the `do_rescale` parameter. Controls whether or not to apply the scaling factor
+            (to make pixel values floats between 0. and 1.).
+        size_divisor (`int`, *optional*, defaults to 32):
+            Set the class default for the `size_divisor` parameter. When `do_resize` is `True`, images are resized so
+            their height and width are rounded down to the closest multiple of `size_divisor`.
+        resample (`PIL.Image.Resampling`, *optional*, defaults to `PIL.Image.Resampling.BILINEAR`):
+            Set the class default for `resample`. Defines the resampling filter to use if resizing the image.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self, do_resize=True, do_rescale=True, size_divisor=32, resample=PIL.Image.Resampling.BILINEAR, **kwargs
+    ) -> None:
+        self.do_resize = do_resize
+        self.do_rescale = do_rescale
+        self.size_divisor = size_divisor
+        self.resample = resample
+        super().__init__(**kwargs)
+
+    def resize(
+        self,
+        image: np.ndarray,
+        size_divisor: int,
+        resample: PIL.Image.Resampling,
+        data_format: Optional[ChannelDimension] = None,
+        **kwargs
+    ) -> np.ndarray:
+        """
+        Resize the image, rounding the (height, width) dimensions down to the closest multiple of size_divisor.
+
+        If the image is of dimension (3, 260, 170) and size_divisor is 32, the image will be resized to (3, 256, 160).
+
+        Args:
+            image (`np.ndarray`):
+                The image to resize.
+            size_divisor (`int`):
+                The image is resized so its height and width are rounded down to the closest multiple of
+                `size_divisor`.
+            resample (`PIL.Image.Resampling`):
+                Resampling filter to use when resizing the image.
+            data_format (`ChannelDimension`, *optional*):
+                The channel dimension format for the output image. If `None`, the channel dimension format of the input
+                image is used. Can be one of:
+                - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+        """
+        height, width = get_image_size(image)
+        # Rounds the height and width down to the closest multiple of size_divisor
+        new_h = height // size_divisor * size_divisor
+        new_w = width // size_divisor * size_divisor
+        image = resize(image, (new_h, new_w), resample=resample, data_format=data_format, **kwargs)
+        return image
+
+    def rescale(
+        self, image: np.ndarray, scale: Union[int, float], data_format: Optional[ChannelDimension] = None, **kwargs
+    ) -> np.ndarray:
+        """
+        Rescale the image by the given scaling factor `scale`.
+
+        Args:
+            image (`np.ndarray`):
+                The image to rescale.
+            scale (`int` or `float`):
+                The scaling factor to rescale pixel values by.
+            data_format (`ChannelDimension`, *optional*):
+                The channel dimension format for the output image. If `None`, the channel dimension format of the input
+                image is used. Can be one of:
+                - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+        """
+        return rescale(image=image, scale=scale, data_format=data_format, **kwargs)
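The floor-to-multiple arithmetic in `resize` above is plain integer division; reproducing the docstring's (260, 170) example:

```python
# Floor (height, width) to the nearest multiple of size_divisor, as GLPNImageProcessor.resize does.
size_divisor = 32
height, width = 260, 170
new_h = height // size_divisor * size_divisor  # 260 // 32 = 8; 8 * 32 = 256
new_w = width // size_divisor * size_divisor   # 170 // 32 = 5; 5 * 32 = 160
assert (new_h, new_w) == (256, 160)
```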
+    def preprocess(
+        self,
+        images: Union["PIL.Image.Image", TensorType, List["PIL.Image.Image"], List[TensorType]],
+        do_resize: bool = None,
+        do_rescale: bool = None,
+        size_divisor: int = None,
+        resample: PIL.Image.Resampling = None,
+        return_tensors: Optional[Union[TensorType, str]] = None,
+        data_format: ChannelDimension = ChannelDimension.FIRST,
+        **kwargs
+    ) -> BatchFeature:
+        """
+        Preprocess the given images.
+
+        Args:
+            images (`PIL.Image.Image` or `TensorType` or `List[np.ndarray]` or `List[TensorType]`):
+                The image or images to preprocess.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the input such that the (height, width) dimensions are a multiple of `size_divisor`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.).
+            size_divisor (`int`, *optional*, defaults to `self.size_divisor`):
+                When `do_resize` is `True`, images are resized so their height and width are rounded down to the
+                closest multiple of `size_divisor`.
+            resample (`PIL.Image.Resampling`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. This can be one of the `PIL.Image.Resampling` enum.
+                Only has an effect if `do_resize` is set to `True`.
+            return_tensors (`str`, *optional*, defaults to `None`):
+                The type of tensors to return. Can be one of:
+                - `None`: Return a list of `np.ndarray`.
+                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        size_divisor = size_divisor if size_divisor is not None else self.size_divisor
+        resample = resample if resample is not None else self.resample
+
+        if do_resize and size_divisor is None:
+            raise ValueError("size_divisor is required for resizing")
+
+        if not is_batched(images):
+            images = [images]
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image(s). Must be of type PIL.Image.Image, np.ndarray, torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(img) for img in images]
+
+        if do_resize:
+            images = [self.resize(image, size_divisor=size_divisor, resample=resample) for image in images]
+
+        if do_rescale:
+            images = [self.rescale(image, scale=1 / 255) for image in images]
+
+        images = [to_channel_dimension_format(image, data_format) for image in images]
+
+        data = {"pixel_values": images}
+        return BatchFeature(data=data, tensor_type=return_tensors)
diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py
index 359b434253a6..6b00f3b3d76d 100644
--- a/src/transformers/utils/dummy_vision_objects.py
+++ b/src/transformers/utils/dummy_vision_objects.py
@@ -10,6 +10,10 @@
     def __init__(self, *args, **kwargs):
         requires_backends(self, ["vision"])
 
 
+def rescale(*args, **kwargs):
+    requires_backends(rescale, ["vision"])
+
+
 def resize(*args, **kwargs):
     requires_backends(resize, ["vision"])
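End to end, the new processor can be exercised as follows — a sketch, assuming the module path lands as in this diff; the random test image is illustrative:

```python
import numpy as np
import PIL.Image

from transformers.models.glpn.image_processing_glpn import GLPNImageProcessor

# A 170x260 RGB test image (width x height); with size_divisor=32 the
# (height, width) of (260, 170) is floored to (256, 160).
image = PIL.Image.fromarray(np.random.randint(0, 256, (260, 170, 3), dtype=np.uint8))

processor = GLPNImageProcessor()  # do_resize=True, do_rescale=True, size_divisor=32
encoding = processor(image, return_tensors="np")

# Channels-first output, dimensions floored to multiples of 32, pixel values in [0, 1].
print(encoding["pixel_values"].shape)  # (1, 3, 256, 160)
```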