From 54aed8b374dd0766abe0dc762d4bbe543afdcb4e Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Wed, 27 Jul 2022 16:12:56 +0100 Subject: [PATCH 01/31] Base processor skeleton --- src/transformers/image_processing_utils.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index e053f4f486a7..908216cd4634 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -424,3 +424,14 @@ def register_for_auto_class(cls, auto_class="AutoImageProcessor"): ImageProcessorMixin.push_to_hub.__doc__ = ImageProcessorMixin.push_to_hub.__doc__.format( object="image processor", object_class="AutoImageProcessor", object_files="image processor file" ) + + +class BaseImageProcessor(ImageProcessorMixin): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def __call__(self, *args, **kwargs): + return self.preprocess(*args, **kwargs) + + def preprocess(self, *args, **kwargs): + raise NotImplementedError("Each image processor must implement its own preprocess method") From ba55c8996ae83e3f066ac9cd9e557e9ad66e1a74 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Wed, 27 Jul 2022 17:30:59 +0100 Subject: [PATCH 02/31] BatchFeature for packaging image processor outputs --- src/transformers/image_processing_utils.py | 164 ++++++++++++++++++++- 1 file changed, 159 insertions(+), 5 deletions(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 908216cd4634..32477c7f4f77 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -16,30 +16,184 @@ import copy import os import json -from typing import Any, Dict, Tuple, Union +from collections import UserDict +from typing import Any, Dict, Optional, Tuple, Union import numpy as np from requests import HTTPError from .dynamic_module_utils import custom_object_save from .utils import ( - 
IMAGE_PROCESSOR_NAME, - PushToHubMixin, - logging, HUGGINGFACE_CO_RESOLVE_ENDPOINT, + IMAGE_PROCESSOR_NAME, EntryNotFoundError, + PushToHubMixin, RepositoryNotFoundError, RevisionNotFoundError, + TensorType, cached_path, copy_func, hf_bucket_url, + is_flax_available, is_remote_url, - is_offline_mode + is_torch_available, + is_tf_available, + is_offline_mode, + logging, + torch_required, ) +from .utils.generic import _is_jax, _is_numpy, _is_torch_device logger = logging.get_logger(__name__) +class BatchFeature(UserDict): + r""" + Holds the output of the image processor specific `__call__` methods. + + This class is derived from a python dictionary and can be used as a dictionary. + + Args: + data (`dict`): + Dictionary of lists/arrays/tensors returned by the __call__/pad methods ('input_values', 'attention_mask', + etc.). + tensor_type (`Union[None, str, TensorType]`, *optional*): + You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at + initialization. + """ + + def __init__(self, data: Optional[Dict[str, Any]] = None, tensor_type: Union[None, str, TensorType] = None): + super().__init__(data) + self.convert_to_tensors(tensor_type=tensor_type) + + # Copied from transformers.tokenization_utils_base.BatchEncoding.__getitem__ + def __getitem__(self, item: str) -> Any: + """ + If the key is a string, returns the value of the dict associated to `key` ('input_values', 'attention_mask', + etc.). 
+ """ + if isinstance(item, str): + return self.data[item] + else: + raise KeyError("Indexing with integers is not available when using Python based feature extractors") + + # Copied from transformers.tokenization_utils_base.BatchEncoding.__getattr__ + def __getattr__(self, item: str): + try: + return self.data[item] + except KeyError: + raise AttributeError + + # Copied from transformers.feature_extraction_utils.BatchFeature.__getstate__ + def __getstate__(self): + return {"data": self.data} + + # Copied from transformers.feature_extraction_utils.BatchFeature.__setstate__ + def __setstate__(self, state): + if "data" in state: + self.data = state["data"] + + # Copied from transformers.tokenization_utils_base.BatchEncoding.keys + def keys(self): + return self.data.keys() + + # Copied from transformers.tokenization_utils_base.BatchEncoding.values + def values(self): + return self.data.values() + + # Copied from transformers.tokenization_utils_base.BatchEncoding.items + def items(self): + return self.data.items() + + # Copied from transformers.feature_extraction_utils.BatchFeature.convert_to_tensors + def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None): + """ + Convert the inner content to tensors. + + Args: + tensor_type (`str` or [`~utils.TensorType`], *optional*): + The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.TensorType`]. If + `None`, no modification is done. + """ + if tensor_type is None: + return self + + # Convert to TensorType + if not isinstance(tensor_type, TensorType): + tensor_type = TensorType(tensor_type) + + # Get a function reference for the correct framework + if tensor_type == TensorType.TENSORFLOW: + if not is_tf_available(): + raise ImportError( + "Unable to convert output to TensorFlow tensors format, TensorFlow is not installed." 
+ ) + import tensorflow as tf + + as_tensor = tf.constant + is_tensor = tf.is_tensor + elif tensor_type == TensorType.PYTORCH: + if not is_torch_available(): + raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.") + import torch + + def as_tensor(value): + if isinstance(value, (list, tuple)) and len(value) > 0 and isinstance(value[0], np.ndarray): + value = np.array(value) + return torch.tensor(value) + + is_tensor = torch.is_tensor + elif tensor_type == TensorType.JAX: + if not is_flax_available(): + raise ImportError("Unable to convert output to JAX tensors format, JAX is not installed.") + import jax.numpy as jnp # noqa: F811 + + as_tensor = jnp.array + is_tensor = _is_jax + else: + as_tensor = np.asarray + is_tensor = _is_numpy + + # Do the tensor conversion in batch + for key, value in self.items(): + try: + if not is_tensor(value): + tensor = as_tensor(value) + + self[key] = tensor + except: # noqa E722 + if key == "overflowing_values": + raise ValueError("Unable to create tensor returning overflowing values of different lengths. ") + raise ValueError( + "Unable to create tensor, you should probably activate padding " + "with 'padding=True' to have batched tensors with the same length." + ) + + return self + + @torch_required + # Copied from transformers.tokenization_utils_base.BatchEncoding.to with BatchEncoding->BatchFeature + def to(self, device: Union[str, "torch.device"]) -> "BatchFeature": + """ + Send all values to device by calling `v.to(device)` (PyTorch only). + + Args: + device (`str` or `torch.device`): The device to put the tensors on. + + Returns: + [`BatchFeature`]: The same instance after modification. 
+ """ + + # This check catches things like APEX blindly calling "to" on all inputs to a module + # Otherwise it passes the casts down and casts the LongTensor containing the token idxs + # into a HalfTensor + if isinstance(device, str) or _is_torch_device(device) or isinstance(device, int): + self.data = {k: v.to(device=device) for k, v in self.data.items()} + else: + logger.warning(f"Attempting to cast a BatchFeature to type {str(device)}. This is not supported.") + return self + + class ImageProcessorMixin(PushToHubMixin): """ Image processor mixin used to provide saving/loading functionality From 4b430d4de00b1ddb21882eaddce5e2f5c61842f7 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Wed, 27 Jul 2022 19:38:50 +0100 Subject: [PATCH 03/31] Initial image processor for GLPN --- src/transformers/image_processing_utils.py | 24 +++--- src/transformers/image_transforms.py | 26 +++++-- src/transformers/image_utils.py | 61 +++++++++++++-- .../models/glpn/image_processing_glpn.py | 76 +++++++++++++++++++ 4 files changed, 165 insertions(+), 22 deletions(-) create mode 100644 src/transformers/models/glpn/image_processing_glpn.py diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 32477c7f4f77..2f1377b3773f 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -23,6 +23,7 @@ from requests import HTTPError from .dynamic_module_utils import custom_object_save +from .image_utils import ImageType from .utils import ( HUGGINGFACE_CO_RESOLVE_ENDPOINT, IMAGE_PROCESSOR_NAME, @@ -105,25 +106,24 @@ def values(self): def items(self): return self.data.items() - # Copied from transformers.feature_extraction_utils.BatchFeature.convert_to_tensors - def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None): + def convert_to_tensors(self, tensor_type: Optional[Union[str, ImageType]] = None): """ Convert the inner content to tensors. 
Args: - tensor_type (`str` or [`~utils.TensorType`], *optional*): - The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.TensorType`]. If + tensor_type (`str` or [`~utils.ImageType`], *optional*): + The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.ImageType`]. If `None`, no modification is done. """ if tensor_type is None: return self # Convert to TensorType - if not isinstance(tensor_type, TensorType): - tensor_type = TensorType(tensor_type) + if not isinstance(tensor_type, ImageType): + tensor_type = ImageType(tensor_type) # Get a function reference for the correct framework - if tensor_type == TensorType.TENSORFLOW: + if tensor_type == ImageType.TENSORFLOW: if not is_tf_available(): raise ImportError( "Unable to convert output to TensorFlow tensors format, TensorFlow is not installed." @@ -132,7 +132,7 @@ def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = Non as_tensor = tf.constant is_tensor = tf.is_tensor - elif tensor_type == TensorType.PYTORCH: + elif tensor_type == ImageType.PYTORCH: if not is_torch_available(): raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.") import torch @@ -143,7 +143,7 @@ def as_tensor(value): return torch.tensor(value) is_tensor = torch.is_tensor - elif tensor_type == TensorType.JAX: + elif tensor_type == ImageType.JAX: if not is_flax_available(): raise ImportError("Unable to convert output to JAX tensors format, JAX is not installed.") import jax.numpy as jnp # noqa: F811 @@ -584,8 +584,8 @@ class BaseImageProcessor(ImageProcessorMixin): def __init__(self, **kwargs): super().__init__(**kwargs) - def __call__(self, *args, **kwargs): - return self.preprocess(*args, **kwargs) + def __call__(self, images, **kwargs) -> BatchFeature: + return self.preprocess(images, **kwargs) - def preprocess(self, *args, **kwargs): + def preprocess(self, images, **kwargs) -> BatchFeature: raise 
NotImplementedError("Each image processor must implement its own preprocess method") diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 7585aff76432..55d788d47531 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -28,10 +28,26 @@ ) +def rescale(image: np.ndarray, scale: Union[float, int] = 255) -> np.ndarray: + """ + Rescales `image` by `scale`. + + Args: + image (`np.ndarray``): + The image to rescale. + scale (`float`, `int`): + The scale to use for rescaling the image. + + Returns: + image: A rescaled np.ndarray image. + """ + return image * scale + + def to_pil_image( image: Union[np.ndarray, PIL.Image.Image, "torch.Tensor", "tf.Tensor"], channel_dim: Optional[ChannelDimension] = None, - rescale=None + do_rescale: Optional[bool] = None ) -> PIL.Image.Image: """ Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if @@ -61,9 +77,9 @@ def to_pil_image( image = image.transpose((1, 2, 0)) # PIL.Image can only store uint8 values, so we rescale the image to be between 0 and 255 if needed. - rescale = isinstance(image.flat[0], float) if rescale is None else rescale - if rescale: - rescale = image * 255 + do_rescale = isinstance(image.flat[0], float) if do_rescale is None else do_rescale + if do_rescale: + image = rescale(image, 255) image = image.astype(np.uint8) return PIL.Image.fromarray(image) @@ -107,7 +123,7 @@ def get_resize_output_image_size( return (new_short, new_long) if width <= height else (new_long, new_short) -def resize(image, size: Tuple[int, int], resample=PIL.Image.BILINEAR): +def resize(image, size: Tuple[int, int], resample=PIL.Image.Resampling.BILINEAR): """ Resizes `image`. Enforces conversion of input to PIL.Image. 
diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index e23321be4780..e5afb3122bcd 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -23,8 +23,8 @@ import requests -from .utils import is_torch_available, is_tf_available, is_jax_available -from .utils.generic import _is_torch, _is_tensorflow, _is_jax +from .utils import TensorType, is_torch_available, is_tf_available, is_jax_available +from .utils.generic import ExplicitEnum, _is_torch, _is_tensorflow, _is_jax, _is_numpy, to_numpy IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406] @@ -36,6 +36,21 @@ PIL.Image.Image, np.ndarray, "torch.Tensor", List[PIL.Image.Image], List[np.ndarray], List["torch.Tensor"] # noqa ] +class ChannelDimension(enum.Enum): + FIRST = 1 + LAST = 3 + + +class ImageType(ExplicitEnum): + """ + Possible image data formats that can be fed into an image processor + """ + PYTORCH = "pt" + TENSORFLOW = "tf" + NUMPY = "np" + JAX = "jax" + PIL = "pillow" + def is_torch_tensor(obj): return _is_torch(obj) if is_torch_available() else False @@ -49,9 +64,45 @@ def is_jax_tensor(obj): return _is_jax(obj) if is_jax_available() else False -class ChannelDimension(enum.Enum): - FIRST = 1 - LAST = 3 +def is_valid_image(img): + return ( + isinstance(img, (PIL.Image.Image, np.ndarray)) + or is_torch_tensor(img) + or is_tf_tensor(img) + or is_jax_tensor(img) + ) + + +def valid_images(imgs): + return all(is_valid_image(img) for img in imgs) + + +def is_batched(img): + if isinstance(img, (list, tuple)): + return is_valid_image(img[0]) + return False + + +def get_image_type(obj) -> TensorType: + if is_torch_tensor(obj): + return TensorType.TORCH + elif is_tf_tensor(obj): + return TensorType.TF + elif is_jax_tensor(obj): + return TensorType.JAX + elif _is_numpy(obj): + return TensorType.NUMPY + elif isinstance(obj, PIL.Image.Image): + return TensorType.PIL + else: + raise ValueError("Could not infer tensor type") + + +def to_numpy_array(img) -> np.ndarray: + 
input_type = get_image_type(img) + if input_type == ImageType.PIL: + return np.array(img) + return to_numpy(img) def infer_channel_dimension(image: np.ndarray) -> ChannelDimension: diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py new file mode 100644 index 000000000000..9942845e5e45 --- /dev/null +++ b/src/transformers/models/glpn/image_processing_glpn.py @@ -0,0 +1,76 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Image processor class for GLPN.""" + +from tkinter import Image +from typing import Union + +from numpy import np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature +from ...image_transforms import resize, rescale +from ...image_utils import ImageType, is_batched, to_numpy_array, valid_images, get_image_size +from ...utils import logging + +logger = logging.get_logger(__name__) + + +class GLPNImageProcessor(BaseImageProcessor): + def __init__(self, do_resize=True, do_rescale=True, size_divisor=32, resample=Image.Resampling.BILINEAR, **kwargs) -> None: + self.do_resize = do_resize + self.do_rescale = do_rescale + self.size_divisor = size_divisor + self.resample = resample + super().__init__(**kwargs) + + def resize(self, image: np.ndarray, size_divisor: Union[int, float], resample: Image.Resampling, **kwargs) -> np.ndarray: + height, width = get_image_size(image) + new_h = height // size_divisor * size_divisor + new_w = width // size_divisor * size_divisor + image = resize(image, (new_h, new_w), resample=resample, **kwargs) + return image + + def rescale(self, image: np.ndarray, scale: Union[int, float], **kwargs) -> np.ndarray: + return rescale(image, scale, **kwargs) + + def preprocess(self, images, do_resize=None, do_rescale=None, size_divisor=None, resample=None, return_tensors=None, **kwargs) -> BatchFeature: + do_resize = do_resize if do_resize is not None else self.do_resize + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + size_divisor = size_divisor if size_divisor is not None else self.size_divisor + resample = resample if resample is not None else self.resample + + # If a return type isn't specified, default to numpy arrays. 
+ return_tensors = ImageType.NUMPY if return_tensors is None else return_tensors + + if do_resize and size_divisor is None: + raise ValueError("size_divisor is required for resizing") + + if not is_batched(images): + images = [images] + + if not valid_images(images): + raise ValueError("Invalid image(s)") + + # All transformations expect numpy arrays. + images = [to_numpy_array(img) for img in images] + + if do_resize: + images = [self.resize(image, size_divisor=size_divisor, resample=resample) for image in images] + + if do_rescale: + images = [self.rescale(image) for image in images] + + data = {"pixel_values": images} + return BatchFeature(**data, return_tensors=return_tensors) From b1c8b59fbe165d8a3f542c280fe65486eef0c2ad Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Wed, 27 Jul 2022 20:02:51 +0100 Subject: [PATCH 04/31] REmove accidental import --- src/transformers/models/glpn/image_processing_glpn.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py index 9942845e5e45..bdc33e80ce9f 100644 --- a/src/transformers/models/glpn/image_processing_glpn.py +++ b/src/transformers/models/glpn/image_processing_glpn.py @@ -14,9 +14,9 @@ # limitations under the License. 
"""Image processor class for GLPN.""" -from tkinter import Image from typing import Union +import PIL.Image from numpy import np from ...image_processing_utils import BaseImageProcessor, BatchFeature @@ -28,14 +28,14 @@ class GLPNImageProcessor(BaseImageProcessor): - def __init__(self, do_resize=True, do_rescale=True, size_divisor=32, resample=Image.Resampling.BILINEAR, **kwargs) -> None: + def __init__(self, do_resize=True, do_rescale=True, size_divisor=32, resample=PIL.Image.Resampling.BILINEAR, **kwargs) -> None: self.do_resize = do_resize self.do_rescale = do_rescale self.size_divisor = size_divisor self.resample = resample super().__init__(**kwargs) - def resize(self, image: np.ndarray, size_divisor: Union[int, float], resample: Image.Resampling, **kwargs) -> np.ndarray: + def resize(self, image: np.ndarray, size_divisor: Union[int, float], resample: PIL.Image.Resampling, **kwargs) -> np.ndarray: height, width = get_image_size(image) new_h = height // size_divisor * size_divisor new_w = width // size_divisor * size_divisor From b9ce4a00399e5d177c1db07bab8ba47595abd12f Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 14:59:34 +0100 Subject: [PATCH 05/31] Import BatchFeature from feature_extraction_utils --- src/transformers/image_processing_utils.py | 173 +++------------------ src/transformers/image_transforms.py | 29 ++-- src/transformers/image_utils.py | 6 +- 3 files changed, 42 insertions(+), 166 deletions(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 32477c7f4f77..0e8b02c56b52 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -14,15 +14,16 @@ # limitations under the License. 
import copy -import os import json -from collections import UserDict -from typing import Any, Dict, Optional, Tuple, Union +import os +from typing import Any, Dict, Tuple, Union import numpy as np + from requests import HTTPError from .dynamic_module_utils import custom_object_save +from .feature_extraction_utils import BatchFeature as BaseBatchFeature from .utils import ( HUGGINGFACE_CO_RESOLVE_ENDPOINT, IMAGE_PROCESSOR_NAME, @@ -30,24 +31,21 @@ PushToHubMixin, RepositoryNotFoundError, RevisionNotFoundError, - TensorType, cached_path, copy_func, hf_bucket_url, - is_flax_available, - is_remote_url, - is_torch_available, - is_tf_available, is_offline_mode, + is_remote_url, logging, - torch_required, ) -from .utils.generic import _is_jax, _is_numpy, _is_torch_device + logger = logging.get_logger(__name__) -class BatchFeature(UserDict): +# TODO: Move BatchFeature to be imported by both feature_extraction_utils and image_processing_utils +# We override the class string here, but logic is the same. +class BatchFeature(BaseBatchFeature): r""" Holds the output of the image processor specific `__call__` methods. @@ -55,144 +53,13 @@ class BatchFeature(UserDict): Args: data (`dict`): - Dictionary of lists/arrays/tensors returned by the __call__/pad methods ('input_values', 'attention_mask', + Dictionary of lists/arrays/tensors returned by the __call__/pad methods ('pixel_values', 'attention_mask', etc.). tensor_type (`Union[None, str, TensorType]`, *optional*): You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at initialization. 
""" - def __init__(self, data: Optional[Dict[str, Any]] = None, tensor_type: Union[None, str, TensorType] = None): - super().__init__(data) - self.convert_to_tensors(tensor_type=tensor_type) - - # Copied from transformers.tokenization_utils_base.BatchEncoding.__getitem__ - def __getitem__(self, item: str) -> Any: - """ - If the key is a string, returns the value of the dict associated to `key` ('input_values', 'attention_mask', - etc.). - """ - if isinstance(item, str): - return self.data[item] - else: - raise KeyError("Indexing with integers is not available when using Python based feature extractors") - - # Copied from transformers.tokenization_utils_base.BatchEncoding.__getattr__ - def __getattr__(self, item: str): - try: - return self.data[item] - except KeyError: - raise AttributeError - - # Copied from transformers.feature_extraction_utils.BatchFeature.__getstate__ - def __getstate__(self): - return {"data": self.data} - - # Copied from transformers.feature_extraction_utils.BatchFeature.__setstate__ - def __setstate__(self, state): - if "data" in state: - self.data = state["data"] - - # Copied from transformers.tokenization_utils_base.BatchEncoding.keys - def keys(self): - return self.data.keys() - - # Copied from transformers.tokenization_utils_base.BatchEncoding.values - def values(self): - return self.data.values() - - # Copied from transformers.tokenization_utils_base.BatchEncoding.items - def items(self): - return self.data.items() - - # Copied from transformers.feature_extraction_utils.BatchFeature.convert_to_tensors - def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None): - """ - Convert the inner content to tensors. - - Args: - tensor_type (`str` or [`~utils.TensorType`], *optional*): - The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.TensorType`]. If - `None`, no modification is done. 
- """ - if tensor_type is None: - return self - - # Convert to TensorType - if not isinstance(tensor_type, TensorType): - tensor_type = TensorType(tensor_type) - - # Get a function reference for the correct framework - if tensor_type == TensorType.TENSORFLOW: - if not is_tf_available(): - raise ImportError( - "Unable to convert output to TensorFlow tensors format, TensorFlow is not installed." - ) - import tensorflow as tf - - as_tensor = tf.constant - is_tensor = tf.is_tensor - elif tensor_type == TensorType.PYTORCH: - if not is_torch_available(): - raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.") - import torch - - def as_tensor(value): - if isinstance(value, (list, tuple)) and len(value) > 0 and isinstance(value[0], np.ndarray): - value = np.array(value) - return torch.tensor(value) - - is_tensor = torch.is_tensor - elif tensor_type == TensorType.JAX: - if not is_flax_available(): - raise ImportError("Unable to convert output to JAX tensors format, JAX is not installed.") - import jax.numpy as jnp # noqa: F811 - - as_tensor = jnp.array - is_tensor = _is_jax - else: - as_tensor = np.asarray - is_tensor = _is_numpy - - # Do the tensor conversion in batch - for key, value in self.items(): - try: - if not is_tensor(value): - tensor = as_tensor(value) - - self[key] = tensor - except: # noqa E722 - if key == "overflowing_values": - raise ValueError("Unable to create tensor returning overflowing values of different lengths. ") - raise ValueError( - "Unable to create tensor, you should probably activate padding " - "with 'padding=True' to have batched tensors with the same length." - ) - - return self - - @torch_required - # Copied from transformers.tokenization_utils_base.BatchEncoding.to with BatchEncoding->BatchFeature - def to(self, device: Union[str, "torch.device"]) -> "BatchFeature": - """ - Send all values to device by calling `v.to(device)` (PyTorch only). 
- - Args: - device (`str` or `torch.device`): The device to put the tensors on. - - Returns: - [`BatchFeature`]: The same instance after modification. - """ - - # This check catches things like APEX blindly calling "to" on all inputs to a module - # Otherwise it passes the casts down and casts the LongTensor containing the token idxs - # into a HalfTensor - if isinstance(device, str) or _is_torch_device(device) or isinstance(device, int): - self.data = {k: v.to(device=device) for k, v in self.data.items()} - else: - logger.warning(f"Attempting to cast a BatchFeature to type {str(device)}. This is not supported.") - return self - class ImageProcessorMixin(PushToHubMixin): """ @@ -218,12 +85,10 @@ def _set_processor_class(self, processor_class: str): self._processor_class = processor_class @classmethod - def from_pretrained( - cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs - ): + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs): r""" - Instantiate a type of [`~image_processing_utils.ImageProcessorMixin`] from a image processor, *e.g.* a - derived class of [`BaseImageProcessor`]. + Instantiate a type of [`~image_processing_utils.ImageProcessorMixin`] from a image processor, *e.g.* a derived + class of [`BaseImageProcessor`]. Args: pretrained_model_name_or_path (`str` or `os.PathLike`): @@ -241,8 +106,8 @@ def from_pretrained( Path to a directory in which a downloaded pretrained model image processor should be cached if the standard cache should not be used. force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force to (re-)download the image processor files and override the cached versions - if they exist. + Whether or not to force to (re-)download the image processor files and override the cached versions if + they exist. resume_download (`bool`, *optional*, defaults to `False`): Whether or not to delete incompletely received file. 
Attempts to resume the download if such a file exists. @@ -496,16 +361,16 @@ def to_dict(self) -> Dict[str, Any]: @classmethod def from_json_file(cls, json_file: Union[str, os.PathLike]): """ - Instantiates an image processor of type [`~image_processing_utils.ImageProcessorMixin`] from the path to - a JSON file of parameters. + Instantiates an image processor of type [`~image_processing_utils.ImageProcessorMixin`] from the path to a JSON + file of parameters. Args: json_file (`str` or `os.PathLike`): Path to the JSON file containing the parameters. Returns: - A image processor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The image_processor - object instantiated from that JSON file. + A image processor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The image_processor object + instantiated from that JSON file. """ with open(json_file, "r", encoding="utf-8") as reader: text = reader.read() diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 7585aff76432..7551a431b4ba 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -13,32 +13,43 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Optional, Tuple, List, Union +from typing import TYPE_CHECKING, List, Optional, Tuple, Union -import PIL import numpy as np +import PIL + +from transformers.utils.import_utils import is_flax_available, is_tf_available, is_torch_available from .image_utils import ( ChannelDimension, get_image_size, infer_channel_dimension, - is_torch_tensor, + is_jax_tensor, is_tf_tensor, - is_jax_tensor + is_torch_tensor, ) +if TYPE_CHECKING: + if is_torch_available(): + import torch + if is_tf_available(): + import tensorflow as tf + if is_flax_available(): + import jax.numpy as jnp + + def to_pil_image( - image: Union[np.ndarray, PIL.Image.Image, "torch.Tensor", "tf.Tensor"], + image: Union[np.ndarray, PIL.Image.Image, "torch.Tensor", "tf.Tensor", "jnp.ndarray"], channel_dim: Optional[ChannelDimension] = None, - rescale=None + rescale=None, ) -> PIL.Image.Image: """ Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if needed. Args: - image (`PIL.Image.Image` or `numpy.ndarray` or `torch.Tensor`): + image (`PIL.Image.Image`, `numpy.ndarray`, `torch.Tensor`, `tf.Tensor`): The image to convert to the PIL Image format. rescale (`bool`, *optional*): Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will default @@ -53,7 +64,7 @@ def to_pil_image( image = np.array(image) if not isinstance(image, np.ndarray): - raise ValueError("Input image must be of type PIL.Image.Image, numpy.ndarray or torch.Tensor") + raise ValueError("Input image type not supported: {}".format(type(image))) # If the channel as been moved to first dim, we put it back at the end. 
channel_dim = infer_channel_dimension(image) if channel_dim is None else channel_dim @@ -72,7 +83,7 @@ def get_resize_output_image_size( input_image: np.ndarray, size: Union[int, Tuple[int, int], List[int]], default_to_square: bool = True, - max_size: int = None + max_size: int = None, ) -> np.ndarray: if isinstance(size, (tuple, list)): if len(size) == 2: diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index e23321be4780..15bcf9954261 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -23,8 +23,8 @@ import requests -from .utils import is_torch_available, is_tf_available, is_jax_available -from .utils.generic import _is_torch, _is_tensorflow, _is_jax +from .utils import is_flax_available, is_tf_available, is_torch_available +from .utils.generic import _is_jax, _is_tensorflow, _is_torch IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406] @@ -46,7 +46,7 @@ def is_tf_tensor(obj): def is_jax_tensor(obj): - return _is_jax(obj) if is_jax_available() else False + return _is_jax(obj) if is_flax_available() else False class ChannelDimension(enum.Enum): From 6b678fb4397a5dc2ddffcf93271dd7c534a964ca Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 13:18:18 +0100 Subject: [PATCH 06/31] Fixup and docs --- .../en/internal/image_processing_utils.mdx | 24 +++++++++ src/transformers/image_transforms.py | 50 ++++++++++++++++--- src/transformers/image_utils.py | 6 +-- 3 files changed, 70 insertions(+), 10 deletions(-) create mode 100644 docs/source/en/internal/image_processing_utils.mdx diff --git a/docs/source/en/internal/image_processing_utils.mdx b/docs/source/en/internal/image_processing_utils.mdx new file mode 100644 index 000000000000..ae4f826517aa --- /dev/null +++ b/docs/source/en/internal/image_processing_utils.mdx @@ -0,0 +1,24 @@ + + +# Utilities for Image Processors + +This page lists all the utility functions used by the image processors, mainly the functional +transformations used to process 
the images. + +Most of those are only useful if you are studying the code of the image processors in the library. + +## Image Transformations + +[[autodoc]] image_transforms.to_pil_image + +[[autodoc]] image_transforms.resize diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 7585aff76432..9112506e8ebf 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -13,25 +13,34 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional, Tuple, List, Union +from typing import TYPE_CHECKING, List, Optional, Tuple, Union -import PIL import numpy as np +import PIL + +from transformers.utils.import_utils import is_tf_available, is_torch_available from .image_utils import ( ChannelDimension, get_image_size, infer_channel_dimension, - is_torch_tensor, + is_jax_tensor, is_tf_tensor, - is_jax_tensor + is_torch_tensor, ) +if TYPE_CHECKING: + if is_torch_available(): + import torch + if is_tf_available(): + import tensorflow as tf + + def to_pil_image( image: Union[np.ndarray, PIL.Image.Image, "torch.Tensor", "tf.Tensor"], channel_dim: Optional[ChannelDimension] = None, - rescale=None + rescale=None, ) -> PIL.Image.Image: """ Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if @@ -70,10 +79,37 @@ def to_pil_image( def get_resize_output_image_size( input_image: np.ndarray, - size: Union[int, Tuple[int, int], List[int]], + size: Union[int, Tuple[int, int], List[int], Tuple[int]], default_to_square: bool = True, - max_size: int = None + max_size: int = None, ) -> np.ndarray: + """ + Find the target (height, width) dimension of the output image after resizing given the input image and the desired + size. + + Args: + input_image (`np.ndarray`): + The image to resize. + size (`int` or `Tuple[int, int]` or List[int] or Tuple[int]): + The size to use for resizing the image. 
If `size` is a sequence like (h, w), output size will be matched to + this. + + If `size` is an int and `default_to_square` is `True`, then image will be resized to (size, size). If + `size` is an int and `default_to_square` is `False`, then smaller edge of the image will be matched to this + number. i.e, if height > width, then image will be rescaled to (size * height / width, size). + resample (`int`, *optional*, defaults to `PIL.Image.BILINEAR`): + The filter to user for resampling. + default_to_square (`bool`, *optional*, defaults to `True`): + How to convert `size` when it is a single int. If set to `True`, the `size` will be converted to a square + (`size`,`size`). If set to `False`, will replicate + [`torchvision.transforms.Resize`](https://pytorch.org/vision/stable/transforms.html#torchvision.transforms.Resize) + with support for resizing only the smallest edge and providing an optional `max_size`. + max_size (`int`, *optional*, defaults to `None`): + The maximum allowed for the longer edge of the resized image: if the longer edge of the image is greater + than `max_size` after being resized according to `size`, then the image is resized again so that the longer + edge is equal to `max_size`. As a result, `size` might be overruled, i.e the smaller edge may be shorter + than `size`. Only used if `default_to_square` is `False`. 
+ """ if isinstance(size, (tuple, list)): if len(size) == 2: return size diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index e23321be4780..15bcf9954261 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -23,8 +23,8 @@ import requests -from .utils import is_torch_available, is_tf_available, is_jax_available -from .utils.generic import _is_torch, _is_tensorflow, _is_jax +from .utils import is_flax_available, is_tf_available, is_torch_available +from .utils.generic import _is_jax, _is_tensorflow, _is_torch IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406] @@ -46,7 +46,7 @@ def is_tf_tensor(obj): def is_jax_tensor(obj): - return _is_jax(obj) if is_jax_available() else False + return _is_jax(obj) if is_flax_available() else False class ChannelDimension(enum.Enum): From db9343777571f0ee77307646755cab377ea6d388 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 13:24:56 +0100 Subject: [PATCH 07/31] Fixup and docs --- .../en/internal/image_processing_utils.mdx | 5 +++ src/transformers/__init__.py | 2 ++ src/transformers/image_processing_utils.py | 32 +++++++++---------- .../utils/dummy_vision_objects.py | 7 ++++ 4 files changed, 30 insertions(+), 16 deletions(-) diff --git a/docs/source/en/internal/image_processing_utils.mdx b/docs/source/en/internal/image_processing_utils.mdx index ae4f826517aa..4d5831a12fd6 100644 --- a/docs/source/en/internal/image_processing_utils.mdx +++ b/docs/source/en/internal/image_processing_utils.mdx @@ -22,3 +22,8 @@ Most of those are only useful if you are studying the code of the image processo [[autodoc]] image_transforms.to_pil_image [[autodoc]] image_transforms.resize + + +## ImageProcessorMixin + +[[autodoc]] image_processing_utils.ImageProcessorMixin diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index dcacc328e8a8..a5c3e0d90575 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -632,6 +632,7 @@ name for 
name in dir(dummy_vision_objects) if not name.startswith("_") ] else: + _import_structure["image_processing_utils"] = ["ImageProcessorMixin"] _import_structure["image_transforms"] = ["resize", "to_pil_image"] _import_structure["image_utils"] = ["ImageFeatureExtractionMixin"] _import_structure["models.beit"].append("BeitFeatureExtractor") @@ -3340,6 +3341,7 @@ except OptionalDependencyNotAvailable: from .utils.dummy_vision_objects import * else: + from .image_processing_utils import ImageProcessorMixin from .image_transforms import resize, to_pil_image from .image_utils import ImageFeatureExtractionMixin from .models.beit import BeitFeatureExtractor diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 908216cd4634..0ec4a429e1a5 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -14,29 +14,31 @@ # limitations under the License. import copy -import os import json +import os from typing import Any, Dict, Tuple, Union import numpy as np + from requests import HTTPError from .dynamic_module_utils import custom_object_save from .utils import ( - IMAGE_PROCESSOR_NAME, - PushToHubMixin, - logging, HUGGINGFACE_CO_RESOLVE_ENDPOINT, + IMAGE_PROCESSOR_NAME, EntryNotFoundError, + PushToHubMixin, RepositoryNotFoundError, RevisionNotFoundError, cached_path, copy_func, hf_bucket_url, + is_offline_mode, is_remote_url, - is_offline_mode + logging, ) + logger = logging.get_logger(__name__) @@ -64,12 +66,10 @@ def _set_processor_class(self, processor_class: str): self._processor_class = processor_class @classmethod - def from_pretrained( - cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs - ): + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs): r""" - Instantiate a type of [`~image_processing_utils.ImageProcessorMixin`] from a image processor, *e.g.* a - derived class of [`BaseImageProcessor`]. 
+ Instantiate a type of [`~image_processing_utils.ImageProcessorMixin`] from a image processor, *e.g.* a derived + class of [`BaseImageProcessor`]. Args: pretrained_model_name_or_path (`str` or `os.PathLike`): @@ -87,8 +87,8 @@ def from_pretrained( Path to a directory in which a downloaded pretrained model image processor should be cached if the standard cache should not be used. force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force to (re-)download the image processor files and override the cached versions - if they exist. + Whether or not to force to (re-)download the image processor files and override the cached versions if + they exist. resume_download (`bool`, *optional*, defaults to `False`): Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists. @@ -342,16 +342,16 @@ def to_dict(self) -> Dict[str, Any]: @classmethod def from_json_file(cls, json_file: Union[str, os.PathLike]): """ - Instantiates an image processor of type [`~image_processing_utils.ImageProcessorMixin`] from the path to - a JSON file of parameters. + Instantiates an image processor of type [`~image_processing_utils.ImageProcessorMixin`] from the path to a JSON + file of parameters. Args: json_file (`str` or `os.PathLike`): Path to the JSON file containing the parameters. Returns: - A image processor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The image_processor - object instantiated from that JSON file. + A image processor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The image_processor object + instantiated from that JSON file. 
""" with open(json_file, "r", encoding="utf-8") as reader: text = reader.read() diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 0d9da1b7a16d..5756f67326b7 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -3,6 +3,13 @@ from ..utils import DummyObject, requires_backends +class ImageProcessorMixin(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + def resize(*args, **kwargs): requires_backends(resize, ["vision"]) From bd890d599d3929403c68cd65164200e695bf56e9 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 13:18:18 +0100 Subject: [PATCH 08/31] Fixup and docs --- .../en/internal/image_processing_utils.mdx | 24 +++++++++++++++ src/transformers/image_transforms.py | 29 ++++++++++++++++++- 2 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 docs/source/en/internal/image_processing_utils.mdx diff --git a/docs/source/en/internal/image_processing_utils.mdx b/docs/source/en/internal/image_processing_utils.mdx new file mode 100644 index 000000000000..ae4f826517aa --- /dev/null +++ b/docs/source/en/internal/image_processing_utils.mdx @@ -0,0 +1,24 @@ + + +# Utilities for Image Processors + +This page lists all the utility functions used by the image processors, mainly the functional +transformations used to process the images. + +Most of those are only useful if you are studying the code of the image processors in the library. 
+ +## Image Transformations + +[[autodoc]] image_transforms.to_pil_image + +[[autodoc]] image_transforms.resize diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 7551a431b4ba..38640029eaa0 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -81,10 +81,37 @@ def to_pil_image( def get_resize_output_image_size( input_image: np.ndarray, - size: Union[int, Tuple[int, int], List[int]], + size: Union[int, Tuple[int, int], List[int], Tuple[int]], default_to_square: bool = True, max_size: int = None, ) -> np.ndarray: + """ + Find the target (height, width) dimension of the output image after resizing given the input image and the desired + size. + + Args: + input_image (`np.ndarray`): + The image to resize. + size (`int` or `Tuple[int, int]` or List[int] or Tuple[int]): + The size to use for resizing the image. If `size` is a sequence like (h, w), output size will be matched to + this. + + If `size` is an int and `default_to_square` is `True`, then image will be resized to (size, size). If + `size` is an int and `default_to_square` is `False`, then smaller edge of the image will be matched to this + number. i.e, if height > width, then image will be rescaled to (size * height / width, size). + resample (`int`, *optional*, defaults to `PIL.Image.BILINEAR`): + The filter to user for resampling. + default_to_square (`bool`, *optional*, defaults to `True`): + How to convert `size` when it is a single int. If set to `True`, the `size` will be converted to a square + (`size`,`size`). If set to `False`, will replicate + [`torchvision.transforms.Resize`](https://pytorch.org/vision/stable/transforms.html#torchvision.transforms.Resize) + with support for resizing only the smallest edge and providing an optional `max_size`. 
+ max_size (`int`, *optional*, defaults to `None`): + The maximum allowed for the longer edge of the resized image: if the longer edge of the image is greater + than `max_size` after being resized according to `size`, then the image is resized again so that the longer + edge is equal to `max_size`. As a result, `size` might be overruled, i.e the smaller edge may be shorter + than `size`. Only used if `default_to_square` is `False`. + """ if isinstance(size, (tuple, list)): if len(size) == 2: return size From 4b27a340e451b0a402e78886f965c1c617f1fcf4 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 13:24:56 +0100 Subject: [PATCH 09/31] Fixup and docs --- docs/source/en/internal/image_processing_utils.mdx | 5 +++++ src/transformers/__init__.py | 2 ++ src/transformers/utils/dummy_vision_objects.py | 7 +++++++ 3 files changed, 14 insertions(+) diff --git a/docs/source/en/internal/image_processing_utils.mdx b/docs/source/en/internal/image_processing_utils.mdx index ae4f826517aa..4d5831a12fd6 100644 --- a/docs/source/en/internal/image_processing_utils.mdx +++ b/docs/source/en/internal/image_processing_utils.mdx @@ -22,3 +22,8 @@ Most of those are only useful if you are studying the code of the image processo [[autodoc]] image_transforms.to_pil_image [[autodoc]] image_transforms.resize + + +## ImageProcessorMixin + +[[autodoc]] image_processing_utils.ImageProcessorMixin diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index dcacc328e8a8..a5c3e0d90575 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -632,6 +632,7 @@ name for name in dir(dummy_vision_objects) if not name.startswith("_") ] else: + _import_structure["image_processing_utils"] = ["ImageProcessorMixin"] _import_structure["image_transforms"] = ["resize", "to_pil_image"] _import_structure["image_utils"] = ["ImageFeatureExtractionMixin"] _import_structure["models.beit"].append("BeitFeatureExtractor") @@ -3340,6 +3341,7 @@ except 
OptionalDependencyNotAvailable: from .utils.dummy_vision_objects import * else: + from .image_processing_utils import ImageProcessorMixin from .image_transforms import resize, to_pil_image from .image_utils import ImageFeatureExtractionMixin from .models.beit import BeitFeatureExtractor diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 0d9da1b7a16d..5756f67326b7 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -3,6 +3,13 @@ from ..utils import DummyObject, requires_backends +class ImageProcessorMixin(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + def resize(*args, **kwargs): requires_backends(resize, ["vision"]) From ff0d49ec10c4d7ecda79248567a88a4285efbbd3 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Wed, 27 Jul 2022 17:30:59 +0100 Subject: [PATCH 10/31] BatchFeature for packaging image processor outputs --- src/transformers/image_processing_utils.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 0e8b02c56b52..5370b9d35cad 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -15,8 +15,8 @@ import copy import json -import os -from typing import Any, Dict, Tuple, Union +from collections import UserDict +from typing import Any, Dict, Optional, Tuple, Union import numpy as np @@ -31,13 +31,19 @@ PushToHubMixin, RepositoryNotFoundError, RevisionNotFoundError, + TensorType, cached_path, copy_func, hf_bucket_url, - is_offline_mode, + is_flax_available, is_remote_url, + is_torch_available, + is_tf_available, + is_offline_mode, logging, + torch_required, ) +from .utils.generic import _is_jax, _is_numpy, _is_torch_device logger = logging.get_logger(__name__) From 
2c2fa9aa63f734f67fde6876fc1db5a961020e08 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 14:59:34 +0100 Subject: [PATCH 11/31] Import BatchFeature from feature_extraction_utils --- src/transformers/image_processing_utils.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 5370b9d35cad..9aa4bebf89bf 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -15,8 +15,8 @@ import copy import json -from collections import UserDict -from typing import Any, Dict, Optional, Tuple, Union +import os +from typing import Any, Dict, Tuple, Union import numpy as np @@ -31,19 +31,14 @@ PushToHubMixin, RepositoryNotFoundError, RevisionNotFoundError, - TensorType, cached_path, copy_func, hf_bucket_url, - is_flax_available, - is_remote_url, - is_torch_available, - is_tf_available, is_offline_mode, + is_remote_url, logging, - torch_required, ) -from .utils.generic import _is_jax, _is_numpy, _is_torch_device + logger = logging.get_logger(__name__) From 7faf2e69167453d8c095d62790b3cd79b7ec3987 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 14:59:34 +0100 Subject: [PATCH 12/31] Import BatchFeature from feature_extraction_utils --- src/transformers/image_processing_utils.py | 173 +++------------------ src/transformers/image_transforms.py | 41 +++-- src/transformers/image_utils.py | 6 +- 3 files changed, 40 insertions(+), 180 deletions(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 2f1377b3773f..872ce352c3de 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -14,16 +14,16 @@ # limitations under the License. 
import copy -import os import json -from collections import UserDict -from typing import Any, Dict, Optional, Tuple, Union +import os +from typing import Any, Dict, Tuple, Union import numpy as np + from requests import HTTPError from .dynamic_module_utils import custom_object_save -from .image_utils import ImageType +from .feature_extraction_utils import BatchFeature as BaseBatchFeature from .utils import ( HUGGINGFACE_CO_RESOLVE_ENDPOINT, IMAGE_PROCESSOR_NAME, @@ -31,24 +31,21 @@ PushToHubMixin, RepositoryNotFoundError, RevisionNotFoundError, - TensorType, cached_path, copy_func, hf_bucket_url, - is_flax_available, - is_remote_url, - is_torch_available, - is_tf_available, is_offline_mode, + is_remote_url, logging, - torch_required, ) -from .utils.generic import _is_jax, _is_numpy, _is_torch_device + logger = logging.get_logger(__name__) -class BatchFeature(UserDict): +# TODO: Move BatchFeature to be imported by both feature_extraction_utils and image_processing_utils +# We override the class string here, but logic is the same. +class BatchFeature(BaseBatchFeature): r""" Holds the output of the image processor specific `__call__` methods. @@ -56,143 +53,13 @@ class BatchFeature(UserDict): Args: data (`dict`): - Dictionary of lists/arrays/tensors returned by the __call__/pad methods ('input_values', 'attention_mask', + Dictionary of lists/arrays/tensors returned by the __call__/pad methods ('pixel_values', 'attention_mask', etc.). tensor_type (`Union[None, str, TensorType]`, *optional*): You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at initialization. 
""" - def __init__(self, data: Optional[Dict[str, Any]] = None, tensor_type: Union[None, str, TensorType] = None): - super().__init__(data) - self.convert_to_tensors(tensor_type=tensor_type) - - # Copied from transformers.tokenization_utils_base.BatchEncoding.__getitem__ - def __getitem__(self, item: str) -> Any: - """ - If the key is a string, returns the value of the dict associated to `key` ('input_values', 'attention_mask', - etc.). - """ - if isinstance(item, str): - return self.data[item] - else: - raise KeyError("Indexing with integers is not available when using Python based feature extractors") - - # Copied from transformers.tokenization_utils_base.BatchEncoding.__getattr__ - def __getattr__(self, item: str): - try: - return self.data[item] - except KeyError: - raise AttributeError - - # Copied from transformers.feature_extraction_utils.BatchFeature.__getstate__ - def __getstate__(self): - return {"data": self.data} - - # Copied from transformers.feature_extraction_utils.BatchFeature.__setstate__ - def __setstate__(self, state): - if "data" in state: - self.data = state["data"] - - # Copied from transformers.tokenization_utils_base.BatchEncoding.keys - def keys(self): - return self.data.keys() - - # Copied from transformers.tokenization_utils_base.BatchEncoding.values - def values(self): - return self.data.values() - - # Copied from transformers.tokenization_utils_base.BatchEncoding.items - def items(self): - return self.data.items() - - def convert_to_tensors(self, tensor_type: Optional[Union[str, ImageType]] = None): - """ - Convert the inner content to tensors. - - Args: - tensor_type (`str` or [`~utils.ImageType`], *optional*): - The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.ImageType`]. If - `None`, no modification is done. 
- """ - if tensor_type is None: - return self - - # Convert to TensorType - if not isinstance(tensor_type, ImageType): - tensor_type = ImageType(tensor_type) - - # Get a function reference for the correct framework - if tensor_type == ImageType.TENSORFLOW: - if not is_tf_available(): - raise ImportError( - "Unable to convert output to TensorFlow tensors format, TensorFlow is not installed." - ) - import tensorflow as tf - - as_tensor = tf.constant - is_tensor = tf.is_tensor - elif tensor_type == ImageType.PYTORCH: - if not is_torch_available(): - raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.") - import torch - - def as_tensor(value): - if isinstance(value, (list, tuple)) and len(value) > 0 and isinstance(value[0], np.ndarray): - value = np.array(value) - return torch.tensor(value) - - is_tensor = torch.is_tensor - elif tensor_type == ImageType.JAX: - if not is_flax_available(): - raise ImportError("Unable to convert output to JAX tensors format, JAX is not installed.") - import jax.numpy as jnp # noqa: F811 - - as_tensor = jnp.array - is_tensor = _is_jax - else: - as_tensor = np.asarray - is_tensor = _is_numpy - - # Do the tensor conversion in batch - for key, value in self.items(): - try: - if not is_tensor(value): - tensor = as_tensor(value) - - self[key] = tensor - except: # noqa E722 - if key == "overflowing_values": - raise ValueError("Unable to create tensor returning overflowing values of different lengths. ") - raise ValueError( - "Unable to create tensor, you should probably activate padding " - "with 'padding=True' to have batched tensors with the same length." - ) - - return self - - @torch_required - # Copied from transformers.tokenization_utils_base.BatchEncoding.to with BatchEncoding->BatchFeature - def to(self, device: Union[str, "torch.device"]) -> "BatchFeature": - """ - Send all values to device by calling `v.to(device)` (PyTorch only). 
- - Args: - device (`str` or `torch.device`): The device to put the tensors on. - - Returns: - [`BatchFeature`]: The same instance after modification. - """ - - # This check catches things like APEX blindly calling "to" on all inputs to a module - # Otherwise it passes the casts down and casts the LongTensor containing the token idxs - # into a HalfTensor - if isinstance(device, str) or _is_torch_device(device) or isinstance(device, int): - self.data = {k: v.to(device=device) for k, v in self.data.items()} - else: - logger.warning(f"Attempting to cast a BatchFeature to type {str(device)}. This is not supported.") - return self - class ImageProcessorMixin(PushToHubMixin): """ @@ -218,12 +85,10 @@ def _set_processor_class(self, processor_class: str): self._processor_class = processor_class @classmethod - def from_pretrained( - cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs - ): + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs): r""" - Instantiate a type of [`~image_processing_utils.ImageProcessorMixin`] from a image processor, *e.g.* a - derived class of [`BaseImageProcessor`]. + Instantiate a type of [`~image_processing_utils.ImageProcessorMixin`] from a image processor, *e.g.* a derived + class of [`BaseImageProcessor`]. Args: pretrained_model_name_or_path (`str` or `os.PathLike`): @@ -241,8 +106,8 @@ def from_pretrained( Path to a directory in which a downloaded pretrained model image processor should be cached if the standard cache should not be used. force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force to (re-)download the image processor files and override the cached versions - if they exist. + Whether or not to force to (re-)download the image processor files and override the cached versions if + they exist. resume_download (`bool`, *optional*, defaults to `False`): Whether or not to delete incompletely received file. 
Attempts to resume the download if such a file exists. @@ -496,16 +361,16 @@ def to_dict(self) -> Dict[str, Any]: @classmethod def from_json_file(cls, json_file: Union[str, os.PathLike]): """ - Instantiates an image processor of type [`~image_processing_utils.ImageProcessorMixin`] from the path to - a JSON file of parameters. + Instantiates an image processor of type [`~image_processing_utils.ImageProcessorMixin`] from the path to a JSON + file of parameters. Args: json_file (`str` or `os.PathLike`): Path to the JSON file containing the parameters. Returns: - A image processor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The image_processor - object instantiated from that JSON file. + A image processor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The image_processor object + instantiated from that JSON file. """ with open(json_file, "r", encoding="utf-8") as reader: text = reader.read() diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 55d788d47531..e7e99c09f850 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -13,48 +13,43 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional, Tuple, List, Union +from typing import TYPE_CHECKING, List, Optional, Tuple, Union -import PIL import numpy as np +import PIL + +from transformers.utils.import_utils import is_flax_available, is_tf_available, is_torch_available from .image_utils import ( ChannelDimension, get_image_size, infer_channel_dimension, - is_torch_tensor, + is_jax_tensor, is_tf_tensor, - is_jax_tensor + is_torch_tensor, ) -def rescale(image: np.ndarray, scale: Union[float, int] = 255) -> np.ndarray: - """ - Rescales `image` by `scale`. - - Args: - image (`np.ndarray``): - The image to rescale. - scale (`float`, `int`): - The scale to use for rescaling the image. - - Returns: - image: A rescaled np.ndarray image. 
- """ - return image * scale +if TYPE_CHECKING: + if is_torch_available(): + import torch + if is_tf_available(): + import tensorflow as tf + if is_flax_available(): + import jax.numpy as jnp def to_pil_image( - image: Union[np.ndarray, PIL.Image.Image, "torch.Tensor", "tf.Tensor"], + image: Union[np.ndarray, PIL.Image.Image, "torch.Tensor", "tf.Tensor", "jnp.ndarray"], channel_dim: Optional[ChannelDimension] = None, - do_rescale: Optional[bool] = None + rescale=None, ) -> PIL.Image.Image: """ Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if needed. Args: - image (`PIL.Image.Image` or `numpy.ndarray` or `torch.Tensor`): + image (`PIL.Image.Image`, `numpy.ndarray`, `torch.Tensor`, `tf.Tensor`): The image to convert to the PIL Image format. rescale (`bool`, *optional*): Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will default @@ -69,7 +64,7 @@ def to_pil_image( image = np.array(image) if not isinstance(image, np.ndarray): - raise ValueError("Input image must be of type PIL.Image.Image, numpy.ndarray or torch.Tensor") + raise ValueError("Input image type not supported: {}".format(type(image))) # If the channel as been moved to first dim, we put it back at the end. 
channel_dim = infer_channel_dimension(image) if channel_dim is None else channel_dim @@ -88,7 +83,7 @@ def get_resize_output_image_size( input_image: np.ndarray, size: Union[int, Tuple[int, int], List[int]], default_to_square: bool = True, - max_size: int = None + max_size: int = None, ) -> np.ndarray: if isinstance(size, (tuple, list)): if len(size) == 2: diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index e5afb3122bcd..8fdf7aadac4d 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -23,8 +23,8 @@ import requests -from .utils import TensorType, is_torch_available, is_tf_available, is_jax_available -from .utils.generic import ExplicitEnum, _is_torch, _is_tensorflow, _is_jax, _is_numpy, to_numpy +from .utils import is_flax_available, is_tf_available, is_torch_available +from .utils.generic import _is_jax, _is_tensorflow, _is_torch IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406] @@ -61,7 +61,7 @@ def is_tf_tensor(obj): def is_jax_tensor(obj): - return _is_jax(obj) if is_jax_available() else False + return _is_jax(obj) if is_flax_available() else False def is_valid_image(img): From ccc15fb6887f748b98c67a3ad4521cfb96a0ddf7 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 13:18:18 +0100 Subject: [PATCH 13/31] Fixup and docs --- .../en/internal/image_processing_utils.mdx | 24 +++++++++++++++ src/transformers/image_transforms.py | 29 ++++++++++++++++++- 2 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 docs/source/en/internal/image_processing_utils.mdx diff --git a/docs/source/en/internal/image_processing_utils.mdx b/docs/source/en/internal/image_processing_utils.mdx new file mode 100644 index 000000000000..ae4f826517aa --- /dev/null +++ b/docs/source/en/internal/image_processing_utils.mdx @@ -0,0 +1,24 @@ + + +# Utilities for Image Processors + +This page lists all the utility functions used by the image processors, mainly the functional +transformations used to process 
the images. + +Most of those are only useful if you are studying the code of the image processors in the library. + +## Image Transformations + +[[autodoc]] image_transforms.to_pil_image + +[[autodoc]] image_transforms.resize diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index e7e99c09f850..c7e9c0ec9e20 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -81,10 +81,37 @@ def to_pil_image( def get_resize_output_image_size( input_image: np.ndarray, - size: Union[int, Tuple[int, int], List[int]], + size: Union[int, Tuple[int, int], List[int], Tuple[int]], default_to_square: bool = True, max_size: int = None, ) -> np.ndarray: + """ + Find the target (height, width) dimension of the output image after resizing given the input image and the desired + size. + + Args: + input_image (`np.ndarray`): + The image to resize. + size (`int` or `Tuple[int, int]` or List[int] or Tuple[int]): + The size to use for resizing the image. If `size` is a sequence like (h, w), output size will be matched to + this. + + If `size` is an int and `default_to_square` is `True`, then image will be resized to (size, size). If + `size` is an int and `default_to_square` is `False`, then smaller edge of the image will be matched to this + number. i.e, if height > width, then image will be rescaled to (size * height / width, size). + resample (`int`, *optional*, defaults to `PIL.Image.BILINEAR`): + The filter to user for resampling. + default_to_square (`bool`, *optional*, defaults to `True`): + How to convert `size` when it is a single int. If set to `True`, the `size` will be converted to a square + (`size`,`size`). If set to `False`, will replicate + [`torchvision.transforms.Resize`](https://pytorch.org/vision/stable/transforms.html#torchvision.transforms.Resize) + with support for resizing only the smallest edge and providing an optional `max_size`. 
+ max_size (`int`, *optional*, defaults to `None`): + The maximum allowed for the longer edge of the resized image: if the longer edge of the image is greater + than `max_size` after being resized according to `size`, then the image is resized again so that the longer + edge is equal to `max_size`. As a result, `size` might be overruled, i.e the smaller edge may be shorter + than `size`. Only used if `default_to_square` is `False`. + """ if isinstance(size, (tuple, list)): if len(size) == 2: return size From c8f8eb6e0c918ea2f82b0f0faba141489a508c42 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 13:24:56 +0100 Subject: [PATCH 14/31] Fixup and docs --- docs/source/en/internal/image_processing_utils.mdx | 5 +++++ src/transformers/__init__.py | 2 ++ src/transformers/utils/dummy_vision_objects.py | 7 +++++++ 3 files changed, 14 insertions(+) diff --git a/docs/source/en/internal/image_processing_utils.mdx b/docs/source/en/internal/image_processing_utils.mdx index ae4f826517aa..4d5831a12fd6 100644 --- a/docs/source/en/internal/image_processing_utils.mdx +++ b/docs/source/en/internal/image_processing_utils.mdx @@ -22,3 +22,8 @@ Most of those are only useful if you are studying the code of the image processo [[autodoc]] image_transforms.to_pil_image [[autodoc]] image_transforms.resize + + +## ImageProcessorMixin + +[[autodoc]] image_processing_utils.ImageProcessorMixin diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index dcacc328e8a8..a5c3e0d90575 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -632,6 +632,7 @@ name for name in dir(dummy_vision_objects) if not name.startswith("_") ] else: + _import_structure["image_processing_utils"] = ["ImageProcessorMixin"] _import_structure["image_transforms"] = ["resize", "to_pil_image"] _import_structure["image_utils"] = ["ImageFeatureExtractionMixin"] _import_structure["models.beit"].append("BeitFeatureExtractor") @@ -3340,6 +3341,7 @@ except 
OptionalDependencyNotAvailable: from .utils.dummy_vision_objects import * else: + from .image_processing_utils import ImageProcessorMixin from .image_transforms import resize, to_pil_image from .image_utils import ImageFeatureExtractionMixin from .models.beit import BeitFeatureExtractor diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 0d9da1b7a16d..5756f67326b7 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -3,6 +3,13 @@ from ..utils import DummyObject, requires_backends +class ImageProcessorMixin(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + def resize(*args, **kwargs): requires_backends(resize, ["vision"]) From 90093f4b989869cc4191c26ba27cd428af521cae Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Wed, 27 Jul 2022 17:30:59 +0100 Subject: [PATCH 15/31] BatchFeature for packaging image processor outputs --- src/transformers/image_processing_utils.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 872ce352c3de..587753856420 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -15,8 +15,8 @@ import copy import json -import os -from typing import Any, Dict, Tuple, Union +from collections import UserDict +from typing import Any, Dict, Optional, Tuple, Union import numpy as np @@ -31,13 +31,19 @@ PushToHubMixin, RepositoryNotFoundError, RevisionNotFoundError, + TensorType, cached_path, copy_func, hf_bucket_url, - is_offline_mode, + is_flax_available, is_remote_url, + is_torch_available, + is_tf_available, + is_offline_mode, logging, + torch_required, ) +from .utils.generic import _is_jax, _is_numpy, _is_torch_device logger = logging.get_logger(__name__) From 
d89c0513ba7e52c1de91870c6179d392a0d7114b Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 14:59:34 +0100 Subject: [PATCH 16/31] Import BatchFeature from feature_extraction_utils --- src/transformers/image_processing_utils.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 587753856420..38778039aee2 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -15,8 +15,8 @@ import copy import json -from collections import UserDict -from typing import Any, Dict, Optional, Tuple, Union +import os +from typing import Any, Dict, Tuple, Union import numpy as np @@ -31,19 +31,14 @@ PushToHubMixin, RepositoryNotFoundError, RevisionNotFoundError, - TensorType, cached_path, copy_func, hf_bucket_url, - is_flax_available, - is_remote_url, - is_torch_available, - is_tf_available, is_offline_mode, + is_remote_url, logging, - torch_required, ) -from .utils.generic import _is_jax, _is_numpy, _is_torch_device + logger = logging.get_logger(__name__) From 9bc91578fd60c5a2662440946807162159c327e8 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 13:24:56 +0100 Subject: [PATCH 17/31] Fixup and docs --- src/transformers/image_processing_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 38778039aee2..872ce352c3de 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -40,7 +40,6 @@ ) - logger = logging.get_logger(__name__) From 6ec382acb30842b9a1871e907c923b87293ca63e Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Wed, 27 Jul 2022 15:55:48 +0100 Subject: [PATCH 18/31] Mixin for saving the image processor --- src/transformers/image_processing_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git 
a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 872ce352c3de..6e7c8e530d72 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -361,16 +361,16 @@ def to_dict(self) -> Dict[str, Any]: @classmethod def from_json_file(cls, json_file: Union[str, os.PathLike]): """ - Instantiates an image processor of type [`~image_processing_utils.ImageProcessorMixin`] from the path to a JSON - file of parameters. + Instantiates an image processor of type [`~image_processing_utils.ImageProcessorMixin`] from the path to + a JSON file of parameters. Args: json_file (`str` or `os.PathLike`): Path to the JSON file containing the parameters. Returns: - A image processor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The image_processor object - instantiated from that JSON file. + A image processor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The image_processor + object instantiated from that JSON file. """ with open(json_file, "r", encoding="utf-8") as reader: text = reader.read() From 56ee6ad282f0d17f3e100655ecef58b0ae0cb354 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 13:24:56 +0100 Subject: [PATCH 19/31] Fixup and docs --- src/transformers/image_processing_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 6e7c8e530d72..872ce352c3de 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -361,16 +361,16 @@ def to_dict(self) -> Dict[str, Any]: @classmethod def from_json_file(cls, json_file: Union[str, os.PathLike]): """ - Instantiates an image processor of type [`~image_processing_utils.ImageProcessorMixin`] from the path to - a JSON file of parameters. 
+ Instantiates an image processor of type [`~image_processing_utils.ImageProcessorMixin`] from the path to a JSON + file of parameters. Args: json_file (`str` or `os.PathLike`): Path to the JSON file containing the parameters. Returns: - A image processor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The image_processor - object instantiated from that JSON file. + A image processor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The image_processor object + instantiated from that JSON file. """ with open(json_file, "r", encoding="utf-8") as reader: text = reader.read() From 6b88d5f8e09d3ea7d22317eb241df67b7286697d Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 15:57:24 +0100 Subject: [PATCH 20/31] Add rescale back and remove ImageType --- .../en/internal/image_processing_utils.mdx | 5 ++- src/transformers/__init__.py | 4 +-- src/transformers/image_transforms.py | 18 ++++++++++- src/transformers/image_utils.py | 32 ++----------------- .../models/glpn/image_processing_glpn.py | 23 +++++++++---- .../utils/dummy_vision_objects.py | 4 +++ 6 files changed, 46 insertions(+), 40 deletions(-) diff --git a/docs/source/en/internal/image_processing_utils.mdx b/docs/source/en/internal/image_processing_utils.mdx index 4d5831a12fd6..8bdf0ed11099 100644 --- a/docs/source/en/internal/image_processing_utils.mdx +++ b/docs/source/en/internal/image_processing_utils.mdx @@ -19,10 +19,13 @@ Most of those are only useful if you are studying the code of the image processo ## Image Transformations -[[autodoc]] image_transforms.to_pil_image +[[autodoc]] image_transforms.rescale [[autodoc]] image_transforms.resize +[[autodoc]] image_transforms.to_pil_image + + ## ImageProcessorMixin diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index a5c3e0d90575..9b34e4cea7f7 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -633,7 +633,7 @@ ] else: _import_structure["image_processing_utils"] = 
["ImageProcessorMixin"] - _import_structure["image_transforms"] = ["resize", "to_pil_image"] + _import_structure["image_transforms"] = ["rescale", "resize", "to_pil_image"] _import_structure["image_utils"] = ["ImageFeatureExtractionMixin"] _import_structure["models.beit"].append("BeitFeatureExtractor") _import_structure["models.clip"].append("CLIPFeatureExtractor") @@ -3342,7 +3342,7 @@ from .utils.dummy_vision_objects import * else: from .image_processing_utils import ImageProcessorMixin - from .image_transforms import resize, to_pil_image + from .image_transforms import rescale, resize, to_pil_image from .image_utils import ImageFeatureExtractionMixin from .models.beit import BeitFeatureExtractor from .models.clip import CLIPFeatureExtractor, CLIPProcessor diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index c7e9c0ec9e20..b15a1372a953 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -39,10 +39,26 @@ import jax.numpy as jnp +def rescale(image: np.ndarray, scale: Union[float, int] = 255) -> np.ndarray: + """ + Rescales `image` by `scale`. + + Args: + image (`np.ndarray``): + The image to rescale. + scale (`float`, `int`): + The scale to use for rescaling the image. + + Returns: + image: A rescaled np.ndarray image. + """ + return image * scale + + def to_pil_image( image: Union[np.ndarray, PIL.Image.Image, "torch.Tensor", "tf.Tensor", "jnp.ndarray"], channel_dim: Optional[ChannelDimension] = None, - rescale=None, + do_rescale=None, ) -> PIL.Image.Image: """ Converts `image` to a PIL Image. 
Optionally rescales it and puts the channel dimension back as the last axis if diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index 8fdf7aadac4d..3bb72816ced2 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -24,7 +24,7 @@ import requests from .utils import is_flax_available, is_tf_available, is_torch_available -from .utils.generic import _is_jax, _is_tensorflow, _is_torch +from .utils.generic import _is_jax, _is_tensorflow, _is_torch, to_numpy IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406] @@ -36,22 +36,12 @@ PIL.Image.Image, np.ndarray, "torch.Tensor", List[PIL.Image.Image], List[np.ndarray], List["torch.Tensor"] # noqa ] + class ChannelDimension(enum.Enum): FIRST = 1 LAST = 3 -class ImageType(ExplicitEnum): - """ - Possible image data formats that can be fed into an image processor - """ - PYTORCH = "pt" - TENSORFLOW = "tf" - NUMPY = "np" - JAX = "jax" - PIL = "pillow" - - def is_torch_tensor(obj): return _is_torch(obj) if is_torch_available() else False @@ -83,24 +73,8 @@ def is_batched(img): return False -def get_image_type(obj) -> TensorType: - if is_torch_tensor(obj): - return TensorType.TORCH - elif is_tf_tensor(obj): - return TensorType.TF - elif is_jax_tensor(obj): - return TensorType.JAX - elif _is_numpy(obj): - return TensorType.NUMPY - elif isinstance(obj, PIL.Image.Image): - return TensorType.PIL - else: - raise ValueError("Could not infer tensor type") - - def to_numpy_array(img) -> np.ndarray: - input_type = get_image_type(img) - if input_type == ImageType.PIL: + if isinstance(img, PIL.Image.Image): return np.array(img) return to_numpy(img) diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py index bdc33e80ce9f..c07163a7e1d9 100644 --- a/src/transformers/models/glpn/image_processing_glpn.py +++ b/src/transformers/models/glpn/image_processing_glpn.py @@ -19,23 +19,30 @@ import PIL.Image from numpy import np +from 
transformers.utils.generic import TensorType + from ...image_processing_utils import BaseImageProcessor, BatchFeature -from ...image_transforms import resize, rescale -from ...image_utils import ImageType, is_batched, to_numpy_array, valid_images, get_image_size +from ...image_transforms import rescale, resize +from ...image_utils import get_image_size, is_batched, to_numpy_array, valid_images from ...utils import logging + logger = logging.get_logger(__name__) class GLPNImageProcessor(BaseImageProcessor): - def __init__(self, do_resize=True, do_rescale=True, size_divisor=32, resample=PIL.Image.Resampling.BILINEAR, **kwargs) -> None: + def __init__( + self, do_resize=True, do_rescale=True, size_divisor=32, resample=PIL.Image.Resampling.BILINEAR, **kwargs + ) -> None: self.do_resize = do_resize self.do_rescale = do_rescale self.size_divisor = size_divisor self.resample = resample super().__init__(**kwargs) - def resize(self, image: np.ndarray, size_divisor: Union[int, float], resample: PIL.Image.Resampling, **kwargs) -> np.ndarray: + def resize( + self, image: np.ndarray, size_divisor: Union[int, float], resample: PIL.Image.Resampling, **kwargs + ) -> np.ndarray: height, width = get_image_size(image) new_h = height // size_divisor * size_divisor new_w = width // size_divisor * size_divisor @@ -45,14 +52,16 @@ def resize(self, image: np.ndarray, size_divisor: Union[int, float], resample: P def rescale(self, image: np.ndarray, scale: Union[int, float], **kwargs) -> np.ndarray: return rescale(image, scale, **kwargs) - def preprocess(self, images, do_resize=None, do_rescale=None, size_divisor=None, resample=None, return_tensors=None, **kwargs) -> BatchFeature: + def preprocess( + self, images, do_resize=None, do_rescale=None, size_divisor=None, resample=None, return_tensors=None, **kwargs + ) -> BatchFeature: do_resize = do_resize if do_resize is not None else self.do_resize do_rescale = do_rescale if do_rescale is not None else self.do_rescale size_divisor = 
size_divisor if size_divisor is not None else self.size_divisor resample = resample if resample is not None else self.resample # If a return type isn't specified, default to numpy arrays. - return_tensors = ImageType.NUMPY if return_tensors is None else return_tensors + return_tensors = TensorType.NUMPY if return_tensors is None else return_tensors if do_resize and size_divisor is None: raise ValueError("size_divisor is required for resizing") @@ -70,7 +79,7 @@ def preprocess(self, images, do_resize=None, do_rescale=None, size_divisor=None, images = [self.resize(image, size_divisor=size_divisor, resample=resample) for image in images] if do_rescale: - images = [self.rescale(image) for image in images] + images = [self.rescale(image, scale=255) for image in images] data = {"pixel_values": images} return BatchFeature(**data, return_tensors=return_tensors) diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 5756f67326b7..6622564eafd6 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -10,6 +10,10 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +def rescale(*args, **kwargs): + requires_backends(rescale, ["vision"]) + + def resize(*args, **kwargs): requires_backends(resize, ["vision"]) From 67077f1de95425da52541078f0ce0eb121158889 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Thu, 28 Jul 2022 16:19:42 +0100 Subject: [PATCH 21/31] fix import mistake --- src/transformers/models/glpn/image_processing_glpn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py index c07163a7e1d9..8ba6568bfcf0 100644 --- a/src/transformers/models/glpn/image_processing_glpn.py +++ b/src/transformers/models/glpn/image_processing_glpn.py @@ -17,7 +17,7 @@ from typing import Union import PIL.Image -from numpy import 
np +import numpy as np from transformers.utils.generic import TensorType From 60c56e5dc4d428b83ca1f23b4162596232ade716 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Fri, 29 Jul 2022 15:53:37 +0100 Subject: [PATCH 22/31] Data format flag for rescale --- src/transformers/image_transforms.py | 37 ++++++++++++++++------------ 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 16b297f3abfc..950a75e08141 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -39,22 +39,6 @@ import jax.numpy as jnp -def rescale(image: np.ndarray, scale: Union[float, int] = 255) -> np.ndarray: - """ - Rescales `image` by `scale`. - - Args: - image (`np.ndarray``): - The image to rescale. - scale (`float`, `int`): - The scale to use for rescaling the image. - - Returns: - image: A rescaled np.ndarray image. - """ - return image * scale - - def to_channel_dimension_format(image: np.ndarray, channel_dim: Union[ChannelDimension, str]) -> np.ndarray: """ Converts `image` to the channel dimension format specified by `channel_dim`. @@ -82,6 +66,27 @@ def to_channel_dimension_format(image: np.ndarray, channel_dim: Union[ChannelDim raise ValueError("Unsupported channel dimension format: {}".format(channel_dim)) +def rescale( + image: np.ndarray, scale: Union[float, int] = 255, data_format: Optional[ChannelDimension] = None +) -> np.ndarray: + """ + Rescales `image` by `scale`. + + Args: + image (`np.ndarray``): + The image to rescale. + scale (`float`, `int`): + The scale to use for rescaling the image. + + Returns: + image: A rescaled np.ndarray image. 
+ """ + rescaled_image = image * scale + if data_format is not None: + rescaled_image = to_channel_dimension_format(rescaled_image, data_format) + return rescaled_image + + def to_pil_image( image: Union[np.ndarray, PIL.Image.Image, "torch.Tensor", "tf.Tensor", "jnp.Tensor"], do_rescale=None ) -> PIL.Image.Image: From 9294dbcef3e4cebb4ac68efd614c47e8ee8a6638 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Fri, 29 Jul 2022 15:59:31 +0100 Subject: [PATCH 23/31] Fix typo --- src/transformers/image_transforms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 950a75e08141..1863c01d60ec 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -194,7 +194,7 @@ def resize( size: Tuple[int, int], resample=PIL.Image.Resampling.BILINEAR, data_format: Optional[ChannelDimension] = None, -) -> np.np.ndarray: +) -> np.ndarray: """ Resizes `image` to (h, w) specified by `size` using the PIL library. From 88b82e936793a2746c56bdc0c47c3c4b4c8b1590 Mon Sep 17 00:00:00 2001 From: Amy Roberts Date: Fri, 29 Jul 2022 16:47:43 +0100 Subject: [PATCH 24/31] Fixes to make IP and FE outputs match --- src/transformers/image_transforms.py | 8 +++- .../models/glpn/image_processing_glpn.py | 37 ++++++++++++++----- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 1863c01d60ec..9f7e1f48520f 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -67,7 +67,7 @@ def to_channel_dimension_format(image: np.ndarray, channel_dim: Union[ChannelDim def rescale( - image: np.ndarray, scale: Union[float, int] = 255, data_format: Optional[ChannelDimension] = None + image: np.ndarray, scale: Union[float, int] = 255, data_format: Optional[ChannelDimension] = None, dtype=np.float32 ) -> np.ndarray: """ Rescales `image` by `scale`. 
@@ -77,6 +77,11 @@ def rescale( The image to rescale. scale (`float`, `int`): The scale to use for rescaling the image. + data_format (`ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + dtype (`np.dtype`, *optional*): + The dtype of the output image. Defaults to `np.float32`.Used for backwards compatibility + with feature extractors Returns: image: A rescaled np.ndarray image. @@ -84,6 +89,7 @@ def rescale( rescaled_image = image * scale if data_format is not None: rescaled_image = to_channel_dimension_format(rescaled_image, data_format) + rescaled_image = rescaled_image.astype(dtype) return rescaled_image diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py index cce1dccf6aff..6ab3d85221eb 100644 --- a/src/transformers/models/glpn/image_processing_glpn.py +++ b/src/transformers/models/glpn/image_processing_glpn.py @@ -14,7 +14,7 @@ # limitations under the License. 
"""Image processor class for GLPN.""" -from typing import Union +from typing import Optional, Union import numpy as np import PIL.Image @@ -22,8 +22,8 @@ from transformers.utils.generic import TensorType from ...image_processing_utils import BaseImageProcessor, BatchFeature -from ...image_transforms import rescale, resize -from ...image_utils import get_image_size, is_batched, to_numpy_array, valid_images +from ...image_transforms import rescale, resize, to_channel_dimension_format +from ...image_utils import ChannelDimension, get_image_size, is_batched, to_numpy_array, valid_images from ...utils import logging @@ -41,19 +41,34 @@ def __init__( super().__init__(**kwargs) def resize( - self, image: np.ndarray, size_divisor: Union[int, float], resample: PIL.Image.Resampling, **kwargs + self, + image: np.ndarray, + size_divisor: Union[int, float], + resample: PIL.Image.Resampling, + data_format: Optional[ChannelDimension] = None, + **kwargs ) -> np.ndarray: height, width = get_image_size(image) new_h = height // size_divisor * size_divisor new_w = width // size_divisor * size_divisor - image = resize(image, (new_h, new_w), resample=resample, **kwargs) + image = resize(image, (new_h, new_w), resample=resample, data_format=data_format, **kwargs) return image - def rescale(self, image: np.ndarray, scale: Union[int, float], **kwargs) -> np.ndarray: - return rescale(image, scale, **kwargs) + def rescale( + self, image: np.ndarray, scale: Union[int, float], data_format: Optional[ChannelDimension] = None, **kwargs + ) -> np.ndarray: + return rescale(image=image, scale=scale, data_format=data_format, **kwargs) def preprocess( - self, images, do_resize=None, do_rescale=None, size_divisor=None, resample=None, return_tensors=None, **kwargs + self, + images, + do_resize: bool = None, + do_rescale: bool = None, + size_divisor: int = None, + resample: PIL.Image.Resampling = None, + return_tensors: Optional[Union[TensorType, str]] = None, + data_format: ChannelDimension = 
ChannelDimension.FIRST, + **kwargs ) -> BatchFeature: do_resize = do_resize if do_resize is not None else self.do_resize do_rescale = do_rescale if do_rescale is not None else self.do_rescale @@ -79,7 +94,9 @@ def preprocess( images = [self.resize(image, size_divisor=size_divisor, resample=resample) for image in images] if do_rescale: - images = [self.rescale(image, scale=255) for image in images] + images = [self.rescale(image, scale=1/255) for image in images] + + images = [to_channel_dimension_format(image, data_format) for image in images] data = {"pixel_values": images} - return BatchFeature(**data, return_tensors=return_tensors) + return BatchFeature(data=data, tensor_type=return_tensors) From 082e4ff9631b934c7eb25ba211ed4fd351171fbb Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Tue, 2 Aug 2022 19:18:53 +0100 Subject: [PATCH 25/31] Remove default to numpy batching --- src/transformers/models/glpn/image_processing_glpn.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py index 6ab3d85221eb..eae2607334db 100644 --- a/src/transformers/models/glpn/image_processing_glpn.py +++ b/src/transformers/models/glpn/image_processing_glpn.py @@ -75,9 +75,6 @@ def preprocess( size_divisor = size_divisor if size_divisor is not None else self.size_divisor resample = resample if resample is not None else self.resample - # If a return type isn't specified, default to numpy arrays. 
- return_tensors = TensorType.NUMPY if return_tensors is None else return_tensors - if do_resize and size_divisor is None: raise ValueError("size_divisor is required for resizing") From bf7335821e3e262d2a3b56749c5442d4235dc6a1 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Wed, 3 Aug 2022 12:53:39 +0100 Subject: [PATCH 26/31] Fix up --- src/transformers/image_processing_utils.py | 3 +-- src/transformers/image_transforms.py | 4 ++-- src/transformers/models/glpn/image_processing_glpn.py | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index a15b188349e2..756a4cee7823 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -313,8 +313,7 @@ def get_image_processor_dict( @classmethod def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): """ - Instantiates a type of [`~image_processing_utils.ImageProcessorMixin`] from a Python dictionary of - parameters. + Instantiates a type of [`~image_processing_utils.ImageProcessorMixin`] from a Python dictionary of parameters. Args: image_processor_dict (`Dict[str, Any]`): diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 871665455f3c..99149682616b 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -80,8 +80,8 @@ def rescale( data_format (`ChannelDimension`, *optional*): The channel dimension format of the image. If not provided, it will be the same as the input image. dtype (`np.dtype`, *optional*): - The dtype of the output image. Defaults to `np.float32`.Used for backwards compatibility - with feature extractors + The dtype of the output image. Defaults to `np.float32`.Used for backwards compatibility with feature + extractors Returns: image: A rescaled np.ndarray image. 
diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py index eae2607334db..6c13650c2a5f 100644 --- a/src/transformers/models/glpn/image_processing_glpn.py +++ b/src/transformers/models/glpn/image_processing_glpn.py @@ -91,7 +91,7 @@ def preprocess( images = [self.resize(image, size_divisor=size_divisor, resample=resample) for image in images] if do_rescale: - images = [self.rescale(image, scale=1/255) for image in images] + images = [self.rescale(image, scale=1 / 255) for image in images] images = [to_channel_dimension_format(image, data_format) for image in images] From 34b6b2fa64ef200192f03927997bb01bcb92122f Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Thu, 4 Aug 2022 09:22:42 +0100 Subject: [PATCH 27/31] Add docstring and model_input_names --- .../models/glpn/image_processing_glpn.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py index 6c13650c2a5f..1cdb455b5052 100644 --- a/src/transformers/models/glpn/image_processing_glpn.py +++ b/src/transformers/models/glpn/image_processing_glpn.py @@ -31,6 +31,25 @@ class GLPNImageProcessor(BaseImageProcessor): + r""" + Constructs a GLPN image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the input based on a certain `size_divisor`. + size_divisor (`int` or `Tuple[int]`, *optional*, defaults to 32): + Make sure the input is divisible by this value. Only has an effect if `do_resize` is set to `True`. + resample (`int`, *optional*, defaults to `PIL.Image.Resampling.BILINEAR`): + An optional resampling filter. This can be one of `PIL.Image.Resampling.NEAREST`, + `PIL.Image.Resampling.BOX`, `PIL.Image.Resampling.BILINEAR`, `PIL.Image.Resampling.HAMMING`, + `PIL.Image.Resampling.BICUBIC` or `PIL.Image.Resampling.LANCZOS`. 
Only has an effect if `do_resize` is set + to `True`. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). + """ + + model_input_names = ["pixel_values"] + def __init__( self, do_resize=True, do_rescale=True, size_divisor=32, resample=PIL.Image.Resampling.BILINEAR, **kwargs ) -> None: From 7a4d22a880dd7ae176138f7e829e875e2be6d7b2 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Mon, 8 Aug 2022 12:31:38 +0100 Subject: [PATCH 28/31] Fix up --- src/transformers/image_processing_utils.py | 3 ++- src/transformers/image_transforms.py | 7 ++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 3b8d380d53c4..74ed9c31c397 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -13,7 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from .feature_extraction_utils import FeatureExtractionMixin, BatchFeature as BaseBatchFeature +from .feature_extraction_utils import BatchFeature as BaseBatchFeature +from .feature_extraction_utils import FeatureExtractionMixin from .utils import logging diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 73fa4e5f6e8d..d09ef526084a 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -18,7 +18,7 @@ import numpy as np import PIL -from transformers.utils.import_utils import is_flax_available, is_tf_available, is_torch_available +from transformers.utils.import_utils import is_tf_available, is_torch_available from .image_utils import ( ChannelDimension, @@ -35,8 +35,6 @@ import torch if is_tf_available(): import tensorflow as tf - if is_flax_available(): - import jax.numpy as jnp def to_channel_dimension_format(image: np.ndarray, channel_dim: Union[ChannelDimension, str]) -> np.ndarray: @@ -188,8 +186,7 @@ def resize( data_format (`ChannelDimension`, *optional*, defaults to `None`): The channel dimension format of the output image. If `None`, will use the inferred format from the input. return_numpy (`bool`, *optional*, defaults to `True`): - Whether or not to return the resized image as a numpy array. If False a PIL.Image.Image object is - returned. + Whether or not to return the resized image as a numpy array. If False a PIL.Image.Image object is returned. Returns: image: A resized np.ndarray. 
From 790c2c6d997ae263f0813ac0e499edfc140c8cb4 Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Wed, 10 Aug 2022 15:41:47 +0100 Subject: [PATCH 29/31] Apply suggestions from code review Co-authored-by: Sylvain Gugger Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/image_transforms.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 315a6634fa3e..558b253addcd 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -73,13 +73,13 @@ def rescale( Rescales `image` by `scale`. Args: - image (`np.ndarray``): + image (`np.ndarray`): The image to rescale. - scale (`float`, `int`): + scale (`float` or `int`, *optional*, defaults to 255): The scale to use for rescaling the image. data_format (`ChannelDimension`, *optional*): The channel dimension format of the image. If not provided, it will be the same as the input image. - dtype (`np.dtype`, *optional*): + dtype (`np.dtype`, *optional*, defaults to `np.float32`): The dtype of the output image. 
Defaults to `np.float32`.Used for backwards compatibility with feature extractors From 2e929cfdba42dedf29aaa8c98b1c94e6f3c0a566 Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Fri, 12 Aug 2022 11:06:01 +0100 Subject: [PATCH 30/31] Update src/transformers/image_transforms.py Co-authored-by: Alara Dirik <8944735+alaradirik@users.noreply.github.com> --- src/transformers/image_transforms.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 558b253addcd..6b2ed5522b7e 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -80,8 +80,7 @@ def rescale( data_format (`ChannelDimension`, *optional*): The channel dimension format of the image. If not provided, it will be the same as the input image. dtype (`np.dtype`, *optional*, defaults to `np.float32`): - The dtype of the output image. Defaults to `np.float32`.Used for backwards compatibility with feature - extractors + The dtype of the output image. Defaults to `np.float32`. Used for backwards compatibility with feature extractors. Returns: image: A rescaled np.ndarray image. 
From ae358735685986862d410a269780cc7986aee86d Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Wed, 17 Aug 2022 13:08:48 +0100 Subject: [PATCH 31/31] Add in docstrings --- src/transformers/image_processing_utils.py | 3 +- src/transformers/image_transforms.py | 6 +- .../models/glpn/image_processing_glpn.py | 86 ++++++++++++++++--- 3 files changed, 79 insertions(+), 16 deletions(-) diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index c81600511a0a..721fc86f0ec5 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -13,7 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .feature_extraction_utils import FeatureExtractionMixin, BatchFeature as BaseBatchFeature +from .feature_extraction_utils import BatchFeature as BaseBatchFeature +from .feature_extraction_utils import FeatureExtractionMixin from .utils import logging diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 6b2ed5522b7e..05112d0118f8 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -80,7 +80,8 @@ def rescale( data_format (`ChannelDimension`, *optional*): The channel dimension format of the image. If not provided, it will be the same as the input image. dtype (`np.dtype`, *optional*, defaults to `np.float32`): - The dtype of the output image. Defaults to `np.float32`. Used for backwards compatibility with feature extractors. + The dtype of the output image. Defaults to `np.float32`. Used for backwards compatibility with feature + extractors. Returns: image: A rescaled np.ndarray image. @@ -214,8 +215,7 @@ def resize( data_format (`ChannelDimension`, *optional*, defaults to `None`): The channel dimension format of the output image. If `None`, will use the inferred format from the input. 
return_numpy (`bool`, *optional*, defaults to `True`): - Whether or not to return the resized image as a numpy array. If False a PIL.Image.Image object is - returned. + Whether or not to return the resized image as a numpy array. If False a PIL.Image.Image object is returned. Returns: image: A resized np.ndarray. diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py index 1cdb455b5052..15605781d11a 100644 --- a/src/transformers/models/glpn/image_processing_glpn.py +++ b/src/transformers/models/glpn/image_processing_glpn.py @@ -14,7 +14,7 @@ # limitations under the License. """Image processor class for GLPN.""" -from typing import Optional, Union +from typing import List, Optional, Union import numpy as np import PIL.Image @@ -36,16 +36,16 @@ class GLPNImageProcessor(BaseImageProcessor): Args: do_resize (`bool`, *optional*, defaults to `True`): - Whether to resize the input based on certain `size_divisor`. - size_divisor (`int` or `Tuple(int)`, *optional*, defaults to 32): - Make sure the input is divisible by this value. Only has an effect if `do_resize` is set to `True`. - resample (`int`, *optional*, defaults to `PIL.Image.Resampling.BILINEAR`): - An optional resampling filter. This can be one of `PIL.Image.Resampling.NEAREST`, - `PIL.Image.Resampling.BOX`, `PIL.Image.Resampling.BILINEAR`, `PIL.Image.Resampling.HAMMING`, - `PIL.Image.Resampling.BICUBIC` or `PIL.Image.Resampling.LANCZOS`. Only has an effect if `do_resize` is set - to `True`. + Set the class default for the `do_resize` parameter. Controls whether to resize the image's (height, width) + dimensions, rounding them down to the closest multiple of `size_divisor`. do_rescale (`bool`, *optional*, defaults to `True`): - Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). + Set the class default for the `do_rescale` parameter. 
Controls whether or not to apply the scaling factor + (to make pixel values floats between 0. and 1.). + size_divisor (`int`, *optional*, defaults to 32): + Set the class default for the `size_divisor` parameter. When `do_resize` is `True`, images are resized so + their height and width are rounded down to the closest multiple of `size_divisor`. + resample (`PIL.Image.Resampling`, *optional*, defaults to `PIL.Image.Resampling.BILINEAR`): + Set the class default for `resample`. Defines the resampling filter to use if resizing the image. """ model_input_names = ["pixel_values"] @@ -62,12 +62,32 @@ def __init__( def resize( self, image: np.ndarray, - size_divisor: Union[int, float], + size_divisor: int, resample: PIL.Image.Resampling, data_format: Optional[ChannelDimension] = None, **kwargs ) -> np.ndarray: + """ + Resize the image, rounding the (height, width) dimensions down to the closest multiple of size_divisor. + + If the image is of dimension (3, 260, 170) and size_divisor is 32, the image will be resized to (3, 256, 160). + + Args: + image (`np.ndarray`): + The image to resize. + size_divisor (`int`): + The image is resized so its height and width are rounded down to the closest multiple of + `size_divisor`. + resample (`PIL.Image.Resampling`): + Resampling filter to use when resizing the image. + data_format (`ChannelDimension`, *optional*): + The channel dimension format for the output image. If `None`, the channel dimension format of the input + image is used. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. 
+ """ height, width = get_image_size(image) + # Rounds the height and width down to the closest multiple of size_divisor new_h = height // size_divisor * size_divisor new_w = width // size_divisor * size_divisor image = resize(image, (new_h, new_w), resample=resample, data_format=data_format, **kwargs) @@ -76,11 +96,25 @@ def resize( def rescale( self, image: np.ndarray, scale: Union[int, float], data_format: Optional[ChannelDimension] = None, **kwargs ) -> np.ndarray: + """ + Rescale the image by the given scaling factor `scale`. + + Args: + image (`np.ndarray`): + The image to rescale. + scale (`int` or `float`): + The scaling factor to rescale pixel values by. + data_format (`ChannelDimension`, *optional*): + The channel dimension format for the output image. If `None`, the channel dimension format of the input + image is used. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. + """ return rescale(image=image, scale=scale, data_format=data_format, **kwargs) def preprocess( self, - images, + images: Union["PIL.Image.Image", TensorType, List["PIL.Image.Image"], List[TensorType]], do_resize: bool = None, do_rescale: bool = None, size_divisor: int = None, @@ -89,6 +123,34 @@ def preprocess( data_format: ChannelDimension = ChannelDimension.FIRST, **kwargs ) -> BatchFeature: + """ + Preprocess the given images. + + Args: + images (`PIL.Image.Image` or `TensorType` or `List[np.ndarray]` or `List[TensorType]`): + The image or images to preprocess. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the input such that the (height, width) dimensions are a multiple of `size_divisor`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). 
+ size_divisor (`int`, *optional*, defaults to `self.size_divisor`): + When `do_resize` is `True`, images are resized so their height and width are rounded down to the + closest multiple of `size_divisor`. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PIL.Image.Resampling`, + Only has an effect if `do_resize` is set to `True`. + return_tensors (`str`, *optional*, defaults to `None`): + The type of tensors to return. Can be one of: + - `None`: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. + """ do_resize = do_resize if do_resize is not None else self.do_resize do_rescale = do_rescale if do_rescale is not None else self.do_rescale size_divisor = size_divisor if size_divisor is not None else self.size_divisor