diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index dd7bb326993d..b776270faeb4 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -131,6 +131,13 @@ def convert_rgb(self, image): return image.convert("RGB") + def rescale(self, image: np.ndarray, scale: Union[float, int]) -> np.ndarray: + """ + Rescale a numpy image by scale amount + """ + self._ensure_format_supported(image) + return image * scale + def to_numpy_array(self, image, rescale=None, channel_first=True): """ Converts `image` to a numpy array. Optionally rescales it and puts the channel dimension as the first @@ -153,11 +160,10 @@ def to_numpy_array(self, image, rescale=None, channel_first=True): if is_torch_tensor(image): image = image.numpy() - if rescale is None: - rescale = isinstance(image.flat[0], np.integer) + rescale = isinstance(image.flat[0], np.integer) if rescale is None else rescale if rescale: - image = image.astype(np.float32) / 255.0 + image = self.rescale(image.astype(np.float32), 1 / 255.0) if channel_first and image.ndim == 3: image = image.transpose(2, 0, 1) @@ -184,7 +190,7 @@ def expand_dims(self, image): image = np.expand_dims(image, axis=0) return image - def normalize(self, image, mean, std): + def normalize(self, image, mean, std, rescale=False): """ Normalizes `image` with `mean` and `std`. Note that this will trigger a conversion of `image` to a NumPy array if it's a PIL Image. @@ -196,11 +202,21 @@ def normalize(self, image, mean, std): The mean (per channel) to use for normalization. std (`List[float]` or `np.ndarray` or `torch.Tensor`): The standard deviation (per channel) to use for normalization. + rescale (`bool`, *optional*, defaults to `False`): + Whether or not to rescale the image to be between 0 and 1. If a PIL image is provided, scaling will + happen automatically. """ self._ensure_format_supported(image) if isinstance(image, PIL.Image.Image): - image = self.to_numpy_array(image) + image = self.to_numpy_array(image, rescale=True) + # If the input image is a PIL image, it automatically gets rescaled. If it's another + # type it may need rescaling. + elif rescale: + if isinstance(image, np.ndarray): + image = self.rescale(image.astype(np.float32), 1 / 255.0) + elif is_torch_tensor(image): + image = self.rescale(image.float(), 1 / 255.0) if isinstance(image, np.ndarray): if not isinstance(mean, np.ndarray): diff --git a/tests/utils/test_image_utils.py b/tests/utils/test_image_utils.py index 6c870e3341cd..3c1be7102c1a 100644 --- a/tests/utils/test_image_utils.py +++ b/tests/utils/test_image_utils.py @@ -58,13 +58,13 @@ def test_conversion_image_to_array(self): array3 = feature_extractor.to_numpy_array(image, rescale=False) self.assertTrue(array3.dtype, np.uint8) self.assertEqual(array3.shape, (3, 16, 32)) - self.assertTrue(np.array_equal(array1, array3.astype(np.float32) / 255.0)) + self.assertTrue(np.array_equal(array1, array3.astype(np.float32) * (1 / 255.0))) # Conversion with no rescale and not channel first array4 = feature_extractor.to_numpy_array(image, rescale=False, channel_first=False) self.assertTrue(array4.dtype, np.uint8) self.assertEqual(array4.shape, (16, 32, 3)) - self.assertTrue(np.array_equal(array2, array4.astype(np.float32) / 255.0)) + self.assertTrue(np.array_equal(array2, array4.astype(np.float32) * (1 / 255.0))) def test_conversion_array_to_array(self): feature_extractor = ImageFeatureExtractionMixin() @@ -74,13 +74,13 @@ def test_conversion_array_to_array(self): array1 = feature_extractor.to_numpy_array(array) self.assertTrue(array1.dtype, np.float32) self.assertEqual(array1.shape, (3, 16, 32)) - self.assertTrue(np.array_equal(array1, array.transpose(2, 0, 1).astype(np.float32) / 255.0)) + self.assertTrue(np.array_equal(array1, array.transpose(2, 0, 1).astype(np.float32) * (1 / 255.0))) # Same with no permute array2 = feature_extractor.to_numpy_array(array, channel_first=False) self.assertTrue(array2.dtype, np.float32) self.assertEqual(array2.shape, (16, 32, 3)) - self.assertTrue(np.array_equal(array2, array.astype(np.float32) / 255.0)) + self.assertTrue(np.array_equal(array2, array.astype(np.float32) * (1 / 255.0))) # Force rescale to False array3 = feature_extractor.to_numpy_array(array, rescale=False) @@ -110,13 +110,13 @@ def test_conversion_torch_to_array(self): array1 = feature_extractor.to_numpy_array(array) self.assertTrue(array1.dtype, np.float32) self.assertEqual(array1.shape, (3, 16, 32)) - self.assertTrue(np.array_equal(array1, array.transpose(2, 0, 1).astype(np.float32) / 255.0)) + self.assertTrue(np.array_equal(array1, array.transpose(2, 0, 1).astype(np.float32) * (1 / 255.0))) # Same with no permute array2 = feature_extractor.to_numpy_array(array, channel_first=False) self.assertTrue(array2.dtype, np.float32) self.assertEqual(array2.shape, (16, 32, 3)) - self.assertTrue(np.array_equal(array2, array.astype(np.float32) / 255.0)) + self.assertTrue(np.array_equal(array2, array.astype(np.float32) * (1 / 255.0))) # Force rescale to False array3 = feature_extractor.to_numpy_array(array, rescale=False) @@ -160,7 +160,7 @@ def test_conversion_array_to_image(self): self.assertTrue(np.array_equal(np.array(image2), array)) # If the array has floating type, it's rescaled by default. - image3 = feature_extractor.to_pil_image(array.astype(np.float32) / 255.0) + image3 = feature_extractor.to_pil_image(array.astype(np.float32) * (1 / 255.0)) self.assertTrue(isinstance(image3, PIL.Image.Image)) self.assertTrue(np.array_equal(np.array(image3), array)) @@ -170,7 +170,7 @@ def test_conversion_array_to_image(self): self.assertTrue(np.array_equal(np.array(image4), array)) # And with floats + channel first. - image5 = feature_extractor.to_pil_image(array.transpose(2, 0, 1).astype(np.float32) / 255.0) + image5 = feature_extractor.to_pil_image(array.transpose(2, 0, 1).astype(np.float32) * (1 / 255.0)) self.assertTrue(isinstance(image5, PIL.Image.Image)) self.assertTrue(np.array_equal(np.array(image5), array)) @@ -201,7 +201,7 @@ def test_conversion_tensor_to_image(self): self.assertTrue(np.array_equal(np.array(image4), array)) # And with floats + channel first. - image5 = feature_extractor.to_pil_image(tensor.permute(2, 0, 1).float() / 255.0) + image5 = feature_extractor.to_pil_image(tensor.permute(2, 0, 1).float() * (1 / 255.0)) self.assertTrue(isinstance(image5, PIL.Image.Image)) self.assertTrue(np.array_equal(np.array(image5), array)) @@ -316,7 +316,7 @@ def test_normalize_image(self): self.assertEqual(normalized_image.shape, (3, 16, 32)) # During the conversion rescale and channel first will be applied. - expected = array.transpose(2, 0, 1).astype(np.float32) / 255.0 + expected = array.transpose(2, 0, 1).astype(np.float32) * (1 / 255.0) np_mean = np.array(mean).astype(np.float32)[:, None, None] np_std = np.array(std).astype(np.float32)[:, None, None] expected = (expected - np_mean) / np_std