From 9415a4f9616561175ba6ff7d7711b6e328239134 Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Mon, 18 Nov 2024 16:32:16 +0000 Subject: [PATCH 1/7] refactor image_processing_auto logic --- .../source/en/main_classes/image_processor.md | 28 ++++--------- .../models/auto/image_processing_auto.py | 42 ++++++++++++------- .../models/auto/test_image_processing_auto.py | 4 +- tests/test_image_processing_common.py | 5 ++- 4 files changed, 40 insertions(+), 39 deletions(-) diff --git a/docs/source/en/main_classes/image_processor.md b/docs/source/en/main_classes/image_processor.md index 320916f1ce94..c07ef0e23946 100644 --- a/docs/source/en/main_classes/image_processor.md +++ b/docs/source/en/main_classes/image_processor.md @@ -20,13 +20,7 @@ An image processor is in charge of preparing input features for vision models an Fast image processors are available for a few models and more will be added in the future. They are based on the [torchvision](https://pytorch.org/vision/stable/index.html) library and provide a significant speed-up, especially when processing on GPU. They have the same API as the base image processors and can be used as drop-in replacements. -To use a fast image processor, you need to install the `torchvision` library, and set the `use_fast` argument to `True` when instantiating the image processor: - -```python -from transformers import AutoImageProcessor - -processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50", use_fast=True) -``` +Fast image processors are used by default when available and when the `torchvision` library is installed. To force the use of a standard processor, you can set the `use_fast` argument to `False` when instantiating the image processor. When using a fast image processor, you can also set the `device` argument to specify the device on which the processing should be done. By default, the processing is done on the same device as the inputs if the inputs are tensors, or on the CPU otherwise. @@ -42,21 +36,17 @@ images_processed = processor(images, return_tensors="pt", device="cuda") Here are some speed comparisons between the base and fast image processors for the `DETR` and `RT-DETR` models, and how they impact overall inference time:
-
- -
-
- -
+ +
+
+
-
- -
-
- -
+ +
+
+
These benchmarks were run on an [AWS EC2 g5.2xlarge instance](https://aws.amazon.com/ec2/instance-types/g5/), utilizing an NVIDIA A10G Tensor Core GPU. diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 0670637c9152..dc416b0ba080 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -175,7 +175,7 @@ IMAGE_PROCESSOR_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, IMAGE_PROCESSOR_MAPPING_NAMES) -def image_processor_class_from_name(class_name: str): +def get_image_processor_class_from_name(class_name: str): if class_name == "BaseImageProcessorFast": return BaseImageProcessorFast @@ -368,7 +368,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): identifier allowed by git. use_fast (`bool`, *optional*, defaults to `False`): Use a fast torchvision-base image processor if it is supported for a given model. - If a fast tokenizer is not available for a given model, a normal numpy-based image processor + If a fast image processor is not available for a given model, a normal numpy-based image processor is returned instead. return_unused_kwargs (`bool`, *optional*, defaults to `False`): If `False`, then this function returns just the final image processor object. If `True`, then this @@ -416,7 +416,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): kwargs["token"] = use_auth_token config = kwargs.pop("config", None) - use_fast = kwargs.pop("use_fast", None) + use_fast = kwargs.pop("use_fast", True) trust_remote_code = kwargs.pop("trust_remote_code", None) kwargs["_from_auto"] = True @@ -451,23 +451,23 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): if not is_timm_config_dict(config_dict): raise initial_exception - image_processor_class = config_dict.get("image_processor_type", None) + image_processor_type = config_dict.get("image_processor_type", None) image_processor_auto_map = None if "AutoImageProcessor" in config_dict.get("auto_map", {}): image_processor_auto_map = config_dict["auto_map"]["AutoImageProcessor"] # If we still don't have the image processor class, check if we're loading from a previous feature extractor config # and if so, infer the image processor class from there. - if image_processor_class is None and image_processor_auto_map is None: + if image_processor_type is None and image_processor_auto_map is None: feature_extractor_class = config_dict.pop("feature_extractor_type", None) if feature_extractor_class is not None: - image_processor_class = feature_extractor_class.replace("FeatureExtractor", "ImageProcessor") + image_processor_type = feature_extractor_class.replace("FeatureExtractor", "ImageProcessor") if "AutoFeatureExtractor" in config_dict.get("auto_map", {}): feature_extractor_auto_map = config_dict["auto_map"]["AutoFeatureExtractor"] image_processor_auto_map = feature_extractor_auto_map.replace("FeatureExtractor", "ImageProcessor") # If we don't find the image processor class in the image processor config, let's try the model config. - if image_processor_class is None and image_processor_auto_map is None: + if image_processor_type is None and image_processor_auto_map is None: if not isinstance(config, PretrainedConfig): config = AutoConfig.from_pretrained( pretrained_model_name_or_path, @@ -475,18 +475,28 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): **kwargs, ) # It could be in `config.image_processor_type`` - image_processor_class = getattr(config, "image_processor_type", None) + image_processor_type = getattr(config, "image_processor_type", None) if hasattr(config, "auto_map") and "AutoImageProcessor" in config.auto_map: image_processor_auto_map = config.auto_map["AutoImageProcessor"] - if image_processor_class is not None: - # Update class name to reflect the use_fast option. If class is not found, None is returned. - if use_fast is not None: - if use_fast and not image_processor_class.endswith("Fast"): - image_processor_class += "Fast" - elif not use_fast and image_processor_class.endswith("Fast"): - image_processor_class = image_processor_class[:-4] - image_processor_class = image_processor_class_from_name(image_processor_class) + image_processor_class = None + if image_processor_type is not None: + # Update class name to reflect the use_fast option. If class is not found, we fall back to the slow version. + if use_fast: + if not image_processor_type.endswith("Fast"): + image_processor_type += "Fast" + image_processor_class = get_image_processor_class_from_name(image_processor_type) + if image_processor_class is None: + logger.warning_once( + "`use_fast` is set to `True` but the image processor class does not have a fast version. " + " Falling back to the slow version." + ) + image_processor_class = get_image_processor_class_from_name(image_processor_type[:-4]) + else: + image_processor_type = ( + image_processor_type[:-4] if image_processor_type.endswith("Fast") else image_processor_type + ) + image_processor_class = get_image_processor_class_from_name(image_processor_type) has_remote_code = image_processor_auto_map is not None has_local_code = image_processor_class is not None or type(config) in IMAGE_PROCESSOR_MAPPING diff --git a/tests/models/auto/test_image_processing_auto.py b/tests/models/auto/test_image_processing_auto.py index c0046ae1c363..e16c9f1c80a1 100644 --- a/tests/models/auto/test_image_processing_auto.py +++ b/tests/models/auto/test_image_processing_auto.py @@ -140,9 +140,9 @@ def test_image_processor_not_found(self): def test_use_fast_selection(self): checkpoint = "hf-internal-testing/tiny-random-vit" - # Slow image processor is selected by default + # Fast image processor is selected by default image_processor = AutoImageProcessor.from_pretrained(checkpoint) - self.assertIsInstance(image_processor, ViTImageProcessor) + self.assertIsInstance(image_processor, ViTImageProcessorFast) # Fast image processor is selected when use_fast=True image_processor = AutoImageProcessor.from_pretrained(checkpoint, use_fast=True) diff --git a/tests/test_image_processing_common.py b/tests/test_image_processing_common.py index 7d89b43ce35b..221552175a93 100644 --- a/tests/test_image_processing_common.py +++ b/tests/test_image_processing_common.py @@ -228,14 +228,15 @@ def test_image_processor_from_and_save_pretrained(self): self.assertEqual(image_processor_second.to_dict(), image_processor_first.to_dict()) def test_image_processor_save_load_with_autoimageprocessor(self): - for image_processing_class in self.image_processor_list: + for i, image_processing_class in enumerate(self.image_processor_list): image_processor_first = image_processing_class(**self.image_processor_dict) with tempfile.TemporaryDirectory() as tmpdirname: saved_file = image_processor_first.save_pretrained(tmpdirname)[0] check_json_file_has_correct_format(saved_file) - image_processor_second = AutoImageProcessor.from_pretrained(tmpdirname) + use_fast = i == 1 + image_processor_second = AutoImageProcessor.from_pretrained(tmpdirname, use_fast=use_fast) self.assertEqual(image_processor_second.to_dict(), image_processor_first.to_dict()) From 7454c6d8ac114741cd939c62882c4ca141c01910 Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Mon, 18 Nov 2024 16:48:12 +0000 Subject: [PATCH 2/7] fix fast image processor tests --- tests/models/detr/test_image_processing_detr.py | 4 +++- tests/models/rt_detr/test_image_processing_rt_detr.py | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/models/detr/test_image_processing_detr.py b/tests/models/detr/test_image_processing_detr.py index f91c52087366..a0b469f2de92 100644 --- a/tests/models/detr/test_image_processing_detr.py +++ b/tests/models/detr/test_image_processing_detr.py @@ -19,7 +19,7 @@ import numpy as np -from transformers.testing_utils import require_torch, require_torch_gpu, require_vision, slow +from transformers.testing_utils import require_torch, require_torch_gpu, require_torchvision, require_vision, slow from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available from ...test_image_processing_common import AnnotationFormatTestMixin, ImageProcessingTestMixin, prepare_image_inputs @@ -669,6 +669,7 @@ def test_longest_edge_shortest_edge_resizing_strategy(self): @slow @require_torch_gpu + @require_torchvision def test_fast_processor_equivalence_cpu_gpu_coco_detection_annotations(self): # prepare image and target image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") @@ -724,6 +725,7 @@ def test_fast_processor_equivalence_cpu_gpu_coco_detection_annotations(self): @slow @require_torch_gpu + @require_torchvision def test_fast_processor_equivalence_cpu_gpu_coco_panoptic_annotations(self): # prepare image, target and masks_path image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") diff --git a/tests/models/rt_detr/test_image_processing_rt_detr.py b/tests/models/rt_detr/test_image_processing_rt_detr.py index e7bfbae3f9c2..2be3ea3e7651 100644 --- a/tests/models/rt_detr/test_image_processing_rt_detr.py +++ b/tests/models/rt_detr/test_image_processing_rt_detr.py @@ -16,7 +16,7 @@ import requests -from transformers.testing_utils import require_torch, require_torch_gpu, require_vision, slow +from transformers.testing_utils import require_torch, require_torch_gpu, require_torchvision, require_vision, slow from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs @@ -374,6 +374,7 @@ def test_batched_coco_detection_annotations(self): @slow @require_torch_gpu + @require_torchvision # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_fast_processor_equivalence_cpu_gpu_coco_detection_annotations def test_fast_processor_equivalence_cpu_gpu_coco_detection_annotations(self): # prepare image and target From 7ecf0f9d787b024e17df689ad2eed8627f6818c1 Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Mon, 18 Nov 2024 17:39:55 +0000 Subject: [PATCH 3/7] Fix tests fast vit image processor --- .../models/vit/image_processing_vit_fast.py | 1 + .../test_processor_vision_text_dual_encoder.py | 14 ++++++++------ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/vit/image_processing_vit_fast.py b/src/transformers/models/vit/image_processing_vit_fast.py index 98ecfb3927a3..e8abdcfe5cc8 100644 --- a/src/transformers/models/vit/image_processing_vit_fast.py +++ b/src/transformers/models/vit/image_processing_vit_fast.py @@ -254,6 +254,7 @@ def preprocess( image_std = image_std if image_std is not None else self.image_std size = size if size is not None else self.size do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + return_tensors = "pt" if return_tensors is None else return_tensors # Make hashable for cache size = SizeDict(**size) image_mean = tuple(image_mean) if isinstance(image_mean, list) else image_mean diff --git a/tests/models/vision_text_dual_encoder/test_processor_vision_text_dual_encoder.py b/tests/models/vision_text_dual_encoder/test_processor_vision_text_dual_encoder.py index c9386a160f84..e62bfe704d1d 100644 --- a/tests/models/vision_text_dual_encoder/test_processor_vision_text_dual_encoder.py +++ b/tests/models/vision_text_dual_encoder/test_processor_vision_text_dual_encoder.py @@ -21,13 +21,13 @@ from transformers import BertTokenizerFast from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES, BertTokenizer from transformers.testing_utils import require_tokenizers, require_vision -from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available +from transformers.utils import IMAGE_PROCESSOR_NAME, is_torchvision_available, is_vision_available from ...test_processing_common import ProcessorTesterMixin if is_vision_available(): - from transformers import VisionTextDualEncoderProcessor, ViTImageProcessor + from transformers import VisionTextDualEncoderProcessor, ViTImageProcessor, ViTImageProcessorFast @require_tokenizers @@ -63,6 +63,8 @@ def get_tokenizer(self, **kwargs): return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs) def get_image_processor(self, **kwargs): + if is_torchvision_available(): + return ViTImageProcessorFast.from_pretrained(self.tmpdirname, **kwargs) return ViTImageProcessor.from_pretrained(self.tmpdirname, **kwargs) def tearDown(self): @@ -81,7 +83,7 @@ def test_save_load_pretrained_default(self): self.assertIsInstance(processor.tokenizer, (BertTokenizer, BertTokenizerFast)) self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string()) - self.assertIsInstance(processor.image_processor, ViTImageProcessor) + self.assertIsInstance(processor.image_processor, (ViTImageProcessor, ViTImageProcessorFast)) def test_save_load_pretrained_additional_features(self): processor = VisionTextDualEncoderProcessor( @@ -100,7 +102,7 @@ def test_save_load_pretrained_additional_features(self): self.assertIsInstance(processor.tokenizer, (BertTokenizer, BertTokenizerFast)) self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, ViTImageProcessor) + self.assertIsInstance(processor.image_processor, (ViTImageProcessor, ViTImageProcessorFast)) def test_image_processor(self): image_processor = self.get_image_processor() @@ -110,8 +112,8 @@ def test_image_processor(self): image_input = self.prepare_image_inputs() - input_feat_extract = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") + input_feat_extract = image_processor(image_input, return_tensors="pt") + input_processor = processor(images=image_input, return_tensors="pt") for key in input_feat_extract.keys(): self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) From aa9b213dc21f5d17e078688b3ba974dbc39be8d2 Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Mon, 18 Nov 2024 17:40:12 +0000 Subject: [PATCH 4/7] Add safeguard when use_fast True and torchvision not available --- src/transformers/models/auto/image_processing_auto.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index dc416b0ba080..cf8a59fbcc53 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -482,6 +482,11 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): image_processor_class = None if image_processor_type is not None: # Update class name to reflect the use_fast option. If class is not found, we fall back to the slow version. + if use_fast and not is_torchvision_available(): + logger.warning_once( + "Using `use_fast=True` but `torchvision` is not available. Falling back to the slow image processor." + ) + use_fast = False if use_fast: if not image_processor_type.endswith("Fast"): image_processor_type += "Fast" From a90f537a7bff09e90a44fdec7fb306c017223a31 Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Fri, 29 Nov 2024 21:05:43 +0000 Subject: [PATCH 5/7] change default use_fast back to None, add warnings --- docs/source/en/main_classes/image_processor.md | 9 ++++++++- .../models/auto/image_processing_auto.py | 14 +++++++++++++- tests/models/auto/test_image_processing_auto.py | 5 +++-- 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/docs/source/en/main_classes/image_processor.md b/docs/source/en/main_classes/image_processor.md index c07ef0e23946..cbf6ae95577f 100644 --- a/docs/source/en/main_classes/image_processor.md +++ b/docs/source/en/main_classes/image_processor.md @@ -20,7 +20,14 @@ An image processor is in charge of preparing input features for vision models an Fast image processors are available for a few models and more will be added in the future. They are based on the [torchvision](https://pytorch.org/vision/stable/index.html) library and provide a significant speed-up, especially when processing on GPU. They have the same API as the base image processors and can be used as drop-in replacements. -Fast image processors are used by default when available and when the `torchvision` library is installed. To force the use of a standard processor, you can set the `use_fast` argument to `False` when instantiating the image processor. +To use a fast image processor, you need to install the `torchvision` library, and set the `use_fast` argument to `True` when instantiating the image processor: + +```python +from transformers import AutoImageProcessor + +processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50", use_fast=True) +``` +Note that `use_fast` will be set to `True` by default in a future release. When using a fast image processor, you can also set the `device` argument to specify the device on which the processing should be done. By default, the processing is done on the same device as the inputs if the inputs are tensors, or on the CPU otherwise. diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index cf8a59fbcc53..6dc6118045ae 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -416,7 +416,8 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): kwargs["token"] = use_auth_token config = kwargs.pop("config", None) - use_fast = kwargs.pop("use_fast", True) + # TODO: @yoni, change in v4.48 (use_fast set to True by default) + use_fast = kwargs.pop("use_fast", None) trust_remote_code = kwargs.pop("trust_remote_code", None) kwargs["_from_auto"] = True @@ -480,7 +481,18 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): image_processor_auto_map = config.auto_map["AutoImageProcessor"] image_processor_class = None + # TODO: @yoni, change logic in v4.48 (when use_fast set to True by default) + print("image_processor_type", image_processor_type) if image_processor_type is not None: + # if use_fast is not set and the processor was saved with a fast processor, we use it, otherwise we use the slow processor. + if use_fast is None: + use_fast = image_processor_type.endswith("Fast") + if not use_fast: + logger.warning_once( + "Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. " + "`use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. " + "This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`." + ) # Update class name to reflect the use_fast option. If class is not found, we fall back to the slow version. if use_fast and not is_torchvision_available(): logger.warning_once( diff --git a/tests/models/auto/test_image_processing_auto.py b/tests/models/auto/test_image_processing_auto.py index e16c9f1c80a1..1becf25ae7c3 100644 --- a/tests/models/auto/test_image_processing_auto.py +++ b/tests/models/auto/test_image_processing_auto.py @@ -140,9 +140,10 @@ def test_image_processor_not_found(self): def test_use_fast_selection(self): checkpoint = "hf-internal-testing/tiny-random-vit" - # Fast image processor is selected by default + # TODO: @yoni, change in v4.48 (when use_fast set to True by default) + # Slow image processor is selected by default image_processor = AutoImageProcessor.from_pretrained(checkpoint) - self.assertIsInstance(image_processor, ViTImageProcessorFast) + self.assertIsInstance(image_processor, ViTImageProcessor) # Fast image processor is selected when use_fast=True image_processor = AutoImageProcessor.from_pretrained(checkpoint, use_fast=True) From 94a9c23a7817ac536a5b1bb96ee38e7135a69cc7 Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Fri, 29 Nov 2024 21:11:35 +0000 Subject: [PATCH 6/7] remove debugging print --- src/transformers/models/auto/image_processing_auto.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 6dc6118045ae..350b3401ccea 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -482,7 +482,6 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): image_processor_class = None # TODO: @yoni, change logic in v4.48 (when use_fast set to True by default) - print("image_processor_type", image_processor_type) if image_processor_type is not None: # if use_fast is not set and the processor was saved with a fast processor, we use it, otherwise we use the slow processor. if use_fast is None: From 7c066f2a76811f1c475c665c87cac693512bb51c Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Sun, 15 Dec 2024 18:43:30 +0000 Subject: [PATCH 7/7] call get_image_processor_class_from_name once --- src/transformers/models/auto/image_processing_auto.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 350b3401ccea..db25591eaa35 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -501,13 +501,17 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): if use_fast: if not image_processor_type.endswith("Fast"): image_processor_type += "Fast" - image_processor_class = get_image_processor_class_from_name(image_processor_type) - if image_processor_class is None: + for _, image_processors in IMAGE_PROCESSOR_MAPPING_NAMES.items(): + if image_processor_type in image_processors: + break + else: + image_processor_type = image_processor_type[:-4] + use_fast = False logger.warning_once( "`use_fast` is set to `True` but the image processor class does not have a fast version. " " Falling back to the slow version." ) - image_processor_class = get_image_processor_class_from_name(image_processor_type[:-4]) + image_processor_class = get_image_processor_class_from_name(image_processor_type) else: image_processor_type = ( image_processor_type[:-4] if image_processor_type.endswith("Fast") else image_processor_type