diff --git a/docs/source/en/tasks/object_detection.md b/docs/source/en/tasks/object_detection.md
index aebac36c90c2..39362b461585 100644
--- a/docs/source/en/tasks/object_detection.md
+++ b/docs/source/en/tasks/object_detection.md
@@ -206,10 +206,10 @@ Instantiate the image processor from the same checkpoint as the model you want t
 
 >>> image_processor = AutoImageProcessor.from_pretrained(
 ...     MODEL_NAME,
-...     # At this moment we recommend using external transform to pad and resize images.
-...     # It`s faster and yields better results for object-detection models.
-...     do_pad=False,
-...     do_resize=False,
+...     do_resize=True,
+...     size={"max_height": MAX_SIZE, "max_width": MAX_SIZE},
+...     do_pad=True,
+...     pad_size={"height": MAX_SIZE, "width": MAX_SIZE},
 ... )
 ```
@@ -217,38 +217,28 @@ Before passing the images to the `image_processor`, apply two preprocessing tran
 
 - Augmenting images
 - Reformatting annotations to meet DETR expectations
 
-First, to make sure the model does not overfit on the training data, you can apply image augmentation with any data augmentation library. Here we use [Albumentations](https://albumentations.ai/docs/) ...
+First, to make sure the model does not overfit on the training data, you can apply image augmentation with any data augmentation library. Here we use [Albumentations](https://albumentations.ai/docs/). This library ensures that transformations affect the image and update the bounding boxes accordingly.
 The 🤗 Datasets library documentation has a detailed [guide on how to augment images for object detection](https://huggingface.co/docs/datasets/object_detection),
-and it uses the exact same dataset as an example. Apply the same approach here, resize each image to (480, 480),
-flip it horizontally, and brighten it. For additional augmentation options, explore the [Albumentations Demo Space](https://huggingface.co/spaces/qubvel-hf/albumentations-demo).
+and it uses the exact same dataset as an example. Apply some geometric and color transformations to the image. For additional augmentation options, explore the [Albumentations Demo Space](https://huggingface.co/spaces/qubvel-hf/albumentations-demo).
 
 ```py
 >>> import albumentations as A
 
 >>> max_size = IMAGE_SIZE
 
->>> # Resize image longest edge to 480 and then pad image to square 480x480.
->>> # This padding and resizing strategy give better results, see
->>> # https://github.com/huggingface/transformers/pull/30422#discussion_r1584647408
->>> basic_transforms = [
-...     A.LongestMaxSize(max_size=max_size),
-...     A.PadIfNeeded(max_size, max_size, border_mode=0, value=(128, 128, 128), position="top_left"),
-... ]
-
 >>> train_augment_and_transform = A.Compose(
 ...     [
 ...         A.Perspective(p=0.1),
 ...         A.HorizontalFlip(p=0.5),
 ...         A.RandomBrightnessContrast(p=0.5),
 ...         A.HueSaturationValue(p=0.1),
-...         *basic_transforms,
 ...     ],
 ...     bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True, min_area=25),
 ... )
 
 >>> validation_transform = A.Compose(
-...     basic_transforms,
+...     [A.NoOp()],
 ...     bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True),
 ... )
 ```
@@ -294,7 +284,7 @@ The `image_processor` expects the annotations to be in the following format: `{'
 Now you can combine the image and annotation transformations to use on a batch of examples:
 
 ```py
->>> def augment_and_transform_batch(examples, transform, image_processor):
+>>> def augment_and_transform_batch(examples, transform, image_processor, return_pixel_mask=False):
 ...     """Apply augmentations and format annotations in COCO format for object detection task"""
 
 ...     images = []
@@ -315,6 +305,9 @@ Now you can combine the image and annotation transformations to use on a batch o
 ...     # Apply the image processor transformations: resizing, rescaling, normalization
 ...     result = image_processor(images=images, annotations=annotations, return_tensors="pt")
 
+...     if not return_pixel_mask:
+...         result.pop("pixel_mask", None)
+
 ...     return result
 ```
@@ -1485,25 +1478,12 @@ Now that you have finetuned a model, evaluated it, and uploaded it to the Huggin
 ```py
 >>> import torch
 >>> import requests
->>> import numpy as np
->>> import albumentations as A
->>> from PIL import Image
+>>> from PIL import Image, ImageDraw
 >>> from transformers import AutoImageProcessor, AutoModelForObjectDetection
 
 >>> url = "https://images.pexels.com/photos/8413299/pexels-photo-8413299.jpeg?auto=compress&cs=tinysrgb&w=630&h=375&dpr=2"
 >>> image = Image.open(requests.get(url, stream=True).raw)
-
->>> # Define transformations for inference
->>> resize_and_pad = A.Compose([
-...     A.LongestMaxSize(max_size=max_size),
-...     A.PadIfNeeded(max_size, max_size, border_mode=0, value=(128, 128, 128), position="top_left"),
-... ])
-
->>> # This one is for visualization with no padding
->>> resize_only = A.Compose([
-...     A.LongestMaxSize(max_size=max_size),
-... ])
 ```
 
 Load model and image processor from the Hugging Face Hub (skip to use already trained in this session):
@@ -1519,12 +1499,11 @@ Load model and image processor from the Hugging Face Hub (skip to use already tr
 And detect bounding boxes:
 
 ```py
->>> np_preprocessed_image = resize_and_pad(image=np.array(image))["image"]
 >>> with torch.no_grad():
-...     inputs = image_processor(images=[np_preprocessed_image], return_tensors="pt")
-...     outputs = model(inputs["pixel_values"].to(device))
-...     target_sizes = torch.tensor([np_preprocessed_image.shape[:2]])
+...     inputs = image_processor(images=[image], return_tensors="pt")
+...     outputs = model(**inputs.to(device))
+...     target_sizes = torch.tensor([[image.size[1], image.size[0]]])
 ...     results = image_processor.post_process_object_detection(outputs, threshold=0.3, target_sizes=target_sizes)[0]
 
 >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
@@ -1543,9 +1522,7 @@ Detected Coverall with confidence 0.391 at location [68.61, 126.66, 309.03, 318.
 Let's plot the result:
 
 ```py
->>> resized_image = resize_only(image=np.array(image))["image"]
->>> resized_image = Image.fromarray(resized_image)
->>> draw = ImageDraw.Draw(resized_image)
+>>> draw = ImageDraw.Draw(image)
 
 >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
 ...     box = [round(i, 2) for i in box.tolist()]
@@ -1553,7 +1530,7 @@ Let's plot the result:
 ...     draw.rectangle((x, y, x2, y2), outline="red", width=1)
 ...     draw.text((x, y), model.config.id2label[label.item()], fill="white")
 
->>> resized_image
+>>> image
 ```
diff --git a/examples/pytorch/object-detection/README.md b/examples/pytorch/object-detection/README.md
index 5150a76b7869..ab474f760753 100644
--- a/examples/pytorch/object-detection/README.md
+++ b/examples/pytorch/object-detection/README.md
@@ -50,7 +50,7 @@ python run_object_detection.py \
     --per_device_train_batch_size 8 \
     --gradient_accumulation_steps 1 \
     --remove_unused_columns false \
-    --eval_do_concat_batches false \ 
+    --eval_do_concat_batches false \
     --ignore_mismatched_sizes true \
     --metric_for_best_model eval_map \
     --greater_is_better true \
@@ -200,6 +200,7 @@ Where `metadata.jsonl` is a file with the following structure:
 {"file_name": "0002.jpg", "objects": {"bbox": [[810.0, 100.0, 57.0, 28.0]], "categories": [1], "id": [2], "area": [40.0]}}
 ...
 ```
+The training script supports bounding boxes in COCO format (x_min, y_min, width, height).
 
 Then, you can load the dataset with just a few lines of code:
diff --git a/examples/pytorch/object-detection/run_object_detection.py b/examples/pytorch/object-detection/run_object_detection.py
index a3849e195f19..62e60acc7234 100644
--- a/examples/pytorch/object-detection/run_object_detection.py
+++ b/examples/pytorch/object-detection/run_object_detection.py
@@ -117,7 +117,10 @@ def convert_bbox_yolo_to_pascal(boxes: torch.Tensor, image_size: Tuple[int, int]
 
 
 def augment_and_transform_batch(
-    examples: Mapping[str, Any], transform: A.Compose, image_processor: AutoImageProcessor
+    examples: Mapping[str, Any],
+    transform: A.Compose,
+    image_processor: AutoImageProcessor,
+    return_pixel_mask: bool = False,
 ) -> BatchFeature:
     """Apply augmentations and format annotations in COCO format for object detection task"""
 
@@ -139,6 +142,9 @@ def augment_and_transform_batch(
 
     # Apply the image processor transformations: resizing, rescaling, normalization
     result = image_processor(images=images, annotations=annotations, return_tensors="pt")
 
+    if not return_pixel_mask:
+        result.pop("pixel_mask", None)
+
     return result
 
@@ -415,12 +421,10 @@ def main():
     )
     image_processor = AutoImageProcessor.from_pretrained(
         model_args.image_processor_name or model_args.model_name_or_path,
-        # At this moment we recommend using external transform to pad and resize images.
-        # It`s faster and yields much better results for object-detection models.
-        do_pad=False,
-        do_resize=False,
-        # We will save image size parameter in config just for reference
-        size={"longest_edge": data_args.image_square_size},
+        do_resize=True,
+        size={"max_height": data_args.image_square_size, "max_width": data_args.image_square_size},
+        do_pad=True,
+        pad_size={"height": data_args.image_square_size, "width": data_args.image_square_size},
         **common_pretrained_args,
     )
 
@@ -428,10 +432,6 @@ def main():
     # Define image augmentations and dataset transforms
     # ------------------------------------------------------------------------------------------------
     max_size = data_args.image_square_size
-    basic_transforms = [
-        A.LongestMaxSize(max_size=max_size),
-        A.PadIfNeeded(max_size, max_size, border_mode=0, value=(128, 128, 128), position="top_left"),
-    ]
     train_augment_and_transform = A.Compose(
         [
             A.Compose(
@@ -453,12 +453,11 @@ def main():
             A.HorizontalFlip(p=0.5),
             A.RandomBrightnessContrast(p=0.5),
             A.HueSaturationValue(p=0.1),
-            *basic_transforms,
         ],
         bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True, min_area=25),
     )
 
     validation_transform = A.Compose(
-        basic_transforms,
+        [A.NoOp()],
         bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True),
     )
diff --git a/examples/pytorch/object-detection/run_object_detection_no_trainer.py b/examples/pytorch/object-detection/run_object_detection_no_trainer.py
index f0889f3d282f..8bea58aa5064 100644
--- a/examples/pytorch/object-detection/run_object_detection_no_trainer.py
+++ b/examples/pytorch/object-detection/run_object_detection_no_trainer.py
@@ -120,7 +120,10 @@ def convert_bbox_yolo_to_pascal(boxes: torch.Tensor, image_size: Tuple[int, int]
 
 # Copied from examples/pytorch/object-detection/run_object_detection.augment_and_transform_batch
 def augment_and_transform_batch(
-    examples: Mapping[str, Any], transform: A.Compose, image_processor: AutoImageProcessor
+    examples: Mapping[str, Any],
+    transform: A.Compose,
+    image_processor: AutoImageProcessor,
+    return_pixel_mask: bool = False,
 ) -> BatchFeature:
     """Apply augmentations and format annotations in COCO format for object detection task"""
 
@@ -142,6 +145,9 @@ def augment_and_transform_batch(
 
     # Apply the image processor transformations: resizing, rescaling, normalization
     result = image_processor(images=images, annotations=annotations, return_tensors="pt")
 
+    if not return_pixel_mask:
+        result.pop("pixel_mask", None)
+
     return result
 
@@ -473,12 +479,10 @@ def main():
     )
     image_processor = AutoImageProcessor.from_pretrained(
         args.model_name_or_path,
-        # At this moment we recommend using external transform to pad and resize images.
-        # It`s faster and yields much better results for object-detection models.
-        do_pad=False,
-        do_resize=False,
-        # We will save image size parameter in config just for reference
-        size={"longest_edge": args.image_square_size},
+        do_resize=True,
+        size={"max_height": args.image_square_size, "max_width": args.image_square_size},
+        do_pad=True,
+        pad_size={"height": args.image_square_size, "width": args.image_square_size},
         **common_pretrained_args,
     )
 
@@ -486,10 +490,6 @@ def main():
     # Define image augmentations and dataset transforms
     # ------------------------------------------------------------------------------------------------
     max_size = args.image_square_size
-    basic_transforms = [
-        A.LongestMaxSize(max_size=max_size),
-        A.PadIfNeeded(max_size, max_size, border_mode=0, value=(128, 128, 128), position="top_left"),
-    ]
     train_augment_and_transform = A.Compose(
         [
             A.Compose(
@@ -511,12 +511,11 @@ def main():
             A.HorizontalFlip(p=0.5),
             A.RandomBrightnessContrast(p=0.5),
             A.HueSaturationValue(p=0.1),
-            *basic_transforms,
         ],
         bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True, min_area=25),
     )
 
     validation_transform = A.Compose(
-        basic_transforms,
+        [A.NoOp()],
         bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True),
     )
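A note on why dropping `pixel_mask` is safe here: because the processor now pads every image to the same `image_square_size` square, batching reduces to stacking `pixel_values`. Below is a sketch of the kind of collate function that pairs with this preprocessing; the names are illustrative, not a quote of the scripts:

```py
import torch


def collate_fn(batch):
    # Every sample already has identically shaped pixel_values after the
    # processor's resize + pad, so they can be stacked without any mask.
    data = {"pixel_values": torch.stack([sample["pixel_values"] for sample in batch])}
    data["labels"] = [sample["labels"] for sample in batch]
    # If a model variant still requires a mask, call augment_and_transform_batch
    # with return_pixel_mask=True and stack it here as well.
    if "pixel_mask" in batch[0]:
        data["pixel_mask"] = torch.stack([sample["pixel_mask"] for sample in batch])
    return data
```

With `A.NoOp()` as the only validation transform, Albumentations stays in the pipeline solely to clip and filter boxes via `BboxParams`, while all resizing and padding is delegated to the image processor.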