57 changes: 17 additions & 40 deletions docs/source/en/tasks/object_detection.md
@@ -206,49 +206,39 @@ Instantiate the image processor from the same checkpoint as the model you want t

>>> image_processor = AutoImageProcessor.from_pretrained(
... MODEL_NAME,
... # At this moment we recommend using external transform to pad and resize images.
... # It`s faster and yields better results for object-detection models.
... do_pad=False,
... do_resize=False,
... do_resize=True,
... size={"max_height": MAX_SIZE, "max_width": MAX_SIZE},
... do_pad=True,
... pad_size={"height": MAX_SIZE, "width": MAX_SIZE},
... )
```
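
With these settings, every image is resized so that neither side exceeds `MAX_SIZE` and then padded to a `MAX_SIZE` x `MAX_SIZE` square, so all tensors in a batch share the same shape. A quick check — a minimal sketch, assuming `image_processor` is configured as above and using an arbitrary test image:

```py
>>> from PIL import Image
>>> import requests

>>> test_url = "http://images.cocodataset.org/val2017/000000039769.jpg"  # any image works here
>>> test_image = Image.open(requests.get(test_url, stream=True).raw)

>>> encoded = image_processor(images=test_image, return_tensors="pt")
>>> # The image is resized and padded to a fixed square, so pixel_values has shape
>>> # (1, 3, MAX_SIZE, MAX_SIZE), e.g. torch.Size([1, 3, 480, 480]) when MAX_SIZE is 480
>>> print(encoded["pixel_values"].shape)
```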

Before passing the images to the `image_processor`, apply two preprocessing transformations to the dataset:
- Augmenting images
- Reformatting annotations to meet DETR expectations

First, to make sure the model does not overfit on the training data, you can apply image augmentation with any data augmentation library. Here we use [Albumentations](https://albumentations.ai/docs/) ...
First, to make sure the model does not overfit on the training data, you can apply image augmentation with any data augmentation library. Here we use [Albumentations](https://albumentations.ai/docs/).
This library ensures that transformations affect the image and update the bounding boxes accordingly.
The 🤗 Datasets library documentation has a detailed [guide on how to augment images for object detection](https://huggingface.co/docs/datasets/object_detection),
and it uses the exact same dataset as an example. Apply the same approach here, resize each image to (480, 480),
flip it horizontally, and brighten it. For additional augmentation options, explore the [Albumentations Demo Space](https://huggingface.co/spaces/qubvel-hf/albumentations-demo).
and it uses the exact same dataset as an example. Apply some geometric and color transformations to the image. For additional augmentation options, explore the [Albumentations Demo Space](https://huggingface.co/spaces/qubvel-hf/albumentations-demo).

```py
>>> import albumentations as A

>>> max_size = IMAGE_SIZE

>>> # Resize image longest edge to 480 and then pad image to square 480x480.
>>> # This padding and resizing strategy give better results, see
>>> # https://github.com/huggingface/transformers/pull/30422#discussion_r1584647408
>>> basic_transforms = [
... A.LongestMaxSize(max_size=max_size),
... A.PadIfNeeded(max_size, max_size, border_mode=0, value=(128, 128, 128), position="top_left"),
... ]

>>> train_augment_and_transform = A.Compose(
... [
... A.Perspective(p=0.1),
... A.HorizontalFlip(p=0.5),
... A.RandomBrightnessContrast(p=0.5),
... A.HueSaturationValue(p=0.1),
... *basic_transforms,
... ],
... bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True, min_area=25),
... )

>>> validation_transform = A.Compose(
... basic_transforms,
... [A.NoOp()],
... bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True),
... )
```
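
To see what these pipelines return, you can run the train transform on a single example. A minimal sketch, assuming the dataset dict is named `cppe5` as elsewhere in this guide and that each sample's `objects` field holds COCO-format `bbox` coordinates and `category` labels:

```py
>>> import numpy as np

>>> example = cppe5["train"][0]  # assumes the dataset dict is named `cppe5`
>>> out = train_augment_and_transform(
...     image=np.array(example["image"]),
...     bboxes=example["objects"]["bbox"],
...     category=example["objects"]["category"],
... )
>>> # `out` holds the augmented image together with bounding boxes and labels kept in sync:
>>> # out["image"], out["bboxes"], out["category"]
```
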
@@ -294,7 +284,7 @@ The `image_processor` expects the annotations to be in the following format: `{'
Now you can combine the image and annotation transformations to use on a batch of examples:

```py
>>> def augment_and_transform_batch(examples, transform, image_processor):
>>> def augment_and_transform_batch(examples, transform, image_processor, return_pixel_mask=False):
... """Apply augmentations and format annotations in COCO format for object detection task"""

... images = []
@@ -315,6 +305,9 @@ Now you can combine the image and annotation transformations to use on a batch o
... # Apply the image processor transformations: resizing, rescaling, normalization
... result = image_processor(images=images, annotations=annotations, return_tensors="pt")

... if not return_pixel_mask:
... result.pop("pixel_mask", None)

... return result
```
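
The batch transform can then be bound to the train and validation pipelines and attached to the dataset splits. A sketch of how this might look, assuming the dataset dict is named `cppe5` and already has `train`/`validation` splits as in this guide. Because every image is padded to the same square size, the collate function only needs to stack the tensors, and `pixel_mask` is stacked only when you asked the transform to keep it:

```py
>>> from functools import partial
>>> import torch

>>> train_transform_batch = partial(
...     augment_and_transform_batch, transform=train_augment_and_transform, image_processor=image_processor
... )
>>> validation_transform_batch = partial(
...     augment_and_transform_batch, transform=validation_transform, image_processor=image_processor
... )

>>> cppe5["train"] = cppe5["train"].with_transform(train_transform_batch)
>>> cppe5["validation"] = cppe5["validation"].with_transform(validation_transform_batch)

>>> def collate_fn(batch):
...     data = {"pixel_values": torch.stack([x["pixel_values"] for x in batch])}
...     data["labels"] = [x["labels"] for x in batch]
...     if "pixel_mask" in batch[0]:
...         data["pixel_mask"] = torch.stack([x["pixel_mask"] for x in batch])
...     return data
```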

@@ -1485,25 +1478,12 @@ Now that you have finetuned a model, evaluated it, and uploaded it to the Huggin
```py
>>> import torch
>>> import requests
>>> import numpy as np
>>> import albumentations as A

>>> from PIL import Image
>>> from PIL import Image, ImageDraw
>>> from transformers import AutoImageProcessor, AutoModelForObjectDetection

>>> url = "https://images.pexels.com/photos/8413299/pexels-photo-8413299.jpeg?auto=compress&cs=tinysrgb&w=630&h=375&dpr=2"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> # Define transformations for inference
>>> resize_and_pad = A.Compose([
... A.LongestMaxSize(max_size=max_size),
... A.PadIfNeeded(max_size, max_size, border_mode=0, value=(128, 128, 128), position="top_left"),
... ])

>>> # This one is for visualization with no padding
>>> resize_only = A.Compose([
... A.LongestMaxSize(max_size=max_size),
... ])
```

Load model and image processor from the Hugging Face Hub (skip to use already trained in this session):
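
A minimal sketch of that loading step, where the repository name is a placeholder for your own finetuned checkpoint:

```py
>>> device = "cuda" if torch.cuda.is_available() else "cpu"

>>> model_repo = "your-username/detr-finetuned-cppe5"  # placeholder: use your own Hub repo or local path
>>> image_processor = AutoImageProcessor.from_pretrained(model_repo)
>>> model = AutoModelForObjectDetection.from_pretrained(model_repo)
>>> model = model.to(device)
```
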
@@ -1519,12 +1499,11 @@ Load model and image processor from the Hugging Face Hub (skip to use already tr
And detect bounding boxes:

```py
>>> np_preprocessed_image = resize_and_pad(image=np.array(image))["image"]

>>> with torch.no_grad():
... inputs = image_processor(images=[np_preprocessed_image], return_tensors="pt")
... outputs = model(inputs["pixel_values"].to(device))
... target_sizes = torch.tensor([np_preprocessed_image.shape[:2]])
... inputs = image_processor(images=[image], return_tensors="pt")
... outputs = model(**inputs.to(device))
... target_sizes = torch.tensor([[image.size[1], image.size[0]]])
... results = image_processor.post_process_object_detection(outputs, threshold=0.3, target_sizes=target_sizes)[0]

>>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
@@ -1543,17 +1522,15 @@ Detected Coverall with confidence 0.391 at location [68.61, 126.66, 309.03, 318.
Let's plot the result:

```py
>>> resized_image = resize_only(image=np.array(image))["image"]
>>> resized_image = Image.fromarray(resized_image)
>>> draw = ImageDraw.Draw(resized_image)
>>> draw = ImageDraw.Draw(image)

>>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
... box = [round(i, 2) for i in box.tolist()]
... x, y, x2, y2 = tuple(box)
... draw.rectangle((x, y, x2, y2), outline="red", width=1)
... draw.text((x, y), model.config.id2label[label.item()], fill="white")

>>> resized_image
>>> image
```

<div class="flex justify-center">
3 changes: 2 additions & 1 deletion examples/pytorch/object-detection/README.md
@@ -50,7 +50,7 @@ python run_object_detection.py \
--per_device_train_batch_size 8 \
--gradient_accumulation_steps 1 \
--remove_unused_columns false \
--eval_do_concat_batches false \
--eval_do_concat_batches false \
--ignore_mismatched_sizes true \
--metric_for_best_model eval_map \
--greater_is_better true \
@@ -200,6 +200,7 @@ Where `metadata.jsonl` is a file with the following structure:
{"file_name": "0002.jpg", "objects": {"bbox": [[810.0, 100.0, 57.0, 28.0]], "categories": [1], "id": [2], "area": [40.0]}}
...
```
The training script supports bounding boxes in COCO format (x_min, y_min, width, height).

Then, you can load the dataset with just a few lines of code:

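For example, a minimal sketch with the 🤗 Datasets `imagefolder` builder, which picks up `metadata.jsonl` automatically (the `data_dir` path below is a placeholder):

```python
from datasets import load_dataset

# Placeholder path: point it at the folder containing the images and metadata.jsonl
dataset = load_dataset("imagefolder", data_dir="path/to/your/dataset")
```
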
25 changes: 12 additions & 13 deletions examples/pytorch/object-detection/run_object_detection.py
@@ -117,7 +117,10 @@ def convert_bbox_yolo_to_pascal(boxes: torch.Tensor, image_size: Tuple[int, int]


def augment_and_transform_batch(
examples: Mapping[str, Any], transform: A.Compose, image_processor: AutoImageProcessor
examples: Mapping[str, Any],
transform: A.Compose,
image_processor: AutoImageProcessor,
return_pixel_mask: bool = False,
) -> BatchFeature:
"""Apply augmentations and format annotations in COCO format for object detection task"""

@@ -139,6 +142,9 @@ def augment_and_transform_batch(
# Apply the image processor transformations: resizing, rescaling, normalization
result = image_processor(images=images, annotations=annotations, return_tensors="pt")

if not return_pixel_mask:
result.pop("pixel_mask", None)

return result


@@ -415,23 +421,17 @@ def main():
)
image_processor = AutoImageProcessor.from_pretrained(
model_args.image_processor_name or model_args.model_name_or_path,
# At this moment we recommend using external transform to pad and resize images.
# It`s faster and yields much better results for object-detection models.
do_pad=False,
do_resize=False,
# We will save image size parameter in config just for reference
size={"longest_edge": data_args.image_square_size},
do_resize=True,
size={"max_height": data_args.image_square_size, "max_width": data_args.image_square_size},
do_pad=True,
pad_size={"height": data_args.image_square_size, "width": data_args.image_square_size},
**common_pretrained_args,
)

# ------------------------------------------------------------------------------------------------
# Define image augmentations and dataset transforms
# ------------------------------------------------------------------------------------------------
max_size = data_args.image_square_size
basic_transforms = [
A.LongestMaxSize(max_size=max_size),
A.PadIfNeeded(max_size, max_size, border_mode=0, value=(128, 128, 128), position="top_left"),
]
train_augment_and_transform = A.Compose(
[
A.Compose(
@@ -453,12 +453,11 @@
A.HorizontalFlip(p=0.5),
A.RandomBrightnessContrast(p=0.5),
A.HueSaturationValue(p=0.1),
*basic_transforms,
],
bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True, min_area=25),
)
validation_transform = A.Compose(
basic_transforms,
[A.NoOp()],
bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True),
)

@@ -120,7 +120,10 @@ def convert_bbox_yolo_to_pascal(boxes: torch.Tensor, image_size: Tuple[int, int]

# Copied from examples/pytorch/object-detection/run_object_detection.augment_and_transform_batch
def augment_and_transform_batch(
examples: Mapping[str, Any], transform: A.Compose, image_processor: AutoImageProcessor
examples: Mapping[str, Any],
transform: A.Compose,
image_processor: AutoImageProcessor,
return_pixel_mask: bool = False,
) -> BatchFeature:
"""Apply augmentations and format annotations in COCO format for object detection task"""

@@ -142,6 +145,9 @@ def augment_and_transform_batch(
# Apply the image processor transformations: resizing, rescaling, normalization
result = image_processor(images=images, annotations=annotations, return_tensors="pt")

if not return_pixel_mask:
result.pop("pixel_mask", None)

return result


@@ -473,23 +479,17 @@ def main():
)
image_processor = AutoImageProcessor.from_pretrained(
args.model_name_or_path,
# At this moment we recommend using external transform to pad and resize images.
# It`s faster and yields much better results for object-detection models.
do_pad=False,
do_resize=False,
# We will save image size parameter in config just for reference
size={"longest_edge": args.image_square_size},
do_resize=True,
size={"max_height": args.image_square_size, "max_width": args.image_square_size},
do_pad=True,
pad_size={"height": args.image_square_size, "width": args.image_square_size},
**common_pretrained_args,
)

# ------------------------------------------------------------------------------------------------
# Define image augmentations and dataset transforms
# ------------------------------------------------------------------------------------------------
max_size = args.image_square_size
basic_transforms = [
A.LongestMaxSize(max_size=max_size),
A.PadIfNeeded(max_size, max_size, border_mode=0, value=(128, 128, 128), position="top_left"),
]
train_augment_and_transform = A.Compose(
[
A.Compose(
@@ -511,12 +511,11 @@
A.HorizontalFlip(p=0.5),
A.RandomBrightnessContrast(p=0.5),
A.HueSaturationValue(p=0.1),
*basic_transforms,
],
bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True, min_area=25),
)
validation_transform = A.Compose(
basic_transforms,
[A.NoOp()],
bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True),
)
