Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/source/en/model_doc/glpn.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,11 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
[[autodoc]] GLPNImageProcessor
- preprocess

## GLPNImageProcessorFast

[[autodoc]] GLPNImageProcessorFast
- preprocess

## GLPNModel

[[autodoc]] GLPNModel
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/auto/image_processing_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@
("gemma3n", ("SiglipImageProcessor", "SiglipImageProcessorFast")),
("git", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
("glm4v", ("Glm4vImageProcessor", "Glm4vImageProcessorFast")),
("glpn", ("GLPNImageProcessor", None)),
("glpn", ("GLPNImageProcessor", "GLPNImageProcessorFast")),
("got_ocr2", ("GotOcr2ImageProcessor", "GotOcr2ImageProcessorFast")),
("grounding-dino", ("GroundingDinoImageProcessor", "GroundingDinoImageProcessorFast")),
("groupvit", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
Expand Down
1 change: 1 addition & 0 deletions src/transformers/models/glpn/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from .configuration_glpn import *
from .feature_extraction_glpn import *
from .image_processing_glpn import *
from .image_processing_glpn_fast import *
from .modeling_glpn import *
else:
import sys
Expand Down
230 changes: 230 additions & 0 deletions src/transformers/models/glpn/image_processing_glpn_fast.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,230 @@
# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fast Image processor class for GLPN."""

from typing import Optional, Union

import torch
from torchvision.transforms.v2 import functional as F

from ...image_processing_utils import BatchFeature
from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images
from ...image_utils import (
IMAGENET_STANDARD_MEAN,
IMAGENET_STANDARD_STD,
PILImageResampling,
)

# optional typing container (similar to ZoeDepthImageProcessorKwargs)
Comment thread
Aravind-11 marked this conversation as resolved.
Outdated
from ...processing_utils import ImagesKwargs
from ...utils import (
TensorType,
auto_docstring,
requires_backends,
)


class GLPNImageProcessorKwargs(ImagesKwargs, total=False):
    """
    Optional keyword arguments accepted by the GLPN fast image processor.

    size_divisor (`int`):
        Height and width are floored to a multiple of this value during preprocessing.
    ensure_multiple_of (`int`):
        Backward-compatible alias; the processor's `__init__` maps it onto `size_divisor`.
    resample (`PILImageResampling`):
        Resampling filter kept alongside the other processor settings.
    """

    # Canonical key for the divisor.
    size_divisor: int
    # Legacy spelling of `size_divisor`, accepted at construction time.
    ensure_multiple_of: int
    # Overridable resampling filter.
    resample: PILImageResampling
Comment thread
Aravind-11 marked this conversation as resolved.
Outdated


@auto_docstring
class GLPNImageProcessorFast(BaseImageProcessorFast):
"""
Fast image processor for GLPN using the Torch/TorchVision backend.

Performs:
- Crop H,W down to the nearest multiple of `size_divisor` (default 32)
- Rescale [0,255] → [0,1]
- (No normalization by default)
"""

# Persist ONLY the same keys as the slow processor
Comment thread
Aravind-11 marked this conversation as resolved.
Outdated
# Class-level defaults for the preprocessing switches.
# Normalization is off by default; rescaling and the size-divisor step are on.
do_resize = True
do_rescale = True
do_normalize = False
resample = PILImageResampling.BILINEAR
# Height and width are floored to a multiple of this value.
size_divisor = 32
# NOTE(review): the original intent was not to persist an explicit `size` for GLPN,
# but a class-level `size` default is still assigned further down in this class — confirm which is wanted.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it's fine to persist here

# Normalization statistics; only applied when `do_normalize` is enabled (off by default in this class).
image_mean = IMAGENET_STANDARD_MEAN
image_std = IMAGENET_STANDARD_STD
# Placeholder default: `_preprocess` in this class never resizes to this size, it only crops to a multiple of `size_divisor`.
size = {"height": 480, "width": 640}  # only for validation; we still crop, not resize

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ah but size is actually defined here - no need to re-define it after!

# torchvision interpolation default; `_preprocess` in this class accepts `interpolation` but does not read it.
interpolation = F.InterpolationMode.BILINEAR
# Keyword-argument schema for this processor (presumably consumed by the base class — TODO confirm).
valid_kwargs = GLPNImageProcessorKwargs

# Keys meant to be persisted for this processor.
# NOTE(review): whether the base class actually reads `config_keys` is not visible here — confirm,
# or drop the attribute entirely if it is unused.
# A set literal cannot raise, so no try/except guard is needed around this assignment.
config_keys = {"do_resize", "size_divisor", "resample", "do_rescale"}

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure why we want to persist these keys? Might be a misunderstanding on my end

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed them.


def __init__(self, **kwargs: GLPNImageProcessorKwargs) -> None:
    """
    Build the processor, translating the legacy `ensure_multiple_of` alias.

    Args:
        **kwargs:
            Processor settings. `ensure_multiple_of` is accepted as an alias of
            `size_divisor`; when both are given, `size_divisor` wins. `resample`
            defaults to `PILImageResampling.BILINEAR` when not supplied.
    """
    # Always strip the alias so it is never forwarded to the base class, even when
    # `size_divisor` was passed explicitly alongside it.
    if "ensure_multiple_of" in kwargs:
        alias_value = kwargs.pop("ensure_multiple_of")
        kwargs.setdefault("size_divisor", alias_value)

    # Make sure a resampling filter is always present.
    kwargs.setdefault("resample", PILImageResampling.BILINEAR)
    super().__init__(**kwargs)

@staticmethod
def _crop_to_multiple(
    images: torch.Tensor,
    size_divisor: int = 32,
) -> torch.Tensor:
    """
    Crop the two trailing (height, width) dimensions down to a multiple of `size_divisor`.

    The crop keeps the top-left region and performs no resampling. Any number of
    leading dimensions is accepted, e.g. `(B, C, H, W)` or `(C, H, W)`.

    Args:
        images (`torch.Tensor`):
            Tensor whose last two dimensions are height and width.
        size_divisor (`int`, *optional*, defaults to 32):
            Positive integer that the output height and width must be divisible by.

    Returns:
        `torch.Tensor`: The input itself when it is already aligned, otherwise a
        sliced view. A dimension smaller than `size_divisor` is floored to 0.

    Raises:
        ValueError: If `size_divisor` is not a positive integer.
    """
    if size_divisor <= 0:
        raise ValueError(f"`size_divisor` must be a positive integer, got {size_divisor}.")

    height, width = images.shape[-2:]
    cropped_height = height - height % size_divisor
    cropped_width = width - width % size_divisor

    # Already aligned: hand back the same object instead of creating a view.
    if cropped_height == height and cropped_width == width:
        return images
    return images[..., :cropped_height, :cropped_width]

def _preprocess(
    self,
    images: list["torch.Tensor"],
    do_resize: bool,
    size: Optional[dict] = None,
    size_divisor: Optional[int] = None,
    interpolation: Optional["F.InterpolationMode"] = None,
    do_rescale: bool = True,
    rescale_factor: Optional[float] = 1 / 255,
    do_normalize: bool = False,
    image_mean: Optional[Union[float, list[float]]] = None,
    image_std: Optional[Union[float, list[float]]] = None,
    disable_grouping: Optional[bool] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    resample: Optional[PILImageResampling] = None,
    **kwargs,
) -> BatchFeature:
    """
    Run the GLPN fast preprocessing pipeline on a list of image tensors.

    Steps, applied per group of same-shaped images:
      1. crop height/width down to a multiple of `size_divisor` (when `do_resize`),
      2. rescale and/or normalize in a single fused call.

    Args:
        images: Image tensors to process.
        do_resize: Whether to crop to a multiple of the divisor.
        size, interpolation, resample: Accepted for signature compatibility; not used here
            because this method crops and never resamples.
        size_divisor: Divisor override; falls back to `self.size_divisor` when `None`.
        do_rescale, rescale_factor: Rescaling switch and factor.
        do_normalize, image_mean, image_std: Normalization switch and statistics.
        disable_grouping: Forwarded to `group_images_by_shape`.
        return_tensors: When set and all outputs share one shape, they are stacked into a
            single batch tensor.

    Returns:
        `BatchFeature` with a `pixel_values` entry: a stacked tensor for a uniform batch,
        otherwise a list of per-image torch tensors.
    """
    divisor = self.size_divisor if size_divisor is None else size_divisor

    grouped_images, grouped_index = group_images_by_shape(images, disable_grouping=disable_grouping)
    processed_groups = {}
    for shape, stacked_images in grouped_images.items():
        if do_resize:
            stacked_images = self._crop_to_multiple(stacked_images, divisor)
        # One fused call instead of separate rescale and normalize passes.
        stacked_images = self.rescale_and_normalize(
            stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
        )
        processed_groups[shape] = stacked_images

    processed_images = reorder_images(processed_groups, grouped_index)

    if return_tensors and len({tuple(image.shape) for image in processed_images}) == 1:
        # Uniform batch: safe to stack into a single tensor.
        return BatchFeature(
            data={"pixel_values": torch.stack(processed_images, dim=0)},
            tensor_type=return_tensors,
        )

    # Either no tensor conversion was requested or the batch is ragged and cannot be stacked.
    # Keep the per-image torch tensors as they are (no CPU/NumPy round trip) and skip conversion.
    return BatchFeature(data={"pixel_values": processed_images}, tensor_type=None)

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This part isn't "fast": it converts to numpy when shapes differ, which is why the test test_slow_fast_equivalence_batched fails; when shapes differ, tensor_type is set to None.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hey, I'm pretty confident test_slow_fast_equivalence_batched will fail with this setup currently - also looking at the slow test, what would cause the shapes to become heterogeneous, not resizing? In that case let's pad the batch and return it as a tensor IMO

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it.


# 🔹 ensure only slow keys are serialized
def to_dict(self):
    """
    Serialize the processor, keeping values only for an allow-list of keys.

    Every key produced by the parent `to_dict()` is retained in the output, but only
    the allow-listed keys keep their value; all other keys are set to `None`.
    """
    preserved_keys = frozenset(
        {
            # identity metadata
            "image_processor_type",
            "processor_class",
            # settings that keep their value
            "do_resize",
            "size_divisor",
            "resample",
            "do_rescale",
            "default_to_square",
            "data_format",
        }
    )

    serialized = super().to_dict()
    # Anything outside the allow-list (e.g. `size`, `interpolation`, `image_mean`,
    # `image_std`, `do_normalize`, `ensure_multiple_of`) is nulled out.
    return {key: (value if key in preserved_keys else None) for key, value in serialized.items()}
Comment thread
Aravind-11 marked this conversation as resolved.
Outdated

@torch.no_grad()

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
@torch.no_grad()

def post_process_depth_estimation(self, outputs, target_sizes=None):
    """
    Convert raw model outputs into per-image depth maps, optionally resized.

    Args:
        outputs:
            Object exposing `predicted_depth`, a tensor of shape `(B, H, W)` or
            `(B, C, H, W)`; for 4-D input only the first channel is kept.
        target_sizes (*optional*):
            One `(height, width)` entry per batch element (a list of pairs or a
            tensor of shape `(B, 2)`); individual entries may be `None` to skip
            resizing. `None` or an empty sequence disables resizing altogether.

    Returns:
        `list[dict]`: One `{"predicted_depth": tensor}` per batch element, each of
        shape `(H, W)` or the requested target size.

    Raises:
        ValueError: If `predicted_depth` is neither 3-D nor 4-D, or if the number
            of target sizes does not match the batch size.
    """
    requires_backends(self, "torch")
    predicted_depth = outputs.predicted_depth

    # Bring the prediction to (B, H, W).
    if predicted_depth.ndim == 4:
        # Selecting channel 0 is the same as squeezing when there is a single channel.
        predicted_depth = predicted_depth[:, 0, ...]
    elif predicted_depth.ndim != 3:
        raise ValueError("Unexpected depth prediction shape")

    batch_size = predicted_depth.shape[0]
    # Use explicit None/length checks: truth-testing a multi-element tensor raises.
    if target_sizes is None or len(target_sizes) == 0:
        target_sizes = [None] * batch_size
    elif len(target_sizes) != batch_size:
        # Without this check, zip() below would silently drop predictions.
        raise ValueError(
            "Make sure that you pass in as many target sizes as the batch dimension of the predicted depth"
        )

    results = []
    for depth, target_size in zip(predicted_depth, target_sizes):
        if target_size is not None:
            # Plain ints so that tensor rows and tuples are both accepted by `interpolate`.
            height, width = (int(side) for side in target_size)
            resized = torch.nn.functional.interpolate(
                depth[None, None, ...], size=(height, width), mode="bicubic", align_corners=False
            )
            depth = resized[0, 0]
        results.append({"predicted_depth": depth})
    return results
Comment thread
Aravind-11 marked this conversation as resolved.
Outdated


__all__ = ["GLPNImageProcessorFast"]
Loading