-
Notifications
You must be signed in to change notification settings - Fork 33.8k
Add GLPNImageProcessorFast #41725
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add GLPNImageProcessorFast #41725
Changes from 1 commit
a05687c
7376c7a
3b2647d
1d77a90
0afbb58
c70bdf0
74c8a83
8e9b398
51f12c9
c376606
ddc2f56
21723d1
60df2c6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
| @@ -0,0 +1,230 @@ | ||||
| # coding=utf-8 | ||||
| # Copyright 2025 The HuggingFace Inc. team. All rights reserved. | ||||
| # | ||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| # you may not use this file except in compliance with the License. | ||||
| # You may obtain a copy of the License at | ||||
| # | ||||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||||
| # | ||||
| # Unless required by applicable law or agreed to in writing, software | ||||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
| # See the License for the specific language governing permissions and | ||||
| # limitations under the License. | ||||
| """Fast Image processor class for GLPN.""" | ||||
|
|
||||
| from typing import Optional, Union | ||||
|
|
||||
| import torch | ||||
| from torchvision.transforms.v2 import functional as F | ||||
|
|
||||
| from ...image_processing_utils import BatchFeature | ||||
| from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images | ||||
| from ...image_utils import ( | ||||
| IMAGENET_STANDARD_MEAN, | ||||
| IMAGENET_STANDARD_STD, | ||||
| PILImageResampling, | ||||
| ) | ||||
|
|
||||
| # optional typing container (similar to ZoeDepthImageProcessorKwargs) | ||||
| from ...processing_utils import ImagesKwargs | ||||
| from ...utils import ( | ||||
| TensorType, | ||||
| auto_docstring, | ||||
| requires_backends, | ||||
| ) | ||||
|
|
||||
|
|
||||
class GLPNImageProcessorKwargs(ImagesKwargs, total=False):
    # Typed container for the GLPN-specific keyword arguments. Every key is
    # optional because the TypedDict is declared with `total=False`.

    # Public key that is persisted; its name has to match the slow processor.
    size_divisor: int
    # Backward-compatible alias for `size_divisor`; it is not persisted.
    ensure_multiple_of: int
    # Lets callers override the resampling filter; persisted like in the slow
    # processor.
    resample: PILImageResampling
|
Aravind-11 marked this conversation as resolved.
Outdated
|
||||
|
|
||||
|
|
||||
@auto_docstring
class GLPNImageProcessorFast(BaseImageProcessorFast):
    """
    Fast image processor for GLPN using the Torch/TorchVision backend.

    Performs:
    - Crop H,W down to the nearest multiple of `size_divisor` (default 32)
    - Rescale [0,255] → [0,1]
    - (No normalization by default)
    """

    # Class-level defaults for the preprocessing pipeline.
    do_resize = True
    do_rescale = True
    do_normalize = False
    resample = PILImageResampling.BILINEAR
    size_divisor = 32
    image_mean = IMAGENET_STANDARD_MEAN
    image_std = IMAGENET_STANDARD_STD
    # `size` is present for argument validation only: preprocessing crops to a
    # multiple of `size_divisor`, it does not resize to this shape.
    size = {"height": 480, "width": 640}
    interpolation = F.InterpolationMode.BILINEAR
    valid_kwargs = GLPNImageProcessorKwargs

    # Keys meant to be persisted explicitly.
    # NOTE(review): whether the base class consumes `config_keys` is not
    # visible from this file -- confirm, or drop the attribute.
    # A plain assignment of a set literal cannot raise, so it needs no
    # try/except guard.
    config_keys = {"do_resize", "size_divisor", "resample", "do_rescale"}
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm not sure why we want to persist these keys. Might be a misunderstanding on my end.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Removed them. |
||||
|
|
||||
def __init__(self, **kwargs: GLPNImageProcessorKwargs) -> None:
    """
    Create the processor, translating the legacy `ensure_multiple_of` alias.

    Args:
        **kwargs:
            Processor options. `ensure_multiple_of` is accepted as an alias
            for `size_divisor`; when both are given, `size_divisor` wins.
            `resample` defaults to `PILImageResampling.BILINEAR` when it is
            not supplied.
    """
    # `**kwargs` is already a fresh dict owned by this call, so it can be
    # edited in place. The alias is always removed so that it is never
    # forwarded to the base class, even when `size_divisor` is also passed.
    legacy_divisor = kwargs.pop("ensure_multiple_of", None)
    if legacy_divisor is not None:
        kwargs.setdefault("size_divisor", legacy_divisor)
    # Make sure a resample value is always present for validation.
    kwargs.setdefault("resample", PILImageResampling.BILINEAR)
    super().__init__(**kwargs)
|
|
||||
| @staticmethod | ||||
| def _crop_to_multiple( | ||||
| images: torch.Tensor, | ||||
| size_divisor: int = 32, | ||||
| ) -> torch.Tensor: | ||||
| """ | ||||
| Crop images (B,C,H,W) by flooring H and W to nearest multiple of `size_divisor`. | ||||
| No resampling; purely geometric crop to match slow GLPN behavior. | ||||
| """ | ||||
| _, _, h, w = images.shape | ||||
| new_h = (h // size_divisor) * size_divisor | ||||
| new_w = (w // size_divisor) * size_divisor | ||||
| if (new_h, new_w) == (h, w): | ||||
| return images | ||||
| # Use top-left crop to mirror typical behavior; slow doesn't center-crop. | ||||
| return images[..., :new_h, :new_w] | ||||
|
|
||||
def _preprocess(
    self,
    images: list["torch.Tensor"],
    do_resize: bool,
    size: Optional[dict] = None,
    size_divisor: Optional[int] = None,
    interpolation: Optional["F.InterpolationMode"] = None,
    do_rescale: bool = True,
    rescale_factor: Optional[float] = 1 / 255,
    do_normalize: bool = False,
    image_mean: Optional[Union[float, list[float]]] = None,
    image_std: Optional[Union[float, list[float]]] = None,
    disable_grouping: Optional[bool] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    resample: Optional[PILImageResampling] = None,
    **kwargs,
) -> BatchFeature:
    """
    GLPN fast preprocessing.

    Images are grouped by shape and each group goes through, in order:
    - crop to the floored multiple of the size divisor (when `do_resize`)
    - rescale by `rescale_factor` (when `do_rescale`)
    - normalize with `image_mean` / `image_std` (when `do_normalize`)

    Args:
        images: Images to process.
        do_resize: Whether to crop H and W to a multiple of the size divisor.
        size: Accepted for signature compatibility; not used by this method.
        size_divisor: Divisor for the crop; falls back to `self.size_divisor`
            when `None`.
        interpolation: Accepted for signature compatibility; not used here.
        do_rescale: Whether to multiply pixel values by `rescale_factor`.
        rescale_factor: Factor passed to `self.rescale`.
        do_normalize: Whether to call `self.normalize`.
        image_mean: Mean passed to `self.normalize`.
        image_std: Standard deviation passed to `self.normalize`.
        disable_grouping: Forwarded to `group_images_by_shape`.
        return_tensors: When truthy and all processed images share one shape,
            they are stacked and this value is passed to `BatchFeature` as
            `tensor_type`.
        resample: Accepted for signature compatibility; not used here.

    Returns:
        `BatchFeature` with a single `pixel_values` entry.
    """
    divisor = self.size_divisor if size_divisor is None else size_divisor

    grouped_images, grouped_index = group_images_by_shape(images, disable_grouping=disable_grouping)
    processed_groups = {}
    for shape, stacked_images in grouped_images.items():
        if do_resize:
            stacked_images = self._crop_to_multiple(stacked_images, divisor)
        # NOTE(review): a reviewer suggested fusing rescale and normalize into
        # one op; they are kept separate here to preserve current behavior.
        if do_rescale:
            stacked_images = self.rescale(stacked_images, rescale_factor)
        if do_normalize:
            stacked_images = self.normalize(stacked_images, image_mean, image_std)
        processed_groups[shape] = stacked_images

    processed_images = reorder_images(processed_groups, grouped_index)

    if not return_tensors:
        return BatchFeature(data={"pixel_values": processed_images}, tensor_type=None)

    if len({tuple(img.shape) for img in processed_images}) == 1:
        # All images share one shape, so they can be stacked into a batch.
        stacked = torch.stack(processed_images, dim=0)
        return BatchFeature(data={"pixel_values": stacked}, tensor_type=return_tensors)

    # Shapes differ: return a plain list and skip tensor conversion.
    # NOTE(review): the numpy round-trip moves data off the device and was
    # flagged in review as not "fast"; padding, or returning the torch
    # tensors directly, should be considered.
    as_arrays = [img.cpu().numpy() for img in processed_images]
    return BatchFeature(data={"pixel_values": as_arrays}, tensor_type=None)
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This part isn't "fast": it converts to numpy when shapes differ, which is why the test
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. hey, I'm pretty confident
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Got it. |
||||
|
|
||||
def to_dict(self):
    """
    Serialize the processor, blanking every key outside the preserved set.

    Returns:
        `dict`: The same keys as `super().to_dict()`. Identity metadata
        (`image_processor_type`, `processor_class`) and the slow-compatible
        settings keep their values; every other key is mapped to `None`.
        This covers `size`, `ensure_multiple_of`, `interpolation`,
        `image_mean`, `image_std`, `do_normalize` and any key not listed
        below.
    """
    preserved_keys = {
        # Identity metadata so AutoImageProcessor can load the fast class directly.
        "image_processor_type",
        "processor_class",
        # Settings that persist with their value (slow-compatible).
        "do_resize",
        "size_divisor",
        "resample",
        "do_rescale",
        "default_to_square",
        "data_format",
    }
    serialized = super().to_dict()
    return {key: (value if key in preserved_keys else None) for key, value in serialized.items()}
|
Aravind-11 marked this conversation as resolved.
Outdated
|
||||
|
|
||||
@torch.no_grad()
def post_process_depth_estimation(self, outputs, target_sizes=None):
    """
    Convert raw model outputs to per-image depth predictions.

    Resizing uses `torch.nn.functional.interpolate` with `mode="bicubic"` and
    `align_corners=False`.

    Args:
        outputs:
            Object exposing a `predicted_depth` tensor of shape `(B, H, W)`
            or `(B, C, H, W)`. For 4D input only channel 0 is used.
        target_sizes (*optional*):
            Iterable (list, tuple or tensor) with one entry per batch item.
            Each entry is either `None` (keep the predicted resolution), an
            `int`, or a `(height, width)` pair. `None` or an empty iterable
            disables resizing for the whole batch.

    Returns:
        `list[dict]`: One `{"predicted_depth": tensor}` dict per batch item.

    Raises:
        ValueError: If `predicted_depth` is neither 3D nor 4D, or if the
            number of target sizes differs from the batch size.
    """
    requires_backends(self, "torch")
    predicted_depth = outputs.predicted_depth

    # Normalize shape to (B, H, W).
    if predicted_depth.ndim == 4:
        predicted_depth = predicted_depth[:, 0, ...]
    elif predicted_depth.ndim != 3:
        raise ValueError("Unexpected depth prediction shape")

    batch_size = predicted_depth.shape[0]
    # Materialize as a list so that tensors can be length-checked without
    # evaluating their (ambiguous) truth value.
    sizes = [] if target_sizes is None else list(target_sizes)
    if not sizes:
        sizes = [None] * batch_size
    elif len(sizes) != batch_size:
        # `zip` would otherwise silently drop the unmatched items.
        raise ValueError(
            "Make sure that you pass in as many target sizes as the batch dimension of the predicted depth"
        )

    results = []
    for depth, target_size in zip(predicted_depth, sizes):
        if target_size is not None:
            # `interpolate` expects plain ints, so tensor rows are converted.
            if isinstance(target_size, int):
                size = target_size
            else:
                size = tuple(int(dim) for dim in target_size)
            # Add batch and channel dims for interpolation, then drop them.
            resized = torch.nn.functional.interpolate(
                depth[None, None, ...], size=size, mode="bicubic", align_corners=False
            )
            depth = resized[0, 0]
        results.append({"predicted_depth": depth})
    return results
|
Aravind-11 marked this conversation as resolved.
Outdated
|
||||
|
|
||||
|
|
||||
# Names exported by `from <module> import *`.
__all__ = ["GLPNImageProcessorFast"]
Uh oh!
There was an error while loading. Please reload this page.