huggingface · ArthurZucker · Jan 21, 2026 · Oct 15, 2025 · Oct 15, 2025 · Oct 15, 2025
diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py
@@ -38,9 +38,6 @@
 class BaseImageProcessor(ImageProcessingMixin):
     valid_kwargs = ImagesKwargs
 
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-
     @property
     def is_fast(self) -> bool:
         """

diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py
@@ -863,31 +863,43 @@ def _group_images_by_shape(nested_images, *paired_inputs, is_nested: bool = Fals
                 paired_grouped_values[paired_index][shape].append(paired_value)
             grouped_images_index[key] = (shape, len(grouped_images[shape]) - 1)
 
+    # Store structure size for nested inputs to handle empty sublists during reconstruction
+    if is_nested:
+        grouped_images_index["_num_sublists"] = len(normalized_images)
+
     return grouped_images, *paired_grouped_values, grouped_images_index
 
 
 def _reconstruct_nested_structure(indices, processed_images):
-    """Helper function to reconstruct a single level nested structure."""
-    # Find the maximum outer index
-    max_outer_idx = max(idx[0] for idx in indices)
-
-    # Create the outer list
-    result = [None] * (max_outer_idx + 1)
+    """Reconstruct a single level nested structure. `indices` is left unmodified so callers can reuse it."""
+    # Get the number of sublists (handles empty sublists like in [[], [image]])
+    num_sublists = indices.get("_num_sublists")
 
     # Group indices by outer index
     nested_indices = defaultdict(list)
-    for i, j in indices:
+    for i, j in (key for key in indices if key != "_num_sublists"):
         nested_indices[i].append(j)
 
+    # Determine the number of outer sublists
+    if num_sublists is not None:
+        max_outer_idx = num_sublists - 1
+    elif nested_indices:
+        max_outer_idx = max(nested_indices.keys())
+    else:
+        return []
+
+    # Create the result structure
+    result = []
     for i in range(max_outer_idx + 1):
-        if i in nested_indices:
+        if i not in nested_indices:
+            result.append([])
+        else:
             inner_max_idx = max(nested_indices[i])
             inner_list = [None] * (inner_max_idx + 1)
-            for j in range(inner_max_idx + 1):
-                if (i, j) in indices:
-                    shape, idx = indices[(i, j)]
-                    inner_list[j] = processed_images[shape][idx]
-            result[i] = inner_list
+            for j in nested_indices[i]:
+                shape, idx = indices[(i, j)]
+                inner_list[j] = processed_images[shape][idx]
+            result.append(inner_list)
 
     return result
 
@@ -908,6 +920,21 @@ def _iterate_items(items, is_nested: bool):
             yield i, item
 
 
+def _get_device_from_images(images, is_nested: bool) -> "torch.device":
+    """
+    Return the device of the first image found in a flat or single-level nested list of images.
+
+    For nested inputs, empty sublists are skipped, e.g. `images = [[], [image]]` yields that image's device.
+    """
+    candidates = images if is_nested else []
+    for entry in candidates:
+        if isinstance(entry, torch.Tensor):
+            return entry.device
+        if isinstance(entry, list) and entry:
+            return entry[0].device
+    return images[0].device
+
+
 def group_images_by_shape(
     images: Union[list["torch.Tensor"], "torch.Tensor"],
     *paired_inputs,
@@ -945,17 +972,21 @@ def group_images_by_shape(
     """
     # If disable grouping is not explicitly provided, we favor disabling it if the images are on CPU, and enabling it otherwise.
     if disable_grouping is None:
-        device = images[0][0].device if is_nested else images[0].device
+        device = _get_device_from_images(images, is_nested)
         disable_grouping = device == "cpu"
 
     if disable_grouping:
+        grouped_images_index = {key: (key, 0) for key, _ in _iterate_items(images, is_nested)}
+        if is_nested:
+            grouped_images_index["_num_sublists"] = len(images)
+
         return (
             {key: img.unsqueeze(0) for key, img in _iterate_items(images, is_nested)},
             *[
                 {key: item.unsqueeze(0) for key, item in _iterate_items(paired_list, is_nested)}
                 for paired_list in paired_inputs
             ],
-            {key: (key, 0) for key, _ in _iterate_items(images, is_nested)},
+            grouped_images_index,
         )
 
     # Handle single level nested structure

diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py
@@ -47,9 +47,14 @@
 
 logger = logging.get_logger(__name__)
 
-
-FORCE_FAST_IMAGE_PROCESSOR = ["Qwen2VLImageProcessor"]
-
+# These image processors use Lanczos interpolation, which is not supported by fast image processors.
+# To avoid important differences in outputs, we default to using the slow image processors for these processors.
+DEFAULT_TO_SLOW_IMAGE_PROCESSORS = [
+    "ChameleonImageProcessor",
+    "FlavaImageProcessor",
+    "Idefics3ImageProcessor",
+    "SmolVLMImageProcessor",
+]
 
 if TYPE_CHECKING:
     # This significantly improves completion suggestion performance when
@@ -535,24 +540,20 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
                 image_processor_auto_map = config.auto_map["AutoImageProcessor"]
 
         image_processor_class = None
-        # TODO: @yoni, change logic in v4.52 (when use_fast set to True by default)
         if image_processor_type is not None:
             # if use_fast is not set and the processor was saved with a fast processor, we use it, otherwise we use the slow processor.
             if use_fast is None:
                 use_fast = image_processor_type.endswith("Fast")
-                if not use_fast and image_processor_type in FORCE_FAST_IMAGE_PROCESSOR and is_torchvision_available():
-                    use_fast = True
+                if (
+                    not use_fast
+                    and is_torchvision_available()
+                    and image_processor_type not in DEFAULT_TO_SLOW_IMAGE_PROCESSORS
+                ):
                     logger.warning_once(
                         f"The image processor of type `{image_processor_type}` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. "
                         "This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. "
-                        "Note that this behavior will be extended to all models in a future release."
-                    )
-                if not use_fast:
-                    logger.warning_once(
-                        "Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. "
-                        "`use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. "
-                        "This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`."
                     )
+                    use_fast = True
             if use_fast and not image_processor_type.endswith("Fast"):
                 image_processor_type += "Fast"
             if use_fast and not is_torchvision_available():

diff --git a/src/transformers/models/clip/image_processing_clip_fast.py b/src/transformers/models/clip/image_processing_clip_fast.py
@@ -15,6 +15,7 @@
 
 from ...image_processing_utils_fast import BaseImageProcessorFast
 from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling
+from ...processing_utils import ImagesKwargs, Unpack
 from ...utils import auto_docstring
 
 
@@ -34,5 +35,13 @@ class CLIPImageProcessorFast(BaseImageProcessorFast):
     do_normalize = True
     do_convert_rgb = True
 
+    def __init__(self, **kwargs: Unpack[ImagesKwargs]):
+        # KOSMOS-2 backwards compatibility: a truthy legacy `use_square_size` is replaced by a square `size`
+        if kwargs.get("use_square_size"):
+            kwargs["size"] = {"height": self.size["shortest_edge"], "width": self.size["shortest_edge"]}
+            del kwargs["use_square_size"]
+
+        super().__init__(**kwargs)
+
 
 __all__ = ["CLIPImageProcessorFast"]
diff --git a/src/transformers/models/clipseg/processing_clipseg.py b/src/transformers/models/clipseg/processing_clipseg.py
@@ -48,14 +48,22 @@ def __call__(self, text=None, images=None, visual_prompt=None, return_tensors=No
         if text is not None and visual_prompt is not None:
             raise ValueError("You have to specify exactly one type of prompt. Either text or visual prompt.")
 
+        output_kwargs = self._merge_kwargs(
+            self.valid_processor_kwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs
+        )
+
         if text is not None:
-            encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)
+            encoding = self.tokenizer(text, return_tensors=return_tensors, **output_kwargs["text_kwargs"])
 
         if visual_prompt is not None:
-            prompt_features = self.image_processor(visual_prompt, return_tensors=return_tensors, **kwargs)
+            prompt_features = self.image_processor(
+                visual_prompt, return_tensors=return_tensors, **output_kwargs["images_kwargs"]
+            )
 
         if images is not None:
-            image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs)
+            image_features = self.image_processor(
+                images, return_tensors=return_tensors, **output_kwargs["images_kwargs"]
+            )
 
         if visual_prompt is not None and images is not None:
             encoding = {

diff --git a/src/transformers/models/efficientloftr/image_processing_efficientloftr_fast.py b/src/transformers/models/efficientloftr/image_processing_efficientloftr_fast.py
@@ -111,6 +111,7 @@ def _prepare_images_structure(
         **kwargs,
     ) -> ImageInput:
         # we need to handle image pairs validation and flattening
+        images = self.fetch_images(images)
         return flatten_pair_images(images)
 
     def _preprocess(

diff --git a/src/transformers/models/fuyu/image_processing_fuyu_fast.py b/src/transformers/models/fuyu/image_processing_fuyu_fast.py
@@ -50,6 +50,7 @@
 class FuyuImageProcessorFast(BaseImageProcessorFast):
     do_resize = True
     size = {"height": 1080, "width": 1920}
+    patch_size = {"height": 30, "width": 30}
     resample = PILImageResampling.BILINEAR
     do_pad = True
     padding_value = 1.0

diff --git a/src/transformers/models/idefics2/image_processing_idefics2_fast.py b/src/transformers/models/idefics2/image_processing_idefics2_fast.py
@@ -147,6 +147,7 @@ def _prepare_images_structure(self, images: ImageInput, expected_ndims: int = 3)
         """
         Prepare a nested images structure for processing.
         """
+        images = self.fetch_images(images)
         return make_nested_list_of_images(images, expected_ndims=expected_ndims)
 
     def split_images(

diff --git a/src/transformers/models/idefics3/image_processing_idefics3_fast.py b/src/transformers/models/idefics3/image_processing_idefics3_fast.py
@@ -152,6 +152,27 @@ def get_max_height_width(images_list: list[list["torch.Tensor"]]) -> tuple[int,
     return (max_height, max_width)
 
 
+def get_num_channels(images_list: list[list["torch.Tensor"]]) -> int:
+    """
+    Return `shape[0]` of the first image found in the batch, skipping empty sublists like in [[], [image]].
+    """
+    first_non_empty = next((sublist for sublist in images_list if sublist), None)
+    if first_non_empty is None:
+        raise ValueError("No images found in the batch.")
+
+    return first_non_empty[0].shape[0]
+
+
+def get_device_from_images(images_list: list[list["torch.Tensor"]]) -> "torch.device":
+    """
+    Get the device of the first image in a nested list, skipping empty sublists like in [[], [image]].
+    """
+    for images in images_list:
+        if images:
+            return images[0].device
+    raise ValueError("No images found in the batch.")
+
+
 def make_pixel_mask(image: "torch.Tensor", output_size: tuple[int, int]) -> "torch.Tensor":
     """
     Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
@@ -183,11 +204,14 @@ class Idefics3ImageProcessorFast(BaseImageProcessorFast):
     do_pad = True
     return_row_col_info = False
     valid_kwargs = Idefics3ImageProcessorKwargs
+    model_input_names = ["pixel_values", "pixel_attention_mask"]
 
     def _prepare_images_structure(self, images: ImageInput, expected_ndims: int = 3) -> ImageInput:
         """
         Prepare a nested images structure for processing.
         """
+        # Checks for `str` in case of URL/local path and optionally loads images
+        images = self.fetch_images(images)
         return make_nested_list_of_images(images, expected_ndims=expected_ndims)
 
     def resize(
@@ -438,18 +462,20 @@ def _preprocess(
             # Get max images per batch
             max_num_images = max(len(images_) for images_ in processed_images)
             max_height, max_width = get_max_height_width(processed_images)
+            num_channels = get_num_channels(processed_images)
+            device = get_device_from_images(processed_images)
 
             processed_images_padded = torch.zeros(
                 len(processed_images),
                 max_num_images,
-                *(processed_images[0][0].shape[0], max_height, max_width),
-                device=processed_images[0][0].device,
+                *(num_channels, max_height, max_width),
+                device=device,
             )
             pixel_attention_masks = torch.zeros(
                 len(processed_images),
                 max_num_images,
                 *(max_height, max_width),
-                device=processed_images[0][0].device,
+                device=device,
             )
             for i, images in enumerate(processed_images):
                 for j, image in enumerate(images):

diff --git a/src/transformers/models/janus/image_processing_janus_fast.py b/src/transformers/models/janus/image_processing_janus_fast.py
@@ -217,10 +217,10 @@ def postprocess(
         if do_normalize and do_rescale and return_tensors == "PIL.Image.Image":
             images = [F.to_pil_image(image) for image in images]
 
-        data = {"pixel_values": images}
         return_tensors = return_tensors if return_tensors != "PIL.Image.Image" else None
+        images = torch.stack(images, dim=0) if return_tensors == "pt" else images
 
-        return BatchFeature(data=data, tensor_type=return_tensors)
+        return BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
 
 
 __all__ = ["JanusImageProcessorFast"]
diff --git a/src/transformers/models/lightglue/image_processing_lightglue_fast.py b/src/transformers/models/lightglue/image_processing_lightglue_fast.py
@@ -132,6 +132,7 @@ def _prepare_images_structure(
         **kwargs,
     ) -> ImageInput:
         # we need to handle image pairs validation and flattening
+        images = self.fetch_images(images)
         return flatten_pair_images(images)
 
     def _preprocess(

diff --git a/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py b/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py
@@ -69,6 +69,7 @@ def __init__(self, **kwargs: Unpack[LlavaOnevisionImageProcessorKwargs]):
     def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaOnevisionImageProcessorKwargs]) -> BatchFeature:
         if isinstance(images, (tuple, list)) and isinstance(images[0], (tuple, list)):
             # if the first element is a list, we assume that all elements are lists
+            images = [x for x in images if x]  # handle text-only case
             batch_num_images = [len(x) for x in images]
         elif isinstance(images, (tuple, list)):
             # treat this as a single-image case for backward compatibility

diff --git a/src/transformers/models/llava_onevision/modular_llava_onevision.py b/src/transformers/models/llava_onevision/modular_llava_onevision.py
@@ -121,6 +121,7 @@ def pad_to_square(
     def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaOnevisionImageProcessorKwargs]) -> BatchFeature:
         if isinstance(images, (tuple, list)) and isinstance(images[0], (tuple, list)):
             # if the first element is a list, we assume that all elements are lists
+            images = [x for x in images if x]  # handle text-only case
             batch_num_images = [len(x) for x in images]
         elif isinstance(images, (tuple, list)):
             # treat this as a single-image case for backward compatibility

diff --git a/src/transformers/models/mllama/image_processing_mllama_fast.py b/src/transformers/models/mllama/image_processing_mllama_fast.py
@@ -214,6 +214,7 @@ class MllamaImageProcessorFast(BaseImageProcessorFast):
     do_pad = True
     max_image_tiles = 4
     valid_kwargs = MllamaImageProcessorKwargs
+    model_input_names = ["pixel_values", "num_tiles", "aspect_ratio_ids", "aspect_ratio_mask"]
 
     def __init__(self, **kwargs: Unpack[MllamaImageProcessorKwargs]):
         super().__init__(**kwargs)