diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py
index c1339500326f..50ba1c0424a6 100644
--- a/src/transformers/image_processing_utils.py
+++ b/src/transformers/image_processing_utils.py
@@ -38,9 +38,6 @@
 class BaseImageProcessor(ImageProcessingMixin):
     valid_kwargs = ImagesKwargs
 
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-
     @property
     def is_fast(self) -> bool:
         """
diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py
index fca1ad1b8ce6..b8df25511f81 100644
--- a/src/transformers/image_transforms.py
+++ b/src/transformers/image_transforms.py
@@ -863,31 +863,43 @@ def _group_images_by_shape(nested_images, *paired_inputs, is_nested: bool = Fals
                 paired_grouped_values[paired_index][shape].append(paired_value)
             grouped_images_index[key] = (shape, len(grouped_images[shape]) - 1)
 
+    # Store structure size for nested inputs to handle empty sublists during reconstruction
+    if is_nested:
+        grouped_images_index["_num_sublists"] = len(normalized_images)
+
     return grouped_images, *paired_grouped_values, grouped_images_index
 
 
 def _reconstruct_nested_structure(indices, processed_images):
     """Helper function to reconstruct a single level nested structure."""
-    # Find the maximum outer index
-    max_outer_idx = max(idx[0] for idx in indices)
-
-    # Create the outer list
-    result = [None] * (max_outer_idx + 1)
+    # Get the number of sublists (handles empty sublists like in [[], [image]])
+    num_sublists = indices.pop("_num_sublists", None)
 
     # Group indices by outer index
     nested_indices = defaultdict(list)
     for i, j in indices:
         nested_indices[i].append(j)
 
+    # Determine the number of outer sublists
+    if num_sublists is not None:
+        max_outer_idx = num_sublists - 1
+    elif nested_indices:
+        max_outer_idx = max(nested_indices.keys())
+    else:
+        return []
+
+    # Create the result structure
+    result = []
     for i in range(max_outer_idx + 1):
-        if i in nested_indices:
+        if i not in nested_indices:
+            result.append([])
+        else:
             inner_max_idx = max(nested_indices[i])
             inner_list = [None] * (inner_max_idx + 1)
-            for j in range(inner_max_idx + 1):
-                if (i, j) in indices:
-                    shape, idx = indices[(i, j)]
-                    inner_list[j] = processed_images[shape][idx]
-            result[i] = inner_list
+            for j in nested_indices[i]:
+                shape, idx = indices[(i, j)]
+                inner_list[j] = processed_images[shape][idx]
+            result.append(inner_list)
 
     return result
 
@@ -908,6 +920,21 @@ def _iterate_items(items, is_nested: bool):
             yield i, item
 
 
+def _get_device_from_images(images, is_nested: bool) -> "torch.device":
+    """
+    Get the device from the first non-empty element in a (potentially nested) list of images.
+
+    Handles cases like `images = [[], [image]]` where the first sublist may be empty.
+    """
+    if is_nested:
+        for row in images:
+            if isinstance(row, torch.Tensor):
+                return row.device
+            if isinstance(row, list) and len(row) > 0:
+                return row[0].device
+    return images[0].device
+
+
 def group_images_by_shape(
     images: Union[list["torch.Tensor"], "torch.Tensor"],
     *paired_inputs,
@@ -945,17 +972,21 @@ def group_images_by_shape(
     """
     # If disable grouping is not explicitly provided, we favor disabling it if the images are on CPU, and enabling it otherwise.
     if disable_grouping is None:
-        device = images[0][0].device if is_nested else images[0].device
+        device = _get_device_from_images(images, is_nested)
         disable_grouping = device == "cpu"
 
     if disable_grouping:
+        grouped_images_index = {key: (key, 0) for key, _ in _iterate_items(images, is_nested)}
+        if is_nested:
+            grouped_images_index["_num_sublists"] = len(images)
+
         return (
             {key: img.unsqueeze(0) for key, img in _iterate_items(images, is_nested)},
             *[
                 {key: item.unsqueeze(0) for key, item in _iterate_items(paired_list, is_nested)}
                 for paired_list in paired_inputs
             ],
-            {key: (key, 0) for key, _ in _iterate_items(images, is_nested)},
+            grouped_images_index,
         )
 
     # Handle single level nested structure
diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py
index 733ab2a40b56..e5fe9f4400eb 100644
--- a/src/transformers/models/auto/image_processing_auto.py
+++ b/src/transformers/models/auto/image_processing_auto.py
@@ -47,9 +47,14 @@
 
 logger = logging.get_logger(__name__)
 
-
-FORCE_FAST_IMAGE_PROCESSOR = ["Qwen2VLImageProcessor"]
-
+# These image processors use Lanczos interpolation, which is not supported by fast image processors.
+# To avoid important differences in outputs, we default to using the slow image processors for these processors.
+DEFAULT_TO_SLOW_IMAGE_PROCESSORS = [
+    "ChameleonImageProcessor",
+    "FlavaImageProcessor",
+    "Idefics3ImageProcessor",
+    "SmolVLMImageProcessor",
+]
 
 if TYPE_CHECKING:
     # This significantly improves completion suggestion performance when
@@ -535,24 +540,20 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
                 image_processor_auto_map = config.auto_map["AutoImageProcessor"]
 
         image_processor_class = None
-        # TODO: @yoni, change logic in v4.52 (when use_fast set to True by default)
         if image_processor_type is not None:
             # if use_fast is not set and the processor was saved with a fast processor, we use it, otherwise we use the slow processor.
             if use_fast is None:
                 use_fast = image_processor_type.endswith("Fast")
-                if not use_fast and image_processor_type in FORCE_FAST_IMAGE_PROCESSOR and is_torchvision_available():
-                    use_fast = True
+                if (
+                    not use_fast
+                    and is_torchvision_available()
+                    and image_processor_type not in DEFAULT_TO_SLOW_IMAGE_PROCESSORS
+                ):
                     logger.warning_once(
                         f"The image processor of type `{image_processor_type}` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. "
                         "This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. "
-                        "Note that this behavior will be extended to all models in a future release."
-                    )
-                if not use_fast:
-                    logger.warning_once(
-                        "Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. "
-                        "`use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. "
-                        "This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`."
                     )
+                    use_fast = True
             if use_fast and not image_processor_type.endswith("Fast"):
                 image_processor_type += "Fast"
             if use_fast and not is_torchvision_available():
diff --git a/src/transformers/models/clip/image_processing_clip_fast.py b/src/transformers/models/clip/image_processing_clip_fast.py
index 665ba49e507f..bd3c16e287cc 100644
--- a/src/transformers/models/clip/image_processing_clip_fast.py
+++ b/src/transformers/models/clip/image_processing_clip_fast.py
@@ -15,6 +15,7 @@
 
 from ...image_processing_utils_fast import BaseImageProcessorFast
 from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling
+from ...processing_utils import ImagesKwargs, Unpack
 from ...utils import auto_docstring
 
 
@@ -34,5 +35,13 @@ class CLIPImageProcessorFast(BaseImageProcessorFast):
     do_normalize = True
     do_convert_rgb = True
 
+    def __init__(self, **kwargs: Unpack[ImagesKwargs]):
+        # for backwards compatibility of KOSMOS-2
+        if "use_square_size" in kwargs and kwargs["use_square_size"]:
+            kwargs["size"] = {"height": self.size["shortest_edge"], "width": self.size["shortest_edge"]}
+            kwargs.pop("use_square_size")
+
+        super().__init__(**kwargs)
+
 
 __all__ = ["CLIPImageProcessorFast"]
diff --git a/src/transformers/models/clipseg/processing_clipseg.py b/src/transformers/models/clipseg/processing_clipseg.py
index ad36ba4e14a2..9d2e0538401a 100644
--- a/src/transformers/models/clipseg/processing_clipseg.py
+++ b/src/transformers/models/clipseg/processing_clipseg.py
@@ -48,14 +48,22 @@ def __call__(self, text=None, images=None, visual_prompt=None, return_tensors=No
         if text is not None and visual_prompt is not None:
             raise ValueError("You have to specify exactly one type of prompt. Either text or visual prompt.")
 
+        output_kwargs = self._merge_kwargs(
+            self.valid_processor_kwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs
+        )
+
         if text is not None:
-            encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)
+            encoding = self.tokenizer(text, return_tensors=return_tensors, **output_kwargs["text_kwargs"])
 
         if visual_prompt is not None:
-            prompt_features = self.image_processor(visual_prompt, return_tensors=return_tensors, **kwargs)
+            prompt_features = self.image_processor(
+                visual_prompt, return_tensors=return_tensors, **output_kwargs["images_kwargs"]
+            )
 
         if images is not None:
-            image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs)
+            image_features = self.image_processor(
+                images, return_tensors=return_tensors, **output_kwargs["images_kwargs"]
+            )
 
         if visual_prompt is not None and images is not None:
             encoding = {
diff --git a/src/transformers/models/efficientloftr/image_processing_efficientloftr_fast.py b/src/transformers/models/efficientloftr/image_processing_efficientloftr_fast.py
index 4c69791f31bf..476ffe051714 100644
--- a/src/transformers/models/efficientloftr/image_processing_efficientloftr_fast.py
+++ b/src/transformers/models/efficientloftr/image_processing_efficientloftr_fast.py
@@ -111,6 +111,7 @@ def _prepare_images_structure(
         **kwargs,
     ) -> ImageInput:
         # we need to handle image pairs validation and flattening
+        images = self.fetch_images(images)
         return flatten_pair_images(images)
 
     def _preprocess(
diff --git a/src/transformers/models/fuyu/image_processing_fuyu_fast.py b/src/transformers/models/fuyu/image_processing_fuyu_fast.py
index dc64a5d50958..0552b474d177 100644
--- a/src/transformers/models/fuyu/image_processing_fuyu_fast.py
+++ b/src/transformers/models/fuyu/image_processing_fuyu_fast.py
@@ -50,6 +50,7 @@
 class FuyuImageProcessorFast(BaseImageProcessorFast):
     do_resize = True
     size = {"height": 1080, "width": 1920}
+    patch_size = {"height": 30, "width": 30}
     resample = PILImageResampling.BILINEAR
     do_pad = True
     padding_value = 1.0
diff --git a/src/transformers/models/idefics2/image_processing_idefics2_fast.py b/src/transformers/models/idefics2/image_processing_idefics2_fast.py
index a945ac623b68..eef8ce77752a 100644
--- a/src/transformers/models/idefics2/image_processing_idefics2_fast.py
+++ b/src/transformers/models/idefics2/image_processing_idefics2_fast.py
@@ -147,6 +147,7 @@ def _prepare_images_structure(self, images: ImageInput, expected_ndims: int = 3)
         """
         Prepare a nested images structure for processing.
         """
+        images = self.fetch_images(images)
         return make_nested_list_of_images(images, expected_ndims=expected_ndims)
 
     def split_images(
diff --git a/src/transformers/models/idefics3/image_processing_idefics3_fast.py b/src/transformers/models/idefics3/image_processing_idefics3_fast.py
index d8faf1cf202e..aee4756ae084 100644
--- a/src/transformers/models/idefics3/image_processing_idefics3_fast.py
+++ b/src/transformers/models/idefics3/image_processing_idefics3_fast.py
@@ -152,6 +152,27 @@ def get_max_height_width(images_list: list[list["torch.Tensor"]]) -> tuple[int,
     return (max_height, max_width)
 
 
+def get_num_channels(images_list: list[list["torch.Tensor"]]) -> int:
+    """
+    Get the number of channels across all images in a batch. Handle empty sublists like in [[], [image]].
+    """
+    for images in images_list:
+        if images:
+            return images[0].shape[0]
+
+    raise ValueError("No images found in the batch.")
+
+
+def get_device_from_images(images_list: list[list["torch.Tensor"]]) -> "torch.device":
+    """
+    Get the device from the first non-empty element in a nested list of images.
+    Handle empty sublists like in [[], [image]].
+    """
+    for images in images_list:
+        if images:
+            return images[0].device
+
+
 def make_pixel_mask(image: "torch.Tensor", output_size: tuple[int, int]) -> "torch.Tensor":
     """
     Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
@@ -183,11 +204,14 @@ class Idefics3ImageProcessorFast(BaseImageProcessorFast):
     do_pad = True
     return_row_col_info = False
     valid_kwargs = Idefics3ImageProcessorKwargs
+    model_input_names = ["pixel_values", "pixel_attention_mask"]
 
     def _prepare_images_structure(self, images: ImageInput, expected_ndims: int = 3) -> ImageInput:
         """
         Prepare a nested images structure for processing.
         """
+        # Checks for `str` in case of URL/local path and optionally loads images
+        images = self.fetch_images(images)
         return make_nested_list_of_images(images, expected_ndims=expected_ndims)
 
     def resize(
@@ -438,18 +462,20 @@ def _preprocess(
             # Get max images per batch
             max_num_images = max(len(images_) for images_ in processed_images)
             max_height, max_width = get_max_height_width(processed_images)
+            num_channels = get_num_channels(processed_images)
+            device = get_device_from_images(processed_images)
 
             processed_images_padded = torch.zeros(
                 len(processed_images),
                 max_num_images,
-                *(processed_images[0][0].shape[0], max_height, max_width),
-                device=processed_images[0][0].device,
+                *(num_channels, max_height, max_width),
+                device=device,
             )
             pixel_attention_masks = torch.zeros(
                 len(processed_images),
                 max_num_images,
                 *(max_height, max_width),
-                device=processed_images[0][0].device,
+                device=device,
             )
             for i, images in enumerate(processed_images):
                 for j, image in enumerate(images):
diff --git a/src/transformers/models/janus/image_processing_janus_fast.py b/src/transformers/models/janus/image_processing_janus_fast.py
index f0e80a78d577..33c6411d1404 100644
--- a/src/transformers/models/janus/image_processing_janus_fast.py
+++ b/src/transformers/models/janus/image_processing_janus_fast.py
@@ -217,10 +217,10 @@ def postprocess(
         if do_normalize and do_rescale and return_tensors == "PIL.Image.Image":
             images = [F.to_pil_image(image) for image in images]
 
-        data = {"pixel_values": images}
         return_tensors = return_tensors if return_tensors != "PIL.Image.Image" else None
+        images = torch.stack(images, dim=0) if return_tensors == "pt" else images
 
-        return BatchFeature(data=data, tensor_type=return_tensors)
+        return BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
 
 
 __all__ = ["JanusImageProcessorFast"]
diff --git a/src/transformers/models/lightglue/image_processing_lightglue_fast.py b/src/transformers/models/lightglue/image_processing_lightglue_fast.py
index 7a3eab39d85c..865e3a4adc25 100644
--- a/src/transformers/models/lightglue/image_processing_lightglue_fast.py
+++ b/src/transformers/models/lightglue/image_processing_lightglue_fast.py
@@ -132,6 +132,7 @@ def _prepare_images_structure(
         **kwargs,
     ) -> ImageInput:
         # we need to handle image pairs validation and flattening
+        images = self.fetch_images(images)
         return flatten_pair_images(images)
 
     def _preprocess(
diff --git a/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py b/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py
index 483a38152384..2cb135160ba3 100644
--- a/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py
+++ b/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py
@@ -69,6 +69,7 @@ def __init__(self, **kwargs: Unpack[LlavaOnevisionImageProcessorKwargs]):
     def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaOnevisionImageProcessorKwargs]) -> BatchFeature:
         if isinstance(images, (tuple, list)) and isinstance(images[0], (tuple, list)):
             # if the first element is a list, we assume that all elements are lists
+            images = [x for x in images if x]  # handle text-only case
             batch_num_images = [len(x) for x in images]
         elif isinstance(images, (tuple, list)):
             # treat this as a single-image case for backward compatibility
diff --git a/src/transformers/models/llava_onevision/modular_llava_onevision.py b/src/transformers/models/llava_onevision/modular_llava_onevision.py
index 50c9f80746e5..f816e5d2b8f4 100644
--- a/src/transformers/models/llava_onevision/modular_llava_onevision.py
+++ b/src/transformers/models/llava_onevision/modular_llava_onevision.py
@@ -121,6 +121,7 @@ def pad_to_square(
     def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaOnevisionImageProcessorKwargs]) -> BatchFeature:
         if isinstance(images, (tuple, list)) and isinstance(images[0], (tuple, list)):
             # if the first element is a list, we assume that all elements are lists
+            images = [x for x in images if x]  # handle text-only case
             batch_num_images = [len(x) for x in images]
         elif isinstance(images, (tuple, list)):
             # treat this as a single-image case for backward compatibility
diff --git a/src/transformers/models/mllama/image_processing_mllama_fast.py b/src/transformers/models/mllama/image_processing_mllama_fast.py
index 0144cd485fcb..3989da135c22 100644
--- a/src/transformers/models/mllama/image_processing_mllama_fast.py
+++ b/src/transformers/models/mllama/image_processing_mllama_fast.py
@@ -214,6 +214,7 @@ class MllamaImageProcessorFast(BaseImageProcessorFast):
     do_pad = True
     max_image_tiles = 4
     valid_kwargs = MllamaImageProcessorKwargs
+    model_input_names = ["pixel_values", "num_tiles", "aspect_ratio_ids", "aspect_ratio_mask"]
 
     def __init__(self, **kwargs: Unpack[MllamaImageProcessorKwargs]):
         super().__init__(**kwargs)
diff --git a/src/transformers/models/smolvlm/image_processing_smolvlm_fast.py b/src/transformers/models/smolvlm/image_processing_smolvlm_fast.py
index 5b398b6bb15f..862677e300e3 100644
--- a/src/transformers/models/smolvlm/image_processing_smolvlm_fast.py
+++ b/src/transformers/models/smolvlm/image_processing_smolvlm_fast.py
@@ -153,6 +153,27 @@ def get_max_height_width(images_list: list[list["torch.Tensor"]]) -> tuple[int,
     return (max_height, max_width)
 
 
+def get_num_channels(images_list: list[list["torch.Tensor"]]) -> int:
+    """
+    Get the number of channels across all images in a batch. Handle empty sublists like in [[], [image]].
+    """
+    for images in images_list:
+        if images:
+            return images[0].shape[0]
+
+    raise ValueError("No images found in the batch.")
+
+
+def get_device_from_images(images_list: list[list["torch.Tensor"]]) -> "torch.device":
+    """
+    Get the device from the first non-empty element in a nested list of images.
+    Handle empty sublists like in [[], [image]].
+    """
+    for images in images_list:
+        if images:
+            return images[0].device
+
+
 @auto_docstring
 class SmolVLMImageProcessorFast(BaseImageProcessorFast):
     resample = PILImageResampling.LANCZOS
@@ -168,11 +189,14 @@ class SmolVLMImageProcessorFast(BaseImageProcessorFast):
     do_pad = True
     return_row_col_info = False
     valid_kwargs = SmolVLMImageProcessorKwargs
+    model_input_names = ["pixel_values", "pixel_attention_mask"]
 
     def _prepare_images_structure(self, images: ImageInput, expected_ndims: int = 3) -> ImageInput:
         """
         Prepare a nested images structure for processing.
         """
+        # Checks for `str` in case of URL/local path and optionally loads images
+        images = self.fetch_images(images)
         return make_nested_list_of_images(images, expected_ndims=expected_ndims)
 
     def resize(
@@ -423,18 +447,20 @@ def _preprocess(
             # Get max images per batch
             max_num_images = max(len(images_) for images_ in processed_images)
             max_height, max_width = get_max_height_width(processed_images)
+            num_channels = get_num_channels(processed_images)
+            device = get_device_from_images(processed_images)
 
             processed_images_padded = torch.zeros(
                 len(processed_images),
                 max_num_images,
-                *(processed_images[0][0].shape[0], max_height, max_width),
-                device=processed_images[0][0].device,
+                *(num_channels, max_height, max_width),
+                device=device,
             )
             pixel_attention_masks = torch.zeros(
                 len(processed_images),
                 max_num_images,
                 *(max_height, max_width),
-                device=processed_images[0][0].device,
+                device=device,
             )
             for i, images in enumerate(processed_images):
                 for j, image in enumerate(images):
diff --git a/src/transformers/models/superglue/image_processing_superglue_fast.py b/src/transformers/models/superglue/image_processing_superglue_fast.py
index 9409b57ca223..7dbdc0873548 100644
--- a/src/transformers/models/superglue/image_processing_superglue_fast.py
+++ b/src/transformers/models/superglue/image_processing_superglue_fast.py
@@ -118,6 +118,7 @@ def _prepare_images_structure(
         **kwargs,
     ) -> ImageInput:
         # we need to handle image pairs validation and flattening
+        images = self.fetch_images(images)
         return flatten_pair_images(images)
 
     def _preprocess(
diff --git a/src/transformers/models/tvp/image_processing_tvp_fast.py b/src/transformers/models/tvp/image_processing_tvp_fast.py
index be897094db0c..02cf41981709 100644
--- a/src/transformers/models/tvp/image_processing_tvp_fast.py
+++ b/src/transformers/models/tvp/image_processing_tvp_fast.py
@@ -83,6 +83,7 @@ def _prepare_images_structure(
         Returns:
             `ImageInput`: The images with a valid nesting.
         """
+        images = self.fetch_images(images)
         return make_nested_list_of_images(images, **kwargs)
 
     def resize(
diff --git a/src/transformers/models/yolos/image_processing_yolos_fast.py b/src/transformers/models/yolos/image_processing_yolos_fast.py
index 2e3a2b3e672f..780ba4559dbb 100644
--- a/src/transformers/models/yolos/image_processing_yolos_fast.py
+++ b/src/transformers/models/yolos/image_processing_yolos_fast.py
@@ -31,11 +31,14 @@
     validate_annotations,
 )
 from ...processing_utils import Unpack
-from ...utils import TensorType, auto_docstring
+from ...utils import TensorType, auto_docstring, is_torch_available
 from ...utils.import_utils import requires
 from .image_processing_yolos import YolosImageProcessorKwargs
 
 
+if is_torch_available():
+    from torch import nn
+
 SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
 
 
@@ -653,11 +656,11 @@ def _preprocess(
         return encoded_inputs
 
     def post_process_object_detection(
-        self, outputs, threshold: float = 0.5, target_sizes: TensorType | list[tuple] = None, top_k: int = 100
+        self, outputs, threshold: float = 0.5, target_sizes: TensorType | list[tuple] = None
     ):
         """
-        Converts the raw output of [`YolosForObjectDetection`] into final bounding boxes in (top_left_x,
-        top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch.
+        Converts the raw output of [`YolosForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
+        bottom_right_x, bottom_right_y) format. Only supports PyTorch.
 
         Args:
             outputs ([`YolosObjectDetectionOutput`]):
@@ -666,10 +669,7 @@ def post_process_object_detection(
                 Score threshold to keep object detection predictions.
             target_sizes (`torch.Tensor` or `list[tuple[int, int]]`, *optional*):
                 Tensor of shape `(batch_size, 2)` or list of tuples (`tuple[int, int]`) containing the target size
-                (height, width) of each image in the batch. If left to None, predictions will not be resized.
-            top_k (`int`, *optional*, defaults to 100):
-                Keep only top k bounding boxes before filtering by thresholding.
-
+                `(height, width)` of each image in the batch. If unset, predictions will not be resized.
         Returns:
             `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
             in the batch as predicted by the model.
@@ -682,23 +682,20 @@ def post_process_object_detection(
                     "Make sure that you pass in as many target sizes as the batch dimension of the logits"
                 )
 
-        prob = out_logits.sigmoid()
-        prob = prob.view(out_logits.shape[0], -1)
-        k_value = min(top_k, prob.size(1))
-        topk_values, topk_indexes = torch.topk(prob, k_value, dim=1)
-        scores = topk_values
-        topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor")
-        labels = topk_indexes % out_logits.shape[2]
+        prob = nn.functional.softmax(out_logits, -1)
+        scores, labels = prob[..., :-1].max(-1)
+
+        # Convert to [x0, y0, x1, y1] format
         boxes = center_to_corners_format(out_bbox)
-        boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
 
-        # and from relative [0, 1] to absolute [0, height] coordinates
+        # Convert from relative [0, 1] to absolute [0, height] coordinates
         if target_sizes is not None:
             if isinstance(target_sizes, list):
                 img_h = torch.Tensor([i[0] for i in target_sizes])
                 img_w = torch.Tensor([i[1] for i in target_sizes])
             else:
                 img_h, img_w = target_sizes.unbind(1)
+
             scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
             boxes = boxes * scale_fct[:, None, :]
 
diff --git a/src/transformers/models/yolos/modular_yolos.py b/src/transformers/models/yolos/modular_yolos.py
index 38ee570b0c31..569625b1bd52 100644
--- a/src/transformers/models/yolos/modular_yolos.py
+++ b/src/transformers/models/yolos/modular_yolos.py
@@ -5,10 +5,16 @@
 from ...image_transforms import center_to_corners_format
 from ...utils import (
     TensorType,
+    is_torch_available,
     logging,
 )
 
 
+if is_torch_available():
+    import torch
+    from torch import nn
+
+
 logger = logging.get_logger(__name__)
 
 
@@ -63,11 +69,11 @@ def get_size_with_aspect_ratio(
 
 class YolosImageProcessorFast(DetrImageProcessorFast):
     def post_process_object_detection(
-        self, outputs, threshold: float = 0.5, target_sizes: TensorType | list[tuple] = None, top_k: int = 100
+        self, outputs, threshold: float = 0.5, target_sizes: TensorType | list[tuple] = None
     ):
         """
-        Converts the raw output of [`YolosForObjectDetection`] into final bounding boxes in (top_left_x,
-        top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch.
+        Converts the raw output of [`YolosForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
+        bottom_right_x, bottom_right_y) format. Only supports PyTorch.
 
         Args:
             outputs ([`YolosObjectDetectionOutput`]):
@@ -76,10 +82,7 @@ def post_process_object_detection(
                 Score threshold to keep object detection predictions.
             target_sizes (`torch.Tensor` or `list[tuple[int, int]]`, *optional*):
                 Tensor of shape `(batch_size, 2)` or list of tuples (`tuple[int, int]`) containing the target size
-                (height, width) of each image in the batch. If left to None, predictions will not be resized.
-            top_k (`int`, *optional*, defaults to 100):
-                Keep only top k bounding boxes before filtering by thresholding.
-
+                `(height, width)` of each image in the batch. If unset, predictions will not be resized.
         Returns:
             `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
             in the batch as predicted by the model.
@@ -92,23 +95,20 @@ def post_process_object_detection(
                     "Make sure that you pass in as many target sizes as the batch dimension of the logits"
                 )
 
-        prob = out_logits.sigmoid()
-        prob = prob.view(out_logits.shape[0], -1)
-        k_value = min(top_k, prob.size(1))
-        topk_values, topk_indexes = torch.topk(prob, k_value, dim=1)
-        scores = topk_values
-        topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor")
-        labels = topk_indexes % out_logits.shape[2]
+        prob = nn.functional.softmax(out_logits, -1)
+        scores, labels = prob[..., :-1].max(-1)
+
+        # Convert to [x0, y0, x1, y1] format
         boxes = center_to_corners_format(out_bbox)
-        boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
 
-        # and from relative [0, 1] to absolute [0, height] coordinates
+        # Convert from relative [0, 1] to absolute [0, height] coordinates
         if target_sizes is not None:
             if isinstance(target_sizes, list):
                 img_h = torch.Tensor([i[0] for i in target_sizes])
                 img_w = torch.Tensor([i[1] for i in target_sizes])
             else:
                 img_h, img_w = target_sizes.unbind(1)
+
             scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
             boxes = boxes * scale_fct[:, None, :]
 
diff --git a/tests/models/altclip/test_modeling_altclip.py b/tests/models/altclip/test_modeling_altclip.py
index d6ddf36295fd..c1f46dc5cdc5 100755
--- a/tests/models/altclip/test_modeling_altclip.py
+++ b/tests/models/altclip/test_modeling_altclip.py
@@ -521,7 +521,11 @@ def test_inference_interpolate_pos_encoding(self):
         self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape)
 
         expected_slice = torch.tensor(
-            [[-0.3589, -0.5939, 0.3534], [0.4346, 0.1647, 0.7071], [1.1404, -0.4716, 0.1664]]
+            [
+                [-0.3577, -0.5977, 0.3555],
+                [0.4544, 0.1660, 0.6583],
+                [1.1715, -0.4870, 0.1645],
+            ]
         ).to(torch_device)
 
         torch.testing.assert_close(
diff --git a/tests/models/aria/test_processing_aria.py b/tests/models/aria/test_processing_aria.py
index 1dbe7a6a7946..0f64fdcf59ff 100644
--- a/tests/models/aria/test_processing_aria.py
+++ b/tests/models/aria/test_processing_aria.py
@@ -265,7 +265,7 @@ def test_image_chat_template_accepts_processing_kwargs(self):
             tokenize=True,
             return_dict=True,
             max_image_size=980,
-            return_tensors="np",
+            return_tensors="pt",
         )
         self.assertListEqual(list(out_dict[self.images_input_name].shape), [1, 3, 980, 980])
 
diff --git a/tests/models/auto/test_image_processing_auto.py b/tests/models/auto/test_image_processing_auto.py
index 61abc1caafb7..b9d627071fe9 100644
--- a/tests/models/auto/test_image_processing_auto.py
+++ b/tests/models/auto/test_image_processing_auto.py
@@ -27,6 +27,7 @@
     AutoImageProcessor,
     CLIPConfig,
     CLIPImageProcessor,
+    CLIPImageProcessorFast,
     ViTImageProcessor,
     ViTImageProcessorFast,
 )
@@ -43,10 +44,12 @@ class AutoImageProcessorTest(unittest.TestCase):
     def setUp(self):
         transformers.dynamic_module_utils.TIME_OUT_REMOTE_CODE = 0
 
+    @require_torchvision
     def test_image_processor_from_model_shortcut(self):
         config = AutoImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
-        self.assertIsInstance(config, CLIPImageProcessor)
+        self.assertIsInstance(config, CLIPImageProcessorFast)
 
+    @require_torchvision
     def test_image_processor_from_local_directory_from_key(self):
         with tempfile.TemporaryDirectory() as tmpdirname:
             processor_tmpfile = Path(tmpdirname) / "preprocessor_config.json"
@@ -58,8 +61,9 @@ def test_image_processor_from_local_directory_from_key(self):
             json.dump({"model_type": "clip"}, open(config_tmpfile, "w"))
 
             config = AutoImageProcessor.from_pretrained(tmpdirname)
-            self.assertIsInstance(config, CLIPImageProcessor)
+            self.assertIsInstance(config, CLIPImageProcessorFast)
 
+    @require_torchvision
     def test_image_processor_from_local_directory_from_feature_extractor_key(self):
         # Ensure we can load the image processor from the feature extractor config
         # Though we don't have any `CLIPFeatureExtractor` class, we can't be sure that
@@ -74,8 +78,9 @@ def test_image_processor_from_local_directory_from_feature_extractor_key(self):
             json.dump({"model_type": "clip"}, open(config_tmpfile, "w"))
 
             config = AutoImageProcessor.from_pretrained(tmpdirname)
-            self.assertIsInstance(config, CLIPImageProcessor)
+            self.assertIsInstance(config, CLIPImageProcessorFast)
 
+    @require_torchvision
     def test_image_processor_from_new_filename(self):
         with tempfile.TemporaryDirectory() as tmpdirname:
             processor_tmpfile = Path(tmpdirname) / "preprocessor_config.json"
@@ -87,8 +92,10 @@ def test_image_processor_from_new_filename(self):
             json.dump({"model_type": "clip"}, open(config_tmpfile, "w"))
 
             config = AutoImageProcessor.from_pretrained(tmpdirname)
-            self.assertIsInstance(config, CLIPImageProcessor)
+            # Now loading fast image processor by default
+            self.assertIsInstance(config, CLIPImageProcessorFast)
 
+    @require_torchvision
     def test_image_processor_from_local_directory_from_config(self):
         with tempfile.TemporaryDirectory() as tmpdirname:
             model_config = CLIPConfig()
@@ -118,8 +125,9 @@ def test_image_processor_from_local_directory_from_config(self):
             dict_as_saved = json.loads(config.to_json_string())
             self.assertTrue("_processor_class" not in dict_as_saved)
 
-        self.assertIsInstance(config, CLIPImageProcessor)
+        self.assertIsInstance(config, CLIPImageProcessorFast)
 
+    @require_torchvision
     def test_image_processor_from_local_file(self):
         with tempfile.TemporaryDirectory() as tmpdirname:
             processor_tmpfile = Path(tmpdirname) / "preprocessor_config.json"
@@ -129,7 +137,7 @@ def test_image_processor_from_local_file(self):
             )
 
             config = AutoImageProcessor.from_pretrained(processor_tmpfile)
-            self.assertIsInstance(config, CLIPImageProcessor)
+            self.assertIsInstance(config, CLIPImageProcessorFast)
 
     def test_repo_not_found(self):
         with self.assertRaisesRegex(
@@ -155,10 +163,9 @@ def test_image_processor_not_found(self):
     def test_use_fast_selection(self):
         checkpoint = "hf-internal-testing/tiny-random-vit"
 
-        # TODO: @yoni, change in v4.48 (when use_fast set to True by default)
-        # Slow image processor is selected by default
+        # Fast image processor is selected by default
         image_processor = AutoImageProcessor.from_pretrained(checkpoint)
-        self.assertIsInstance(image_processor, ViTImageProcessor)
+        self.assertIsInstance(image_processor, ViTImageProcessorFast)
 
         # Fast image processor is selected when use_fast=True
         image_processor = AutoImageProcessor.from_pretrained(checkpoint, use_fast=True)
diff --git a/tests/models/auto/test_processor_auto.py b/tests/models/auto/test_processor_auto.py
index a177ebc941e3..2f76fcaa0457 100644
--- a/tests/models/auto/test_processor_auto.py
+++ b/tests/models/auto/test_processor_auto.py
@@ -423,7 +423,7 @@ def test_auto_processor_creates_tokenizer(self):
 
     def test_auto_processor_creates_image_processor(self):
         processor = AutoProcessor.from_pretrained("hf-internal-testing/tiny-random-convnext")
-        self.assertEqual(processor.__class__.__name__, "ConvNextImageProcessor")
+        self.assertEqual(processor.__class__.__name__, "ConvNextImageProcessorFast")
 
     def test_auto_processor_save_load(self):
         processor = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
diff --git a/tests/models/aya_vision/test_modeling_aya_vision.py b/tests/models/aya_vision/test_modeling_aya_vision.py
index b259255d022f..b5e659826a21 100644
--- a/tests/models/aya_vision/test_modeling_aya_vision.py
+++ b/tests/models/aya_vision/test_modeling_aya_vision.py
@@ -395,8 +395,7 @@ def test_small_model_integration_batched_generate(self):
             {
                 ("xpu", 3): "Wooden bridge stretches\nInto still waters, mountains gleam\nPeaceful forest scene",
                 # 4-bit
-                ("cuda", 7): "Wooden bridge stretches\nMirrored lake below, mountains rise\nPeaceful, serene",
-                ("cuda", 8): 'Wooden path to water,\nMountains echo in stillness,\nPeaceful forest scene.',
+                ("cuda", 8): "Wooden path to water,\nMountains echo in stillness,\nPeaceful forest lake.",
             }
         )  # fmt: skip
         expected_output = expected_outputs.get_expectation()
@@ -474,8 +473,7 @@ def test_small_model_integration_batched_generate_multi_image(self):
         expected_outputs = Expectations(
             {
                 ("xpu", 3): "Wooden path to water,\nMountains echo in stillness,\nPeaceful forest scene.",
-                ("cuda", 7): 'Wooden bridge stretches\nMirrored lake below, mountains rise\nPeaceful, serene',
-                ("cuda", 8): 'Wooden path to water,\nMountains echo in stillness,\nPeaceful forest scene.',
+                ("cuda", 8): "Wooden path to water,\nMountains echo in stillness,\nPeaceful forest lake.",
             }
         )  # fmt: skip
         expected_output = expected_outputs.get_expectation()
@@ -491,7 +489,6 @@ def test_small_model_integration_batched_generate_multi_image(self):
         expected_outputs = Expectations(
             {
                 ("xpu", 3): "The first image showcases the Statue of Liberty, a colossal neoclassical sculpture on Liberty Island in New York Harbor. Standing at ",
-                ("cuda", 7): 'The first image showcases the Statue of Liberty, a monumental sculpture located on Liberty Island in New York Harbor. Standing atop a',
                 ("cuda", 8): 'The first image showcases the Statue of Liberty, a colossal neoclassical sculpture on Liberty Island in New York Harbor. Standing at ',
             }
         )  # fmt: skip
diff --git a/tests/models/chinese_clip/test_modeling_chinese_clip.py b/tests/models/chinese_clip/test_modeling_chinese_clip.py
index fd3039d8b027..739352bf2832 100644
--- a/tests/models/chinese_clip/test_modeling_chinese_clip.py
+++ b/tests/models/chinese_clip/test_modeling_chinese_clip.py
@@ -623,7 +623,11 @@ def test_inference_interpolate_pos_encoding(self):
         self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape)
 
         expected_slice = torch.tensor(
-            [[-0.3990, 0.2983, -0.1239], [-0.1452, -0.2759, 0.0403], [-0.3149, -0.4763, 0.8555]]
+            [
+                [-0.3997, 0.2982, -0.1240],
+                [-0.1455, -0.2749, 0.0397],
+                [-0.3095, -0.4702, 0.8512],
+            ]
         ).to(torch_device)
 
         torch.testing.assert_close(
diff --git a/tests/models/clip/test_modeling_clip.py b/tests/models/clip/test_modeling_clip.py
index 5cab257ecb63..ceda8e9cf3fa 100644
--- a/tests/models/clip/test_modeling_clip.py
+++ b/tests/models/clip/test_modeling_clip.py
@@ -721,7 +721,11 @@ def test_inference_interpolate_pos_encoding(self):
         self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape)
 
         expected_slice = torch.tensor(
-            [[-0.1538, 0.0322, -0.3235], [0.2893, 0.1135, -0.5708], [0.0461, 0.1540, -0.6018]]
+            [
+                [-0.1552, 0.0314, -0.3233],
+                [0.2886, 0.1141, -0.5706],
+                [0.0468, 0.1570, -0.6028],
+            ]
         ).to(torch_device)
 
         torch.testing.assert_close(
diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py
index 9cdf45062b10..7fdeb0d817bf 100644
--- a/tests/models/clipseg/test_modeling_clipseg.py
+++ b/tests/models/clipseg/test_modeling_clipseg.py
@@ -559,7 +559,7 @@ def test_inference_image_segmentation(self):
 
         # verify conditional and pooled output
         expected_conditional = torch.tensor([0.5601, -0.0314, 0.1980]).to(torch_device)
-        expected_pooled_output = torch.tensor([0.5036, -0.2681, -0.2644]).to(torch_device)
+        expected_pooled_output = torch.tensor([0.4986, -0.2698, -0.2631]).to(torch_device)
         torch.testing.assert_close(outputs.conditional_embeddings[0, :3], expected_conditional, rtol=1e-3, atol=1e-3)
         torch.testing.assert_close(outputs.pooled_output[0, :3], expected_pooled_output, rtol=1e-3, atol=1e-3)
 
@@ -593,7 +593,11 @@ def test_inference_interpolate_pos_encoding(self):
         self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape)
 
         expected_slice = torch.tensor(
-            [[-0.1538, 0.0322, -0.3235], [0.2893, 0.1135, -0.5708], [0.0461, 0.1540, -0.6018]]
+            [
+                [-0.1552, 0.0314, -0.3233],
+                [0.2886, 0.1141, -0.5706],
+                [0.0468, 0.1569, -0.6028],
+            ]
         ).to(torch_device)
 
         torch.testing.assert_close(
diff --git a/tests/models/convnext/test_modeling_convnext.py b/tests/models/convnext/test_modeling_convnext.py
index ce90bb537794..cef8ccbedcf3 100644
--- a/tests/models/convnext/test_modeling_convnext.py
+++ b/tests/models/convnext/test_modeling_convnext.py
@@ -284,7 +284,7 @@ def test_inference_image_classification_head(self):
         expected_shape = torch.Size((1, 1000))
         self.assertEqual(outputs.logits.shape, expected_shape)
 
-        expected_slice = torch.tensor([-0.0261, -0.4739, 0.1910]).to(torch_device)
+        expected_slice = torch.tensor([-0.0267, -0.4735, 0.1901]).to(torch_device)
 
         torch.testing.assert_close(outputs.logits[0, :3], expected_slice, rtol=2e-4, atol=2e-4)
 
diff --git a/tests/models/convnextv2/test_modeling_convnextv2.py b/tests/models/convnextv2/test_modeling_convnextv2.py
index 86592528c257..4386d33633d5 100644
--- a/tests/models/convnextv2/test_modeling_convnextv2.py
+++ b/tests/models/convnextv2/test_modeling_convnextv2.py
@@ -304,5 +304,5 @@ def test_inference_image_classification_head(self):
         expected_shape = torch.Size((1, 1000))
         self.assertEqual(outputs.logits.shape, expected_shape)
 
-        expected_slice = torch.tensor([0.9996, 0.1966, -0.4386]).to(torch_device)
+        expected_slice = torch.tensor([0.9989, 0.1953, -0.4382]).to(torch_device)
         torch.testing.assert_close(outputs.logits[0, :3], expected_slice, rtol=1e-4, atol=1e-4)
diff --git a/tests/models/cvt/test_modeling_cvt.py b/tests/models/cvt/test_modeling_cvt.py
index 80a16ba8c115..d5338e018df5 100644
--- a/tests/models/cvt/test_modeling_cvt.py
+++ b/tests/models/cvt/test_modeling_cvt.py
@@ -267,6 +267,6 @@ def test_inference_image_classification_head(self):
         expected_shape = torch.Size((1, 1000))
         self.assertEqual(outputs.logits.shape, expected_shape)
 
-        expected_slice = torch.tensor([0.9287, 0.9016, -0.3152]).to(torch_device)
+        expected_slice = torch.tensor([0.9282, 0.9025, -0.3145]).to(torch_device)
 
         torch.testing.assert_close(outputs.logits[0, :3], expected_slice, rtol=2e-4, atol=2e-4)
diff --git a/tests/models/dinov2/test_modeling_dinov2.py b/tests/models/dinov2/test_modeling_dinov2.py
index 0c5f56ec4d0a..e3c6673a60b4 100644
--- a/tests/models/dinov2/test_modeling_dinov2.py
+++ b/tests/models/dinov2/test_modeling_dinov2.py
@@ -317,7 +317,11 @@ def test_inference_no_head(self):
         self.assertEqual(outputs.last_hidden_state.shape, expected_shape)
 
         expected_slice = torch.tensor(
-            [[-2.2005, -0.4495, 1.0964], [-3.3959, -0.8942, -1.0315], [-2.9355, 1.1564, -0.7656]],
+            [
+                [-2.2001, -0.4484, 1.0978],
+                [-3.3830, -0.8899, -1.0198],
+                [-2.9340, 1.1604, -0.7696],
+            ],
             device=torch_device,
         )
         torch.testing.assert_close(outputs.last_hidden_state[0, :3, :3], expected_slice, rtol=1e-3, atol=1e-3)
diff --git a/tests/models/dinov2_with_registers/test_modeling_dinov2_with_registers.py b/tests/models/dinov2_with_registers/test_modeling_dinov2_with_registers.py
index 3e6916d3a890..e335e54842e2 100644
--- a/tests/models/dinov2_with_registers/test_modeling_dinov2_with_registers.py
+++ b/tests/models/dinov2_with_registers/test_modeling_dinov2_with_registers.py
@@ -332,7 +332,11 @@ def test_inference_no_head(self):
         self.assertEqual(outputs.last_hidden_state.shape, expected_shape)
 
         expected_slice = torch.tensor(
-            [[-0.4636, -1.4582, -0.0274], [-1.4738, -0.8858, 0.3002], [0.0714, -0.2407, -1.5940]],
+            [
+                [-0.4638, -1.4563, -0.0289],
+                [-1.4736, -0.8866, 0.3005],
+                [0.0720, -0.2406, -1.5943],
+            ],
             device=torch_device,
         )
         torch.testing.assert_close(outputs.last_hidden_state[0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4)
diff --git a/tests/models/emu3/test_processing_emu3.py b/tests/models/emu3/test_processing_emu3.py
index 9b1fa66d0a62..f5267336b332 100644
--- a/tests/models/emu3/test_processing_emu3.py
+++ b/tests/models/emu3/test_processing_emu3.py
@@ -76,12 +76,12 @@ def test_processor_postprocess(self):
         orig_image_input = self.prepare_image_inputs()
         orig_image = np.array(orig_image_input).transpose(2, 0, 1)
 
-        inputs = processor(text=input_str, images=orig_image, do_resize=False, return_tensors="np")
+        inputs = processor(text=input_str, images=orig_image, do_resize=False, return_tensors="pt")
         normalized_image_input = inputs.pixel_values
-        unnormalized_images = processor.postprocess(normalized_image_input, return_tensors="np")["pixel_values"]
+        unnormalized_images = processor.postprocess(normalized_image_input, return_tensors="pt")["pixel_values"]
 
         # For an image where pixels go from 0 to 255 the diff can be 1 due to some numerical precision errors when scaling and unscaling
-        self.assertTrue(np.abs(orig_image - unnormalized_images).max() >= 1)
+        self.assertTrue(np.abs(orig_image - unnormalized_images.numpy()).max() >= 1)
 
     # Copied from tests.models.llava.test_processing_llava.LlavaProcessorTest.test_get_num_vision_tokens
     def test_get_num_vision_tokens(self):
diff --git a/tests/models/flava/test_modeling_flava.py b/tests/models/flava/test_modeling_flava.py
index 9bcf54a79de2..58e2785938c6 100644
--- a/tests/models/flava/test_modeling_flava.py
+++ b/tests/models/flava/test_modeling_flava.py
@@ -1118,9 +1118,9 @@ def test_inference(self):
             outputs = model(**inputs, return_dict=True)
 
         # verify the embeddings
-        self.assertAlmostEqual(outputs.image_embeddings.sum().item(), -1352.53540, places=4)
+        self.assertAlmostEqual(outputs.image_embeddings.sum().item(), -1352.4685, places=4)
         self.assertAlmostEqual(outputs.text_embeddings.sum().item(), -198.98225, places=4)
-        self.assertAlmostEqual(outputs.multimodal_embeddings.sum().item(), -4030.4604492, places=4)
+        self.assertAlmostEqual(outputs.multimodal_embeddings.sum().item(), -4030.4226, places=4)
 
 
 @require_vision
diff --git a/tests/models/florence2/test_modeling_florence2.py b/tests/models/florence2/test_modeling_florence2.py
index 2db7c7eb6bf4..c2dc6fc436a7 100644
--- a/tests/models/florence2/test_modeling_florence2.py
+++ b/tests/models/florence2/test_modeling_florence2.py
@@ -327,15 +327,15 @@ def test_base_model_batching_inference_eager(self):
         predictions = model.generate(**inputs, do_sample=False, max_new_tokens=100)
 
         EXPECTED_PREDICTION_IDS = [
-            [2, 0, 50269, 50269, 51267, 50980, 50269, 50269, 50688, 50942, 50269, 50333, 50633, 50941, 51033, 50269, 51267, 50934, 50794, 50814, 51190, 51032, 50432, 50402, 50634, 50692, 50269, 50334, 50340, 50927, 51224, 50417, 51267, 50930, 51076, 50944, 51159, 51028, 50836, 50947, 50915, 51030, 2],
-            [2, 0, 28884,  2507, 50413, 50839, 51139, 51047, 28884,  2507, 50980, 50842, 51135, 51043, 28884, 2507, 50417, 50848, 50573, 51043, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+            [2, 0, 50269, 50269, 51267, 50980, 50269, 50269, 50688, 50942, 50269, 50333, 50633, 50941, 51033, 50269, 51267, 50934, 50794, 50814, 51190, 51032, 50432, 50402, 50634, 50692, 50269, 50334, 50340, 50927, 51224, 50417, 51267, 50930, 51075, 50944, 51159, 51028, 50836, 50947, 50915, 51030, 2],
+            [2, 0, 28884, 2507, 50413, 50839, 51139, 51047, 28884, 2507, 50980, 50842, 51135, 51043, 28884, 2507, 50417, 50848, 50573, 51043, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         ]  # fmt: skip
         self.assertEqual(predictions.tolist(), EXPECTED_PREDICTION_IDS)
 
         generated_texts = processor.batch_decode(predictions, skip_special_tokens=False)
 
         EXPECTED_GENERATED_TEXTS = [
-            "</s><s><loc_0><loc_0><loc_998><loc_711><loc_0><loc_0><loc_419><loc_673><loc_0><loc_64><loc_364><loc_672><loc_764><loc_0><loc_998><loc_665><loc_525><loc_545><loc_921><loc_763><loc_163><loc_133><loc_365><loc_423><loc_0><loc_65><loc_71><loc_658><loc_955><loc_148><loc_998><loc_661><loc_807><loc_675><loc_890><loc_759><loc_567><loc_678><loc_646><loc_761></s>",
+            "</s><s><loc_0><loc_0><loc_998><loc_711><loc_0><loc_0><loc_419><loc_673><loc_0><loc_64><loc_364><loc_672><loc_764><loc_0><loc_998><loc_665><loc_525><loc_545><loc_921><loc_763><loc_163><loc_133><loc_365><loc_423><loc_0><loc_65><loc_71><loc_658><loc_955><loc_148><loc_998><loc_661><loc_806><loc_675><loc_890><loc_759><loc_567><loc_678><loc_646><loc_761></s>",
             "</s><s>wheels<loc_144><loc_570><loc_870><loc_778>wheels<loc_711><loc_573><loc_866><loc_774>wheels<loc_148><loc_579><loc_304><loc_774></s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>",
         ]
         self.assertEqual(generated_texts, EXPECTED_GENERATED_TEXTS)
@@ -343,8 +343,23 @@ def test_base_model_batching_inference_eager(self):
         parsed_answer_0 = processor.post_process_generation(
             generated_texts[0], task="<REGION_PROPOSAL>", image_size=(images[0].width, images[0].height)
         )
-        EXPECTED_PARSED_ANSWER_0 = {"<REGION_PROPOSAL>": {"bboxes": [[0, 0, 1298, 623], [0, 0, 545, 589], [0, 56, 473, 589], [993, 0, 1298, 582], [683, 477, 1197, 668], [212, 116, 475, 370], [0, 57, 92, 576], [1242, 130, 1298, 579], [1049, 591, 1157, 665], [737, 594, 840, 667]], "labels": ["", "", "", "", "", "", "", "", "", ""]}}  # fmt: skip
-
+        EXPECTED_PARSED_ANSWER_0 = {
+            "<REGION_PROPOSAL>": {
+                "bboxes": [
+                    [0, 0, 1298, 623],
+                    [0, 0, 545, 589],
+                    [0, 56, 473, 589],
+                    [993, 0, 1298, 582],
+                    [683, 477, 1197, 668],
+                    [212, 116, 475, 370],
+                    [0, 57, 92, 576],
+                    [1242, 130, 1298, 579],
+                    [1048, 591, 1157, 665],
+                    [737, 594, 840, 667],
+                ],
+                "labels": ["", "", "", "", "", "", "", "", "", ""],
+            }
+        }
         self.assertEqual(parsed_answer_0, EXPECTED_PARSED_ANSWER_0)
 
         parsed_answer_1 = processor.post_process_generation(
@@ -471,23 +486,37 @@ def test_large_model_batching_inference_eager(self):
         predictions = model.generate(**inputs, max_new_tokens=100)
 
         EXPECTED_PREDICTION_IDS = [
-            [2, 0, 0, 0, 50269, 50269, 51268, 50944, 50269, 50269, 50579, 50940, 51032, 50269, 51268, 50932, 50793, 50813, 51190, 51031, 50432, 50401, 50632, 50691, 51071, 50943, 51159, 51027, 50835, 50946, 50915, 51029, 2],
-            [2, 0, 5901, 50321, 50603, 51201, 51043, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+            [2, 0, 0, 0, 50269, 50269, 51268, 50944, 50269, 50269, 50631, 50940, 50269, 50269, 50575, 50940, 51032, 50269, 51268, 50932, 50793, 50813, 51190, 51031, 50432, 50401, 50632, 50691, 51071, 50943, 51159, 51027, 50835, 50946, 50915, 51029, 2],
+            [2, 0, 5901, 50321, 50603, 51201, 51043, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
         ]  # fmt: skip
         self.assertEqual(predictions.tolist(), EXPECTED_PREDICTION_IDS)
 
         generated_texts = processor.batch_decode(predictions, skip_special_tokens=False)
 
         EXPECTED_GENERATED_TEXTS = [
-            '</s><s><s><s><loc_0><loc_0><loc_999><loc_675><loc_0><loc_0><loc_310><loc_671><loc_763><loc_0><loc_999><loc_663><loc_524><loc_544><loc_921><loc_762><loc_163><loc_132><loc_363><loc_422><loc_802><loc_674><loc_890><loc_758><loc_566><loc_677><loc_646><loc_760></s>',
-            '</s><s>car<loc_52><loc_334><loc_932><loc_774></s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'
-        ]  # fmt: skip
+            "</s><s><s><s><loc_0><loc_0><loc_999><loc_675><loc_0><loc_0><loc_362><loc_671><loc_0><loc_0><loc_306><loc_671><loc_763><loc_0><loc_999><loc_663><loc_524><loc_544><loc_921><loc_762><loc_163><loc_132><loc_363><loc_422><loc_802><loc_674><loc_890><loc_758><loc_566><loc_677><loc_646><loc_760></s>",
+            "</s><s>car<loc_52><loc_334><loc_932><loc_774></s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>",
+        ]
         self.assertEqual(generated_texts, EXPECTED_GENERATED_TEXTS)
 
         parsed_answer_0 = processor.post_process_generation(
             generated_texts[0], task="<REGION_PROPOSAL>", image_size=(images[0].width, images[0].height)
         )
-        EXPECTED_PARSED_ANSWER_0 = {'<REGION_PROPOSAL>': {'bboxes': [[0, 0, 1299, 591], [0, 0, 403, 588], [992, 0, 1299, 581], [681, 476, 1197, 667], [212, 116, 472, 370], [1043, 590, 1157, 664], [736, 593, 840, 666]], 'labels': ['', '', '', '', '', '', '']}}  # fmt: skip
+        EXPECTED_PARSED_ANSWER_0 = {
+            "<REGION_PROPOSAL>": {
+                "bboxes": [
+                    [0, 0, 1299, 591],
+                    [0, 0, 471, 588],
+                    [0, 0, 398, 588],
+                    [992, 0, 1299, 581],
+                    [681, 476, 1197, 667],
+                    [212, 116, 472, 370],
+                    [1043, 590, 1157, 664],
+                    [736, 593, 840, 666],
+                ],
+                "labels": ["", "", "", "", "", "", "", ""],
+            }
+        }
         self.assertEqual(parsed_answer_0, EXPECTED_PARSED_ANSWER_0)
 
         parsed_answer_1 = processor.post_process_generation(
diff --git a/tests/models/focalnet/test_modeling_focalnet.py b/tests/models/focalnet/test_modeling_focalnet.py
index b39c87721bcf..f1ca7da33211 100644
--- a/tests/models/focalnet/test_modeling_focalnet.py
+++ b/tests/models/focalnet/test_modeling_focalnet.py
@@ -413,7 +413,7 @@ def test_inference_image_classification_head(self):
         expectations = Expectations(
             {
                 (None, None): [0.2166, -0.4368, 0.2191],
-                ("cuda", 8): [0.2168, -0.4367, 0.2190],
+                ("cuda", 8): [0.2180, -0.4355, 0.2198],
             }
         )
         expected_slice = torch.tensor(expectations.get_expectation()).to(torch_device)
diff --git a/tests/models/gemma3/test_modeling_gemma3.py b/tests/models/gemma3/test_modeling_gemma3.py
index d47ce6e1f02a..eabee28c4ff9 100644
--- a/tests/models/gemma3/test_modeling_gemma3.py
+++ b/tests/models/gemma3/test_modeling_gemma3.py
@@ -610,7 +610,7 @@ def test_model_4b_crops(self):
             {
                 ("xpu", 3): ["user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a bright blue sky with some white clouds in the"],
                 ("cuda", (8, 0)): ["user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a blue sky with some white clouds in the background"],
-                ("cuda", (8, 6)): ["user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a bright blue sky with some white clouds in the"],
+                ("cuda", (8, 6)): ['user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There’s a bright blue sky with some white clouds in the'],
                 ("cuda", (9, 0)): ["user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a bright blue sky with some white clouds in the"],
                 ("rocm", (9, 4)): ["user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a bright blue sky with some white clouds in the"],
                 ("rocm", (9, 5)): ["user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a blue sky with some white clouds in the background"]
@@ -727,8 +727,7 @@ def test_model_4b_multiimage(self):
         EXPECTED_TEXTS = Expectations(
             {
                 ("xpu", 3): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image:\n\n**Overall Scene:**\n\nIt looks like a street scene in a city with"],
-                ("cuda", (8, 0)): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image:\n\n**Overall Scene:**\n\nIt looks like a street scene in a vibrant,"],
-                ("cuda", (8, 6)): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image:\n\n**Overall Scene:**\n\nIt appears to be a street scene in a city"],
+                ("cuda", (8, 6)): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image!\n\nHere's a description of the scene:\n\n*   **Chinese Arch"],
                 ("cuda", (9, 0)): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image!\n\nHere's a description of the scene:\n\n*   **Location:**"],
                 ("rocm", (9, 4)): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image:\n\n**Main Features:**\n\n*   **Chinese Archway:** The most prominent"],
                 ("rocm", (9, 5)): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image:\n\n**Main Features:**\n\n*   **Chinese Archway:** The most prominent"],
diff --git a/tests/models/git/test_modeling_git.py b/tests/models/git/test_modeling_git.py
index 4fa301c8c9ee..950e5c77d5e5 100644
--- a/tests/models/git/test_modeling_git.py
+++ b/tests/models/git/test_modeling_git.py
@@ -498,7 +498,11 @@ def test_forward_pass(self):
         expected_shape = torch.Size((1, 201, 30522))
         self.assertEqual(outputs.logits.shape, expected_shape)
         expected_slice = torch.tensor(
-            [[-0.9514, -0.9512, -0.9507], [-0.5454, -0.5453, -0.5453], [-0.8862, -0.8857, -0.8848]],
+            [
+                [-0.9545, -0.9543, -0.9538],
+                [-0.5421, -0.5420, -0.5420],
+                [-0.8865, -0.8861, -0.8851],
+            ],
             device=torch_device,
         )
         torch.testing.assert_close(outputs.logits[0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4)
@@ -521,7 +525,7 @@ def test_inference_image_captioning(self):
         self.assertEqual(outputs.sequences.shape, expected_shape)
         self.assertEqual(generated_caption, "two cats laying on a pink blanket")
         self.assertTrue(outputs.scores[-1].shape, expected_shape)
-        expected_slice = torch.tensor([-0.8131, -0.8128, -0.8124], device=torch_device)
+        expected_slice = torch.tensor([-0.8126, -0.8123, -0.8119], device=torch_device)
         torch.testing.assert_close(outputs.scores[-1][0, :3], expected_slice, rtol=1e-4, atol=1e-4)
 
     def test_visual_question_answering(self):
@@ -596,7 +600,11 @@ def test_inference_interpolate_pos_encoding(self):
         self.assertEqual(outputs.last_hidden_state.shape, expected_shape)
 
         expected_slice = torch.tensor(
-            [[-1.0296, 2.5960, 0.8703], [1.7027, 1.3302, -0.4543], [-1.4932, -0.1084, 0.0502]]
+            [
+                [-1.0502, 2.5812, 0.8644],
+                [1.6594, 1.2927, -0.4329],
+                [-1.4966, -0.1032, 0.0572],
+            ]
         ).to(torch_device)
 
         torch.testing.assert_close(outputs.last_hidden_state[0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4)
diff --git a/tests/models/hiera/test_modeling_hiera.py b/tests/models/hiera/test_modeling_hiera.py
index db1289388b30..101741788b04 100644
--- a/tests/models/hiera/test_modeling_hiera.py
+++ b/tests/models/hiera/test_modeling_hiera.py
@@ -578,7 +578,11 @@ def test_inference_interpolate_pos_encoding(self):
         self.assertEqual(outputs.last_hidden_state.shape, expected_shape)
 
         expected_slice = torch.tensor(
-            [[1.7853, 0.0690, 0.3177], [2.6853, -0.2334, 0.0889], [1.5445, -0.1515, -0.0300]]
+            [
+                [1.7840, 0.0678, 0.3173],
+                [2.6844, -0.2343, 0.0878],
+                [1.5457, -0.1520, -0.0306],
+            ]
         ).to(torch_device)
 
         torch.testing.assert_close(outputs.last_hidden_state[0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4)
@@ -610,11 +614,11 @@ def test_inference_for_pretraining(self):
 
         expected_slice = torch.tensor(
             [
-                [1.6407, 1.6506, 1.6541, 1.6617, 1.6703],
-                [1.9730, 1.9842, 1.9848, 1.9896, 1.9947],
-                [1.5949, 1.8262, 1.2602, 1.4801, 1.4448],
-                [1.2341, 1.7907, 0.8618, 1.5202, 1.4523],
-                [2.0140, 1.9846, 1.9434, 1.9019, 1.8648],
+                [1.6410, 1.6510, 1.6545, 1.6622, 1.6708],
+                [1.9736, 1.9849, 1.9855, 1.9904, 1.9954],
+                [1.5943, 1.8277, 1.2627, 1.4798, 1.4432],
+                [1.2344, 1.7911, 0.8622, 1.5206, 1.4526],
+                [2.0146, 1.9852, 1.9438, 1.9023, 1.8652],
             ]
         )
 
diff --git a/tests/models/idefics2/test_processing_idefics2.py b/tests/models/idefics2/test_processing_idefics2.py
index 1ad5de01f83c..d411f5d8c5c0 100644
--- a/tests/models/idefics2/test_processing_idefics2.py
+++ b/tests/models/idefics2/test_processing_idefics2.py
@@ -241,7 +241,7 @@ def test_process_interleaved_images_prompts_image_error(self):
         with self.assertRaises(ValueError):
             processor(text=text, images=images, padding=True)
         images = [[], [self.image2]]
-        with self.assertRaises(ValueError):
+        with self.assertRaises((ValueError, IndexError)):
             processor(text=text, images=images, padding=True)
         images = [self.image1, self.image2, self.image3]
         with self.assertRaises(ValueError):
diff --git a/tests/models/idefics3/test_processing_idefics3.py b/tests/models/idefics3/test_processing_idefics3.py
index 0b8482b2b665..875a6297c7cb 100644
--- a/tests/models/idefics3/test_processing_idefics3.py
+++ b/tests/models/idefics3/test_processing_idefics3.py
@@ -285,7 +285,7 @@ def test_process_interleaved_images_prompts_image_error(self):
         with self.assertRaises(ValueError):
             processor(text=text, images=images, padding=True)
         images = [[], [self.image2]]
-        with self.assertRaises(ValueError):
+        with self.assertRaises((ValueError, IndexError)):
             processor(text=text, images=images, padding=True)
         images = [self.image1, self.image2, self.image3]
         with self.assertRaises(ValueError):
diff --git a/tests/models/instructblip/test_modeling_instructblip.py b/tests/models/instructblip/test_modeling_instructblip.py
index 6f54d7335638..831b2bb5f800 100644
--- a/tests/models/instructblip/test_modeling_instructblip.py
+++ b/tests/models/instructblip/test_modeling_instructblip.py
@@ -693,12 +693,14 @@ def test_inference_flant5_xl(self):
             temperature=1,
         )
         generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
-        expected_outputs = [0, 37, 1023, 9850, 7, 3, 9, 388, 3575, 53, 4954, 30, 8, 223, 13, 3, 9, 4459, 4049, 16, 8, 2214, 13, 3, 9, 3164, 690, 2815, 5, 37, 388, 19, 5119, 3, 9, 4459, 8677, 28, 3, 9, 4459, 6177, 6, 11, 3, 88, 19, 338, 46, 3575, 53, 1476, 5223, 12, 8, 223, 13, 8, 4049, 5, 37, 1023, 19, 7225, 16, 24, 34, 1267, 3, 9, 388, 3575, 53, 4954, 30, 8, 223, 13, 3, 9, 4049, 16, 8, 2214, 13, 3, 9, 3164, 690, 2815, 5, 37, 388, 19, 338, 46, 3575, 53, 1476, 5223, 12, 8, 223, 13, 3, 9, 4049, 16, 8, 2214, 13, 3, 9, 3164, 690, 2815, 5, 37, 388, 19, 338, 46, 3575, 53, 1476, 5223, 12, 8, 223, 13, 3, 9, 4049, 16, 8, 2214, 13, 3, 9, 3164, 690, 2815, 5, 37, 1023, 19, 7225, 16, 24, 34, 1267, 3, 9, 388, 3575, 53, 4954, 30, 8, 223, 13, 3, 9, 4049, 16, 8, 2214, 13, 3, 9, 3164, 690, 2815, 5, 37, 388, 19, 338, 46, 3575, 53, 1476, 5223, 12, 8, 223, 13, 3, 9, 4049, 16, 8, 2214, 13, 3, 9, 3164, 690, 2815, 5, 1]  # fmt: skip
+        # fmt: off
+        expected_outputs = [0, 37, 1023, 9850, 7, 3, 9, 388, 3575, 53, 4954, 30, 8, 223, 13, 3, 9, 4459, 4049, 16, 8, 2214, 13, 3, 9, 3164, 690, 2815, 5, 37, 388, 19, 5119, 3, 9, 4459, 8677, 28, 3, 9, 4459, 6177, 6, 11, 3, 88, 19, 3609, 46, 3575, 53, 1476, 16, 80, 609, 11, 3, 9, 10428, 8235, 16, 8, 119, 5, 37, 1023, 19, 7225, 16, 24, 34, 1267, 3, 9, 388, 3575, 53, 4954, 30, 8, 223, 13, 3, 9, 4049, 16, 8, 2214, 13, 3, 9, 3164, 690, 2815, 5, 1]
+        # fmt: on
 
         self.assertEqual(outputs[0].tolist(), expected_outputs)
         self.assertEqual(
             generated_text,
-            "The image depicts a man ironing clothes on the back of a yellow van in the middle of a busy city street. The man is wearing a yellow shirt with a yellow tie, and he is using an ironing board attached to the back of the van. The image is unusual in that it shows a man ironing clothes on the back of a van in the middle of a busy city street. The man is using an ironing board attached to the back of a van in the middle of a busy city street. The man is using an ironing board attached to the back of a van in the middle of a busy city street. The image is unusual in that it shows a man ironing clothes on the back of a van in the middle of a busy city street. The man is using an ironing board attached to the back of a van in the middle of a busy city street.",
+            "The image depicts a man ironing clothes on the back of a yellow van in the middle of a busy city street. The man is wearing a yellow shirt with a yellow tie, and he is holding an ironing board in one hand and a laundry basket in the other. The image is unusual in that it shows a man ironing clothes on the back of a van in the middle of a busy city street.",
         )
 
     def test_inference_interpolate_pos_encoding(self):
diff --git a/tests/models/janus/test_modeling_janus.py b/tests/models/janus/test_modeling_janus.py
index a3544793caaa..cf84c05adfee 100644
--- a/tests/models/janus/test_modeling_janus.py
+++ b/tests/models/janus/test_modeling_janus.py
@@ -18,7 +18,6 @@
 import unittest
 from functools import reduce
 
-import numpy as np
 import pytest
 import requests
 
@@ -538,7 +537,7 @@ def test_model_generate_images(self):
 
         # Decode generated tokens to pixel values and postprocess them.
         decoded_pixel_values = model.decode_image_tokens(out)
-        images = processor.postprocess(list(decoded_pixel_values.float()), return_tensors="np")
+        images = processor.postprocess(list(decoded_pixel_values.float()), return_tensors="pt")
 
-        self.assertTrue(images["pixel_values"].shape == (1, 384, 384, 3))
-        self.assertTrue(isinstance(images["pixel_values"], np.ndarray))
+        self.assertTrue(images["pixel_values"].shape == (1, 3, 384, 384))
+        self.assertTrue(isinstance(images["pixel_values"], torch.Tensor))
diff --git a/tests/models/janus/test_processing_janus.py b/tests/models/janus/test_processing_janus.py
index 9dde25b971ed..9d30dd847b2d 100644
--- a/tests/models/janus/test_processing_janus.py
+++ b/tests/models/janus/test_processing_janus.py
@@ -430,7 +430,7 @@ def test_chat_template_accepts_processing_kwargs(self):
             return_dict=True,
             do_rescale=True,
             rescale_factor=-1.0,
-            return_tensors="np",
+            return_tensors="pt",
         )
         self.assertLessEqual(out_dict[self.images_input_name][0][0].mean(), 0)
 
@@ -442,9 +442,9 @@ def test_processor_postprocess(self):
         orig_image_input = self.prepare_image_inputs()
         orig_image = np.array(orig_image_input).transpose(2, 0, 1)
 
-        inputs = processor(text=input_str, images=orig_image, do_resize=False, do_pad=False, return_tensors="np")
+        inputs = processor(text=input_str, images=orig_image, do_resize=False, do_pad=False, return_tensors="pt")
         normalized_image_input = inputs.pixel_values
-        unnormalized_images = processor.postprocess(normalized_image_input, return_tensors="np")["pixel_values"]
+        unnormalized_images = processor.postprocess(normalized_image_input, return_tensors="pt")["pixel_values"]
 
         # For an image where pixels go from 0 to 255 the diff can be 1 due to some numerical precision errors when scaling and unscaling
-        self.assertTrue(np.abs(orig_image - unnormalized_images).max() >= 1)
+        self.assertTrue(np.abs(orig_image - unnormalized_images.numpy()).max() >= 1)
diff --git a/tests/models/kosmos2/test_processing_kosmos2.py b/tests/models/kosmos2/test_processing_kosmos2.py
index 04388b5159da..abb42e4e02e4 100644
--- a/tests/models/kosmos2/test_processing_kosmos2.py
+++ b/tests/models/kosmos2/test_processing_kosmos2.py
@@ -294,8 +294,8 @@ def check(texts, bboxes, expected_input_ids):
             outputs.image_embeds_position_mask,
             [0] * 2 + [1] * num_image_tokens + [0] + [0] * (len(expected_input_ids[0]) - 1),
         )
-        np.testing.assert_allclose(outputs.pixel_values[0][:3, :3, :3], EXPECTED_PIXEL_VALUES_1, atol=1e-9)
-        np.testing.assert_allclose(outputs.pixel_values[0][:3, -3:, -3:], EXPECTED_PIXEL_VALUES_2, atol=1e-9)
+        np.testing.assert_allclose(outputs.pixel_values[0][:3, :3, :3], EXPECTED_PIXEL_VALUES_1, atol=1e-4)
+        np.testing.assert_allclose(outputs.pixel_values[0][:3, -3:, -3:], EXPECTED_PIXEL_VALUES_2, atol=1e-4)
 
         # test with image in batch (right padding)
         outputs = processor(
@@ -308,10 +308,10 @@ def check(texts, bboxes, expected_input_ids):
         )
         self.assertTupleEqual(outputs.pixel_values.shape, (4, 3, 224, 224))
         np.testing.assert_allclose(
-            outputs.pixel_values[:, :3, :3, :3].numpy(), [EXPECTED_PIXEL_VALUES_1] * len(batch_image), atol=1e-9
+            outputs.pixel_values[:, :3, :3, :3].numpy(), [EXPECTED_PIXEL_VALUES_1] * len(batch_image), atol=1e-4
         )
         np.testing.assert_allclose(
-            outputs.pixel_values[:, :3, -3:, -3:].numpy(), [EXPECTED_PIXEL_VALUES_2] * len(batch_image), atol=1e-9
+            outputs.pixel_values[:, :3, -3:, -3:].numpy(), [EXPECTED_PIXEL_VALUES_2] * len(batch_image), atol=1e-4
         )
         # padding on the right: the `[1:]` below is because the part for `BOS` is already added in the beginning of each (dynamically computed) expected value  # noqa
         # fmt: off
diff --git a/tests/models/kosmos2_5/test_modeling_kosmos2_5.py b/tests/models/kosmos2_5/test_modeling_kosmos2_5.py
index d0f7a73fc24e..ebaa95553cc2 100644
--- a/tests/models/kosmos2_5/test_modeling_kosmos2_5.py
+++ b/tests/models/kosmos2_5/test_modeling_kosmos2_5.py
@@ -564,12 +564,9 @@ def test_eager(self):
         prompt = "<ocr>"
         generated_ids, generated_text = self.run_example(prompt, image, model, processor)
         EXPECTED_TEXT = {
-            7: [
-                "<bbox><x_53><y_573><x_69><y_606></bbox>1\n<bbox><x_79><y_573><x_464><y_611></bbox>[REG] BLACK SAKURA\n<bbox><x_690><y_569><x_810><y_606></bbox>45,455\n<bbox><x_53><y_614><x_69><y_648></bbox>1\n<bbox><x_79><y_614><x_468><y_651></bbox>COOKIE DOH SAUCES\n<bbox><x_788><y_609><x_812><y_642></bbox>0\n<bbox><x_50><y_658><x_69><y_693></bbox>1\n<bbox><x_79><y_658><x_358><y_693></bbox>NATA DE COCO\n<bbox><x_790><y_652><x_814><y_683></bbox>0\n<bbox><x_31><y_742><x_820><y_781></bbox>Sub Total 45,455\n<bbox><x_27><y_781><x_822><y_827></bbox>PB1 (10%) 4,545\n<bbox><x_27><y_826><x_824><y_872></bbox>Rounding 0\n<bbox><x_24><y_872><x_827><y_921></bbox>Total 50,000\n<bbox><x_17><y_1056><x_836><y_1108></bbox>Card Payment 50,000\n"
-            ],
             8: [
-                "<bbox><x_53><y_573><x_69><y_606></bbox>1\n<bbox><x_79><y_573><x_464><y_611></bbox>[REG] BLACK SAKURA\n<bbox><x_690><y_569><x_810><y_606></bbox>45,455\n<bbox><x_53><y_614><x_69><y_648></bbox>1\n<bbox><x_79><y_614><x_468><y_650></bbox>COOKIE DOH SAUCES\n<bbox><x_788><y_609><x_812><y_644></bbox>0\n<bbox><x_50><y_658><x_69><y_693></bbox>1\n<bbox><x_79><y_658><x_358><y_693></bbox>NATA DE COCO\n<bbox><x_790><y_652><x_814><y_687></bbox>0\n<bbox><x_31><y_742><x_820><y_781></bbox>Sub Total 45,455\n<bbox><x_27><y_781><x_822><y_827></bbox>PB1 (10%) 4,545\n<bbox><x_27><y_826><x_824><y_872></bbox>Rounding 0\n<bbox><x_24><y_872><x_827><y_921></bbox>Total 50,000\n<bbox><x_17><y_1056><x_836><y_1108></bbox>Card Payment 50,000\n"
-            ],
+                "<bbox><x_53><y_573><x_69><y_606></bbox>1\n<bbox><x_79><y_573><x_464><y_611></bbox>[REG] BLACK SAKURA\n<bbox><x_690><y_569><x_810><y_606></bbox>45,455\n<bbox><x_53><y_614><x_69><y_648></bbox>1\n<bbox><x_79><y_614><x_468><y_651></bbox>COOKIE DOH SAUCES\n<bbox><x_788><y_609><x_812><y_642></bbox>0\n<bbox><x_50><y_658><x_69><y_693></bbox>1\n<bbox><x_79><y_658><x_358><y_693></bbox>NATA DE COCO\n<bbox><x_790><y_652><x_814><y_683></bbox>0\n<bbox><x_31><y_742><x_820><y_781></bbox>Sub Total 45,455\n<bbox><x_27><y_781><x_822><y_827></bbox>PB1 (10%) 4,545\n<bbox><x_27><y_826><x_824><y_872></bbox>Rounding 0\n<bbox><x_24><y_872><x_827><y_921></bbox>Total 50,000\n<bbox><x_17><y_1056><x_836><y_1108></bbox>Card Payment 50,000\n"
+            ]
         }
 
         self.assertListEqual(generated_text, EXPECTED_TEXT[self.cuda_compute_capability_major_version])
@@ -578,9 +575,6 @@ def test_eager(self):
         generated_ids, generated_text = self.run_example(prompt, image, model, processor)
 
         EXPECTED_TEXT = {
-            7: [
-                "- **1 \\[REG\\] BLACK SAKURA** 45,455\n- **1 COOKIE DOH SAUCES** 0\n- **1 NATA DE COCO** 0\n- **Sub Total** 45,455\n- **PB1 (10%)** 4,545\n- **Rounding** 0\n- **Total** **50,000**\n\nCard Payment 50,000"
-            ],
             8: [
                 "- **1 \\[REG\\] BLACK SAKURA** 45,455\n- **1 COOKIE DOH SAUCES** 0\n- **1 NATA DE COCO** 0\n- **Sub Total** 45,455\n- **PB1 (10%)** 4,545\n- **Rounding** 0\n- **Total** **50,000**\n\nCard Payment 50,000"
             ],
diff --git a/tests/models/kosmos2_5/test_processor_kosmos2_5.py b/tests/models/kosmos2_5/test_processor_kosmos2_5.py
index bb1ba1f3aa3e..4fa8774ab97f 100644
--- a/tests/models/kosmos2_5/test_processor_kosmos2_5.py
+++ b/tests/models/kosmos2_5/test_processor_kosmos2_5.py
@@ -296,12 +296,12 @@ def test_full_processor(self):
         np.testing.assert_allclose(
             outputs.flattened_patches[0][1][:10].numpy().tolist(),
             EXPECTED_FP_1,
-            atol=1e-9,
+            atol=1e-4,
         )
         np.testing.assert_allclose(
             outputs.flattened_patches[0][200][:10].numpy().tolist(),
             EXPECTED_FP_200,
-            atol=1e-9,
+            atol=1e-4,
         )
 
         # test a batch of images and texts, right padding
@@ -322,10 +322,10 @@ def test_full_processor(self):
         np.testing.assert_allclose(
             outputs.flattened_patches[1][1][:10].numpy().tolist(),
             EXPECTED_FP_1,
-            atol=1e-9,
+            atol=1e-4,
         )
         np.testing.assert_allclose(
             outputs.flattened_patches[1][200][:10].numpy().tolist(),
             EXPECTED_FP_200,
-            atol=1e-9,
+            atol=1e-4,
         )
diff --git a/tests/models/layoutxlm/test_processing_layoutxlm.py b/tests/models/layoutxlm/test_processing_layoutxlm.py
index caf591bb6f4a..fdb767f59488 100644
--- a/tests/models/layoutxlm/test_processing_layoutxlm.py
+++ b/tests/models/layoutxlm/test_processing_layoutxlm.py
@@ -21,9 +21,10 @@
     require_sentencepiece,
     require_tokenizers,
     require_torch,
+    require_torchvision,
     slow,
 )
-from transformers.utils import is_pytesseract_available
+from transformers.utils import is_pytesseract_available, is_torchvision_available
 
 from ...test_processing_common import ProcessorTesterMixin
 
@@ -31,6 +32,9 @@
 if is_pytesseract_available():
     from transformers import LayoutLMv2ImageProcessor
 
+if is_torchvision_available():
+    from transformers import LayoutLMv2ImageProcessorFast
+
 
 @require_pytesseract
 @require_sentencepiece
@@ -62,11 +66,12 @@ def test_image_processor_defaults(self):
     def test_processor_with_multiple_inputs(self):
         pass
 
+    @require_torchvision
     def test_save_load_pretrained_additional_features(self):
         processor = self.get_processor()
-        # slow tokenizer
+        # slow tokenizer and image processor
         tokenizer_add_kwargs = self.get_component("tokenizer", bos_token="(BOS)", eos_token="(EOS)")
-        image_processor_add_kwargs = self.get_component("image_processor", do_resize=False, size=30)
+        image_processor_add_kwargs = self.get_component("image_processor", do_resize=False, size=30, use_fast=False)
 
         processor = LayoutXLMProcessor.from_pretrained(
             self.tmpdirname,
@@ -83,7 +88,7 @@ def test_save_load_pretrained_additional_features(self):
         self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
         self.assertIsInstance(processor.image_processor, LayoutLMv2ImageProcessor)
 
-        # fast tokenizer
+        # fast tokenizer and image processor
         tokenizer_add_kwargs = self.get_component("tokenizer", bos_token="(BOS)", eos_token="(EOS)")
         image_processor_add_kwargs = self.get_component("image_processor", do_resize=False, size=30)
 
@@ -95,7 +100,7 @@ def test_save_load_pretrained_additional_features(self):
         self.assertIsInstance(processor.tokenizer, LayoutXLMTokenizerFast)
 
         self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
-        self.assertIsInstance(processor.image_processor, LayoutLMv2ImageProcessor)
+        self.assertIsInstance(processor.image_processor, LayoutLMv2ImageProcessorFast)
 
     @slow
     def test_overflowing_tokens(self):
diff --git a/tests/models/lightglue/test_modeling_lightglue.py b/tests/models/lightglue/test_modeling_lightglue.py
index 697f63b82717..256d2d9a1577 100644
--- a/tests/models/lightglue/test_modeling_lightglue.py
+++ b/tests/models/lightglue/test_modeling_lightglue.py
@@ -497,63 +497,25 @@ def test_inference_without_early_stop_and_keypoint_pruning(self):
         predicted_matches_values1 = outputs.matches[1, 0, 10:30]
         predicted_matching_scores_values1 = outputs.matching_scores[1, 0, 10:30]
 
-        expected_number_of_matches0 = 144
+        expected_number_of_matches0 = 143
         expected_matches_values0 = torch.tensor(
-            [-1, -1, 17, -1, -1, 13, -1, -1, -1, -1, -1, -1, 5, -1, -1, 19, -1, 10, -1, 11], dtype=torch.int64
+            [-1, -1, -1, -1, 17, 13, -1, -1, -1, -1, -1, -1, 5, -1, -1, 19, -1, 10, -1, 11], dtype=torch.int64
         ).to(torch_device)
+        # fmt: off
         expected_matching_scores_values0 = torch.tensor(
-            [
-                0.0699,
-                0.0302,
-                0.3356,
-                0.0820,
-                0,
-                0.2266,
-                0,
-                0,
-                0.0241,
-                0,
-                0,
-                0,
-                0.1674,
-                0,
-                0,
-                0.8114,
-                0,
-                0.8120,
-                0,
-                0.2936,
-            ]
+            [0.0696, 0.0283, 0.0000, 0.0863, 0.2834, 0.2308, 0.0000, 0.0000, 0.0189, 0.0000, 0.0000, 0.0000, 0.1792, 0.0000, 0.0000, 0.8197, 0.0000, 0.8194, 0.0000, 0.3058]
         ).to(torch_device)
+        # fmt: on
 
         expected_number_of_matches1 = 862
         expected_matches_values1 = torch.tensor(
             [10, 11, -1, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, -1, 26, -1, 28, 29], dtype=torch.int64
         ).to(torch_device)
+        # fmt: off
         expected_matching_scores_values1 = torch.tensor(
-            [
-                0.4772,
-                0.3781,
-                0.0631,
-                0.9559,
-                0.8746,
-                0.9271,
-                0.4882,
-                0.5406,
-                0.9439,
-                0.1526,
-                0.5028,
-                0.4107,
-                0.5591,
-                0.9130,
-                0.7572,
-                0.0302,
-                0.4532,
-                0.0893,
-                0.9490,
-                0.4880,
-            ]
+            [0.4744, 0.3749, 0.0628, 0.9572, 0.8744, 0.9277, 0.4843, 0.5365, 0.9441, 0.1519, 0.5004, 0.4058, 0.5569, 0.9113, 0.7525, 0.0301, 0.4510, 0.0892, 0.9483, 0.4815]
         ).to(torch_device)
+        # fmt: on
 
         # expected_early_stopping_layer = 2
         # predicted_early_stopping_layer = torch.max(outputs.prune[1]).item()
diff --git a/tests/models/llava/test_modeling_llava.py b/tests/models/llava/test_modeling_llava.py
index 01f345fb2f11..3f548af2dfd9 100644
--- a/tests/models/llava/test_modeling_llava.py
+++ b/tests/models/llava/test_modeling_llava.py
@@ -643,14 +643,7 @@ def test_pixtral(self):
         output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
 
         # fmt: off
-        EXPECTED_GENERATION = """
-Describe the images.
-The first image shows a black dog sitting on a wooden surface. The dog has a glossy coat and is looking directly at the camera with a calm expression. The wooden background appears to be made of weathered wooden planks, giving the image a rustic feel.
-
-The second image depicts a scenic mountain landscape. The mountains are rugged and covered with patches of green vegetation. The sky is clear, and the scene conveys a sense of tranquility and natural beauty. The mountains extend into the
-"""
-        # Remove the first and last empty character.
-        EXPECTED_GENERATION = EXPECTED_GENERATION[1:-1]
+        EXPECTED_GENERATION = "Describe the images.\nThe image depicts a black dog sitting on a wooden surface. The dog has a glossy black coat and is looking directly at the camera with a calm and attentive expression. The wooden background consists of horizontal wooden planks, giving the image a rustic and warm feel. The lighting is soft, highlighting the dog's features and creating a cozy atmosphere. The overall composition is simple and focuses on the dog as the main subject."
         # fmt: on
         # check that both inputs are handled correctly and generate the same output
         self.assertEqual(output, EXPECTED_GENERATION)
@@ -718,13 +711,13 @@ def test_pixtral_batched(self):
         EXPECTED_GENERATIONS = Expectations(
             {
                 (None, None): [
-                                'What breed is the dog?The dog in the image is a black Labrador Retriever.',
-                                'What is shown in this image?The image depicts a narrow, winding dirt path surrounded by lush greenery. The path is flanked by grass and shrubs on both sides. On the left side, there are tall trees and dense foliage, while on the right side, there'
-                            ],
+                    "What breed is the dog?The dog in the image is a black Labrador Retriever.",
+                    "What is shown in this image?The image depicts a narrow, winding dirt path surrounded by lush greenery. The path is bordered by grass and shrubs on both sides. On the left side, there are tall trees and dense foliage, while on the right side, there"
+                ],
                 ("rocm", (9, 5)): [
-                                'What breed is the dog?The dog in the image is a black Labrador Retriever.',
-                                'What is shown in this image?A dirt path stretches into the distance, flanked by grassy areas on either side. The path appears to be well-trodden and leads towards a wooded area with tall trees. The sky is clear and blue, suggesting a bright and sunny day'
-                            ],
+                    'What breed is the dog?The dog in the image is a black Labrador Retriever.',
+                    'What is shown in this image?A dirt path stretches into the distance, flanked by grassy areas on either side. The path appears to be well-trodden and leads towards a wooded area with tall trees. The sky is clear and blue, suggesting a bright and sunny day'
+                ],
             }
         )  # fmt: skip
 
diff --git a/tests/models/metaclip_2/test_modeling_metaclip_2.py b/tests/models/metaclip_2/test_modeling_metaclip_2.py
index 575bbd28a0cc..825326ac228c 100644
--- a/tests/models/metaclip_2/test_modeling_metaclip_2.py
+++ b/tests/models/metaclip_2/test_modeling_metaclip_2.py
@@ -699,6 +699,6 @@ def test_inference(self):
             torch.Size((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])),
         )
 
-        expected_logits = torch.tensor([[19.9799, 13.6169]], device=torch_device)
+        expected_logits = torch.tensor([[19.9531, 13.5910]], device=torch_device)
 
         torch.testing.assert_close(outputs.logits_per_image, expected_logits, rtol=1e-3, atol=1e-3)
diff --git a/tests/models/mlcd/test_modeling_mlcd.py b/tests/models/mlcd/test_modeling_mlcd.py
index 4c93b3c49aaf..8bb8a4aebc55 100644
--- a/tests/models/mlcd/test_modeling_mlcd.py
+++ b/tests/models/mlcd/test_modeling_mlcd.py
@@ -183,21 +183,21 @@ def test_inference(self):
         # fmt: off
         expected_partial_5x5_last_hidden_state = torch.tensor(
             [
-                [-0.8978, -0.1181,  0.4769,  0.4761, -0.5779],
-                [ 0.2640, -2.6150,  0.4853,  0.5743, -1.1003],
-                [ 0.3314, -0.3328, -0.4305, -0.1874, -0.7701],
-                [-1.5174, -1.0238, -1.1854,  0.1749, -0.8786],
-                [ 0.2323, -0.8346, -0.9680, -0.2951,  0.0867],
+                [-0.8976, -0.1173, 0.4770, 0.4768, -0.5785],
+                [0.2828, -2.6036, 0.4997, 0.5538, -1.0822],
+                [0.3285, -0.3092, -0.4157, -0.1794, -0.7793],
+                [-1.5005, -1.0548, -1.2262, 0.2269, -0.9054],
+                [0.2317, -0.8372, -0.9653, -0.3017, 0.0871],
             ]
         ).to(torch_device)
 
         expected_partial_5x5_last_attention = torch.tensor(
             [
-                [2.0930e-01, 6.3073e-05, 1.4717e-03, 2.6881e-05, 3.0513e-03],
-                [1.5828e-04, 2.1056e-03, 4.6784e-04, 1.8276e-03, 5.3233e-04],
-                [5.7824e-04, 1.1446e-03, 1.3854e-03, 1.1775e-03, 1.2750e-03],
-                [9.6343e-05, 1.6365e-03, 2.9066e-04, 3.1089e-03, 2.0607e-04],
-                [6.2688e-04, 1.1656e-03, 1.5030e-03, 8.2819e-04, 2.6992e-03],
+                [2.0956e-01, 6.4854e-05, 1.5120e-03, 2.6588e-05, 3.0168e-03],
+                [1.6095e-04, 2.0924e-03, 4.7327e-04, 1.8991e-03, 5.4792e-04],
+                [5.8087e-04, 1.1215e-03, 1.3588e-03, 1.1862e-03, 1.2509e-03],
+                [9.5609e-05, 1.6015e-03, 2.8401e-04, 3.2878e-03, 2.0171e-04],
+                [6.2485e-04, 1.1751e-03, 1.4737e-03, 8.2471e-04, 2.6918e-03],
             ]
         ).to(torch_device)
         # fmt: on
diff --git a/tests/models/oneformer/test_processing_oneformer.py b/tests/models/oneformer/test_processing_oneformer.py
index fbae54699727..e7b11724f635 100644
--- a/tests/models/oneformer/test_processing_oneformer.py
+++ b/tests/models/oneformer/test_processing_oneformer.py
@@ -22,8 +22,13 @@
 from datasets import load_dataset
 from huggingface_hub import hf_hub_download
 
-from transformers.testing_utils import check_json_file_has_correct_format, require_torch, require_vision
-from transformers.utils import is_torch_available, is_vision_available
+from transformers.testing_utils import (
+    check_json_file_has_correct_format,
+    require_torch,
+    require_torchvision,
+    require_vision,
+)
+from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
 
 from ...test_image_processing_common import prepare_image_inputs
 
@@ -36,6 +41,9 @@
         from transformers.models.oneformer.image_processing_oneformer import binary_mask_to_rle
         from transformers.models.oneformer.modeling_oneformer import OneFormerForUniversalSegmentationOutput
 
+    if is_torchvision_available():
+        from transformers.models.oneformer.image_processing_oneformer_fast import OneFormerImageProcessorFast
+
 if is_vision_available():
     from PIL import Image
 
@@ -122,7 +130,7 @@ def prepare_processor_dict(self):
             "num_text": self.num_text,
         }
 
-        image_processor = OneFormerImageProcessor(**image_processor_dict)
+        image_processor = OneFormerImageProcessorFast(**image_processor_dict)
         tokenizer = CLIPTokenizer.from_pretrained(self.model_repo)
 
         return {
@@ -392,7 +400,6 @@ def comm_get_processor_inputs(self, with_segmentation_maps=False, is_instance_ma
             annotations,
             return_tensors="pt",
             instance_id_to_semantic_id=instance_id_to_semantic_id,
-            pad_and_return_pixel_mask=True,
         )
 
         return inputs
@@ -401,6 +408,7 @@ def comm_get_processor_inputs(self, with_segmentation_maps=False, is_instance_ma
     def test_init_without_params(self):
         pass
 
+    @require_torchvision
     def test_feat_extract_from_and_save_pretrained(self):
         feat_extract_first = self.feature_extraction_class(**self.processor_dict)
 
@@ -410,7 +418,7 @@ def test_feat_extract_from_and_save_pretrained(self):
             feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname)
 
         self.assertEqual(feat_extract_second.image_processor.to_dict(), feat_extract_first.image_processor.to_dict())
-        self.assertIsInstance(feat_extract_first.image_processor, OneFormerImageProcessor)
+        self.assertIsInstance(feat_extract_first.image_processor, OneFormerImageProcessorFast)
         self.assertIsInstance(feat_extract_first.tokenizer, CLIPTokenizer)
 
     def test_call_with_segmentation_maps(self):
diff --git a/tests/models/owlv2/test_image_processing_owlv2.py b/tests/models/owlv2/test_image_processing_owlv2.py
index eef7d5771522..4c4670bbe8fa 100644
--- a/tests/models/owlv2/test_image_processing_owlv2.py
+++ b/tests/models/owlv2/test_image_processing_owlv2.py
@@ -151,7 +151,7 @@ def test_image_processor_integration_test_resize(self):
             with torch.no_grad():
                 outputs = model(**inputs)
 
-            results = processor.image_processor.post_process_object_detection(
+            results = processor.post_process_grounded_object_detection(
                 outputs, threshold=0.2, target_sizes=[target_size]
             )[0]
 
@@ -162,7 +162,7 @@ def test_image_processor_integration_test_resize(self):
             inputs = processor(text=[text, text], images=[image, image], return_tensors="pt")
             with torch.no_grad():
                 outputs = model(**inputs)
-            results = processor.image_processor.post_process_object_detection(
+            results = processor.post_process_grounded_object_detection(
                 outputs, threshold=0.2, target_sizes=[target_size, target_size]
             )
 
diff --git a/tests/models/sam/test_modeling_sam.py b/tests/models/sam/test_modeling_sam.py
index 4286f8c2bee0..d80d5429fb8a 100644
--- a/tests/models/sam/test_modeling_sam.py
+++ b/tests/models/sam/test_modeling_sam.py
@@ -740,7 +740,7 @@ def test_inference_mask_generation_no_point(self):
         scores = outputs.iou_scores.squeeze().cpu()
         masks = outputs.pred_masks[0, 0, 0, 0, :3].cpu()
         torch.testing.assert_close(scores[-1], torch.tensor(0.4515), rtol=2e-4, atol=2e-4)
-        torch.testing.assert_close(masks, torch.tensor([-4.1800, -3.4948, -3.4481]), rtol=2e-4, atol=2e-4)
+        torch.testing.assert_close(masks, torch.tensor([-4.1795, -3.4934, -3.4477]), rtol=2e-4, atol=2e-4)
 
     def test_inference_mask_generation_one_point_one_bb(self):
         model = SamModel.from_pretrained("facebook/sam-vit-base")
diff --git a/tests/models/siglip/test_modeling_siglip.py b/tests/models/siglip/test_modeling_siglip.py
index dd3a3d56346b..6dda8e6fb689 100644
--- a/tests/models/siglip/test_modeling_siglip.py
+++ b/tests/models/siglip/test_modeling_siglip.py
@@ -608,7 +608,7 @@ def test_inference(self):
             torch.Size((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])),
         )
 
-        expected_logits = torch.tensor([[-0.7567, -10.3354]], device=torch_device)
+        expected_logits = torch.tensor([[-0.7538, -10.3387]], device=torch_device)
 
         torch.testing.assert_close(outputs.logits_per_image, expected_logits, rtol=1e-3, atol=1e-3)
 
diff --git a/tests/models/smolvlm/test_processing_smolvlm.py b/tests/models/smolvlm/test_processing_smolvlm.py
index 015c9bd70a30..cca33a631f3d 100644
--- a/tests/models/smolvlm/test_processing_smolvlm.py
+++ b/tests/models/smolvlm/test_processing_smolvlm.py
@@ -296,7 +296,7 @@ def test_process_interleaved_images_prompts_image_error(self):
         with self.assertRaises(ValueError):
             processor(text=text, images=images, padding=True)
         images = [[], [self.image2]]
-        with self.assertRaises(ValueError):
+        with self.assertRaises((ValueError, IndexError)):
             processor(text=text, images=images, padding=True)
         images = [self.image1, self.image2, self.image3]
         with self.assertRaises(ValueError):
diff --git a/tests/models/superglue/test_modeling_superglue.py b/tests/models/superglue/test_modeling_superglue.py
index fbc37d2b3078..9f1504944b8b 100644
--- a/tests/models/superglue/test_modeling_superglue.py
+++ b/tests/models/superglue/test_modeling_superglue.py
@@ -392,14 +392,15 @@ def test_inference(self):
         predicted_matches_values = outputs.matches[0, 0, :30]
         predicted_matching_scores_values = outputs.matching_scores[0, 0, :20]
 
-        expected_number_of_matches = 282
-        expected_matches_values = torch.tensor([125,630,137,138,136,143,135,-1,-1,153,
-                                                154,156,117,160,-1,149,147,152,168,-1,
-                                                165,182,-1,190,187,188,189,112,-1,193],
-                                                device=predicted_matches_values.device)  # fmt:skip
-        expected_matching_scores_values = torch.tensor([0.9899,0.0033,0.9897,0.9889,0.9879,0.7464,0.7109,0.0,0.0,0.9841,
-                                                        0.9889,0.9639,0.0114,0.9559,0.0,0.9735,0.8018,0.5190,0.9157,0.0],
-                                                        device=predicted_matches_values.device)  # fmt:skip
+        expected_number_of_matches = 278
+        expected_matches_values = torch.tensor(
+            [125, 631, 137, 138, 136, 143, 135,  -1,  -1, 153, 154, 156, 117, 160, -1, 149, 147, 152, 168,  -1, 165, 182,  -1, 190, 187, 188, 189, 112, -1, 193],
+            device=predicted_matches_values.device
+        )  # fmt:skip
+        expected_matching_scores_values = torch.tensor(
+            [0.9899, 0.0553, 0.9897, 0.9889, 0.9879, 0.7557, 0.7155, 0.0000, 0.0000, 0.9840, 0.9889, 0.9644, 0.0102, 0.9557, 0.0000, 0.9737, 0.8050, 0.5204, 0.9167, 0.0000],
+            device=predicted_matches_values.device
+        )  # fmt:skip
 
         """
         Because of inconsistencies introduced between CUDA versions, the checks here are less strict. SuperGlue relies
diff --git a/tests/models/swin/test_modeling_swin.py b/tests/models/swin/test_modeling_swin.py
index 825c1aabfd48..f73aa3bfa412 100644
--- a/tests/models/swin/test_modeling_swin.py
+++ b/tests/models/swin/test_modeling_swin.py
@@ -469,7 +469,7 @@ def test_inference_image_classification_head(self):
         # verify the logits
         expected_shape = torch.Size((1, 1000))
         self.assertEqual(outputs.logits.shape, expected_shape)
-        expected_slice = torch.tensor([-0.0948, -0.6454, -0.0921]).to(torch_device)
+        expected_slice = torch.tensor([-0.0970, -0.6469, -0.0927]).to(torch_device)
         torch.testing.assert_close(outputs.logits[0, :3], expected_slice, rtol=1e-4, atol=1e-4)
 
     @slow
diff --git a/tests/models/swinv2/test_modeling_swinv2.py b/tests/models/swinv2/test_modeling_swinv2.py
index 80498f946535..dc3f67aeae42 100644
--- a/tests/models/swinv2/test_modeling_swinv2.py
+++ b/tests/models/swinv2/test_modeling_swinv2.py
@@ -469,7 +469,7 @@ def test_inference_image_classification_head(self):
         # verify the logits
         expected_shape = torch.Size((1, 1000))
         self.assertEqual(outputs.logits.shape, expected_shape)
-        expected_slice = torch.tensor([-0.3947, -0.4306, 0.0026]).to(torch_device)
+        expected_slice = torch.tensor([-0.3951, -0.4292, 0.0025]).to(torch_device)
         torch.testing.assert_close(outputs.logits[0, :3], expected_slice, rtol=1e-4, atol=1e-4)
 
     @slow
diff --git a/tests/models/vilt/test_modeling_vilt.py b/tests/models/vilt/test_modeling_vilt.py
index 39fc3b4bf4bb..bd05a727efc7 100644
--- a/tests/models/vilt/test_modeling_vilt.py
+++ b/tests/models/vilt/test_modeling_vilt.py
@@ -655,7 +655,7 @@ def test_inference_natural_language_visual_reasoning(self):
             )
         else:
             expected_slice = torch.tensor(
-                [-2.3713, 2.9168],
+                [-2.3694, 2.9153],
                 device=torch_device,
             )
 
diff --git a/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py
index 466817b5afb1..07d0bbdfded2 100644
--- a/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py
+++ b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py
@@ -1185,7 +1185,7 @@ def test_inference_printed(self):
             )
         else:
             expected_slice = torch.tensor(
-                [-5.6844, -5.8372, 1.1518, -6.8984, 6.8587, -2.4453, 1.2347, -1.0241, -1.9649, -3.9109],
+                [-5.6832, -5.8361, 1.1500, -6.8975, 6.8576, -2.4450, 1.2335, -1.0246, -1.9654, -3.9127],
                 device=torch_device,
             )
 
diff --git a/tests/models/yolos/test_modeling_yolos.py b/tests/models/yolos/test_modeling_yolos.py
index fd558551a65f..ec2820ee7d45 100644
--- a/tests/models/yolos/test_modeling_yolos.py
+++ b/tests/models/yolos/test_modeling_yolos.py
@@ -354,11 +354,20 @@ def test_inference_object_detection_head(self):
         self.assertEqual(outputs.logits.shape, expected_shape)
 
         expected_slice_logits = torch.tensor(
-            [[-23.7219, -10.3165, -14.9083], [-41.5429, -15.2403, -24.1478], [-29.3909, -12.7173, -19.4650]],
+            [
+                [-23.7215, -10.3157, -14.9062],
+                [-41.5453, -15.2413, -24.1479],
+                [-29.4172, -12.7263, -19.4834],
+            ],
             device=torch_device,
         )
         expected_slice_boxes = torch.tensor(
-            [[0.2536, 0.5449, 0.4643], [0.2037, 0.7735, 0.3672], [0.7692, 0.4056, 0.4549]], device=torch_device
+            [
+                [0.2536, 0.5449, 0.4643],
+                [0.2038, 0.7735, 0.3670],
+                [0.7692, 0.4056, 0.4549],
+            ],
+            device=torch_device,
         )
         torch.testing.assert_close(outputs.logits[0, :3, :3], expected_slice_logits, rtol=1e-4, atol=1e-4)
         torch.testing.assert_close(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, rtol=1e-4, atol=1e-4)
@@ -367,11 +376,11 @@ def test_inference_object_detection_head(self):
         results = image_processor.post_process_object_detection(
             outputs, threshold=0.3, target_sizes=[image.size[::-1]]
         )[0]
-        expected_scores = torch.tensor([0.9991, 0.9801, 0.9978, 0.9875, 0.9848]).to(torch_device)
+        expected_scores = torch.tensor([0.9991, 0.9801, 0.9977, 0.9875, 0.9848]).to(torch_device)
         expected_labels = [75, 75, 17, 63, 17]
         expected_slice_boxes = torch.tensor([331.8438, 80.5440, 369.9546, 188.0579]).to(torch_device)
 
         self.assertEqual(len(results["scores"]), 5)
         torch.testing.assert_close(results["scores"], expected_scores, rtol=1e-4, atol=1e-4)
         self.assertSequenceEqual(results["labels"].tolist(), expected_labels)
-        torch.testing.assert_close(results["boxes"][0, :], expected_slice_boxes)
+        torch.testing.assert_close(results["boxes"][0, :], expected_slice_boxes, rtol=1e-4, atol=1e-4)
diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py
index dbf5b7156c5a..4bb67ef472ad 100644
--- a/tests/test_processing_common.py
+++ b/tests/test_processing_common.py
@@ -1759,7 +1759,7 @@ def test_chat_template_audio_from_video(self):
             add_generation_prompt=True,
             tokenize=True,
             return_dict=True,
-            return_tensors="np",
+            return_tensors="pt",
             load_audio_from_video=True,
         )
         self.assertTrue(self.audio_input_name in out_dict)