From a9a5539bb83b5a76cb3b2ab513a8addf7bdd6a6e Mon Sep 17 00:00:00 2001 From: RUFFY-369 Date: Sun, 4 Aug 2024 13:49:39 +0530 Subject: [PATCH 01/11] chore:add func and classes to get vid clips from user given paths --- .../imagebind/image_processing_imagebind.py | 188 +++++++++++++++++- .../models/imagebind/processing_imagebind.py | 2 +- 2 files changed, 188 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/imagebind/image_processing_imagebind.py b/src/transformers/models/imagebind/image_processing_imagebind.py index ed20d8fa9e76..242553357e09 100644 --- a/src/transformers/models/imagebind/image_processing_imagebind.py +++ b/src/transformers/models/imagebind/image_processing_imagebind.py @@ -13,8 +13,15 @@ # limitations under the License. """Image processor class for ImageBind.""" +import decord from fractions import Fraction -from typing import Dict, List, Optional, Tuple, Union +import io +import math +import mimetypes +import pathlib +from pathlib import Path +import torch +from typing import BinaryIO, Dict, List, Optional, Tuple, Union import numpy as np @@ -50,6 +57,35 @@ if is_vision_available(): import PIL +def check_for_video_paths(videos) -> bool: + return (isinstance(videos, list) and all(isinstance(video, Path) and mimetypes.guess_type(video)[0].startswith('video/') for video in videos)) + +#Adapted from https://github.com/facebookresearch/pytorchvideo/blob/1fadaef40dd393ca09680f55582399f4679fc9b7/pytorchvideo/data/encoded_video.py#L42 +def encoded_video_from_path(video_path): + """ + Fetches the given video path using PathManager (allowing remote uris to be + fetched) and constructs the EncodedVideo object. + + Args: + file_path (str): a PathManager file-path. + """ + video_path = Path(video_path) + if video_path.is_file(): + with video_path.open('rb') as file: + video_file = io.BytesIO(file.read()) + else: + raise FileNotFoundError(f"{video_path} does not exist or is not a file") + + sample_rate=16000 + video = EncodedVideoDecord( + file=video_file, + video_name=pathlib.Path(video_path).name, + decode_video=True, + decode_audio=False, + **{"sample_rate": sample_rate}, + ) + return video + # Copy from models.video_llava.image_processing_video_llava.make_batched_videos def make_batched_videos(videos) -> List[VideoInput]: @@ -117,6 +153,148 @@ def uniform_temporal_subsample(video: VideoInput, num_samples: int) -> VideoInpu return [video[i] for i in indices] +#Adapted from https://github.com/facebookresearch/pytorchvideo/blob/1fadaef40dd393ca09680f55582399f4679fc9b7/pytorchvideo/data/encoded_video_decord.py#L28 +class EncodedVideoDecord(): + """ + + Accessing clips from an encoded video using Decord video reading API + as the decoding backend. For more details, please refer to - + `Decord ` + """ + + def __init__( + self, + file: BinaryIO, + video_name: Optional[str] = None, + decode_video: bool = True, + decode_audio: bool = False, + sample_rate: int = 44100, + mono: bool = True, + width: int = -1, + height: int = -1, + num_threads: int = 0, + fault_tol: int = -1, + ) -> None: + """ + Args: + file (BinaryIO): a file-like object (e.g. io.BytesIO or io.StringIO) that + contains the encoded video. + video_name (str): An optional name assigned to the video. + decode_video (bool): If disabled, video is not decoded. + decode_audio (bool): If disabled, audio is not decoded. + sample_rate: int, default is -1 + Desired output sample rate of the audio, unchanged if `-1` is specified. + mono: bool, default is True + Desired output channel layout of the audio. 
`True` is mono layout. `False` + is unchanged. + width : int, default is -1 + Desired output width of the video, unchanged if `-1` is specified. + height : int, default is -1 + Desired output height of the video, unchanged if `-1` is specified. + num_threads : int, default is 0 + Number of decoding thread, auto if `0` is specified. + fault_tol : int, default is -1 + The threshold of corrupted and recovered frames. This is to prevent silent fault + tolerance when for example 50% frames of a video cannot be decoded and duplicate + frames are returned. You may find the fault tolerant feature sweet in many + cases, but not for training models. Say `N = # recovered frames` + If `fault_tol` < 0, nothing will happen. + If 0 < `fault_tol` < 1.0, if N > `fault_tol * len(video)`, + raise `DECORDLimitReachedError`. + If 1 < `fault_tol`, if N > `fault_tol`, raise `DECORDLimitReachedError`. + """ + if not decode_video: + raise NotImplementedError() + + self._video_name = video_name + + try: + self._av_reader = decord.VideoReader( + uri=file, + ctx=decord.cpu(0), + width=width, + height=height, + num_threads=num_threads, + fault_tol=fault_tol, + ) + except Exception as e: + raise RuntimeError(f"Failed to open video {video_name} with Decord. {e}") + + self._fps = self._av_reader.get_avg_fps() + + self._duration = float(len(self._av_reader)) / float(self._fps) + + @property + def name(self) -> Optional[str]: + """ + Returns: + name: the name of the stored video if set. + """ + return self._video_name + + @property + def duration(self) -> float: + """ + Returns: + duration: the video's duration/end-time in seconds. + """ + return self._duration + + def close(self): + if self._av_reader is not None: + del self._av_reader + self._av_reader = None + + def get_clip( + self, start_sec: float, end_sec: float + ) -> Dict[str, Optional[torch.Tensor]]: + """ + Retrieves frames from the encoded video at the specified start and end times + in seconds (the video always starts at 0 seconds). + + Args: + start_sec (float): the clip start time in seconds + end_sec (float): the clip end time in seconds + Returns: + clip_data: + A dictionary mapping the entries at "video" and "audio" to a tensors. + + "video": A tensor of the clip's RGB frames with shape: + (channel, time, height, width). The frames are of type torch.float32 and + in the range [0 - 255]. + + "audio": A tensor of the clip's audio samples with shape: + (samples). The samples are of type torch.float32 and + in the range [0 - 255]. + + Returns None if no video or audio found within time range. + + """ + if start_sec > end_sec or start_sec > self._duration: + raise RuntimeError( + f"Incorrect time window for Decord decoding for video: {self._video_name}." + ) + + start_idx = math.ceil(self._fps * start_sec) + end_idx = math.ceil(self._fps * end_sec) + end_idx = min(end_idx, len(self._av_reader)) + frame_idxs = list(range(start_idx, end_idx)) + + try: + outputs = self._av_reader.get_batch(frame_idxs) + except Exception as e: + logger.debug(f"Failed to decode video with Decord: {self._video_name}. {e}") + raise e + + video = outputs + + if video is not None: + video = video.to(torch.float32) + #Permute tensor from (time, height, weight, channel) to (channel, height, width, time). 
+ video = video.permute(3, 0, 1, 2) + + + return video class ImageBindImageProcessor(BaseImageProcessor): r""" @@ -551,7 +729,12 @@ def preprocess( ) else: pixel_values = [] + for video in videos: + if check_for_video_paths(videos): + video = encoded_video_from_path( + video, + ) if do_chunk: clips = self.chunk( video=video, @@ -607,3 +790,6 @@ def preprocess( pixel_values.append(_pixel_values) return BatchFeature(data={"pixel_values": pixel_values}, tensor_type=return_tensors) + + + diff --git a/src/transformers/models/imagebind/processing_imagebind.py b/src/transformers/models/imagebind/processing_imagebind.py index 1d8162852d24..fa79abb3d8a5 100644 --- a/src/transformers/models/imagebind/processing_imagebind.py +++ b/src/transformers/models/imagebind/processing_imagebind.py @@ -31,7 +31,7 @@ class ImageBindProcessorKwargs(ProcessingKwargs, total=False): class ImageBindProcessor(ProcessorMixin): r""" - Constructs a ImageBind processor which wraps a ImageBind image processor and feature extracotr and a CLIP tokenizer into a single processor. + Constructs a ImageBind processor which wraps a ImageBind image processor and feature extractor and a CLIP tokenizer into a single processor. [`ImageBindProcessor`] offers all the functionalities of [`ImageBindImageProcessor`], [`ImageBindFeatureExtractor`] and [`CLIPTokenizerFast`]. See the [`~ImageBindProcessor.__call__`] and [`~ImageBindProcessor.decode`] for more information. From d1c33d0ebb569005f4d40e2b5c9fdb3f8c453c57 Mon Sep 17 00:00:00 2001 From: RUFFY-369 Date: Sun, 4 Aug 2024 16:55:36 +0530 Subject: [PATCH 02/11] chore:update uniform_chunk_sampling() --- .../imagebind/image_processing_imagebind.py | 50 ++++++++++++++----- 1 file changed, 37 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/imagebind/image_processing_imagebind.py b/src/transformers/models/imagebind/image_processing_imagebind.py index 242553357e09..c11896e9e9c2 100644 --- a/src/transformers/models/imagebind/image_processing_imagebind.py +++ b/src/transformers/models/imagebind/image_processing_imagebind.py @@ -113,22 +113,46 @@ def uniform_chunk_sampling( Args: total_duration (float): Total duration of the audio/video. - chunk_duration (float): Duration of each chunk. - num_chunks (int): Number of chunks to sample. + chunk_duration (float): Duration of each chunk(clip duration). + num_chunks (int): Number of chunks to sample(number of clips per video). Returns: List[Tuple[float, float]]: List of tuples where each tuple contains the start and end time of a chunk. """ + _current_clip_index = 0 + _current_aug_index = 0 + _augs_per_clip: int = 1 + chunk_duration_fraction = Fraction(chunk_duration) - max_possible_clip_start = Fraction(max(total_duration - chunk_duration, 0)) + max_possible_clip_start = Fraction(max(total_duration - chunk_duration_fraction, 0)) # Previously chunk_duration was used instead of chunk_duration_fraction so that could be the reason for pixel values not matching uniform_clip = Fraction(max_possible_clip_start / max(num_chunks - 1, 1)) result = [] - for clip_index in range(num_chunks): - clip_start_sec = uniform_clip * clip_index + is_last_clip = False + while not is_last_clip: + clip_start_sec = uniform_clip * _current_clip_index + _current_aug_index += 1 + if _current_aug_index >= _augs_per_clip: + _current_clip_index += 1 + _current_aug_index = 0 + + # Last clip is True if sampled self._clips_per_video or if end of video is reached. 
+ is_last_clip = False + if ( + _current_clip_index >= num_chunks + or uniform_clip * _current_clip_index > max_possible_clip_start + ): + _current_clip_index = 0 + is_last_clip = True + + # reset + if is_last_clip: + _current_clip_index = 0 + _current_aug_index = 0 + clip_end_sec = clip_start_sec + chunk_duration_fraction result.append((clip_start_sec, clip_end_sec)) - + return result @@ -336,9 +360,9 @@ class ImageBindImageProcessor(BaseImageProcessor): do_chunk (`bool`, *optional*, defaults to `False`): Whether to chunk the video into multiple clips. chunk_duration (`float`, *optional*, defaults to 2.0): - Duration of each chunk in seconds. + Duration of each chunk in seconds(clip duration). num_chunks (`int`, *optional*, defaults to 5): - Number of chunks to sample. + Number of chunks to sample(number of clips per video). num_frames_per_chunk (`int`, *optional*, defaults to 2): Number of frames to sample per chunk. fps (`int`, *optional*, defaults to 30): @@ -481,13 +505,13 @@ def chunk( fps (`int`): Frame rate of the video chunk_duration (`float`): - Duration of each chunk. + Duration of each chunk(clip duration). num_chunks (`int`): - Number of chunks to sample. + Number of chunks to sample(number of clips per video). num_frames_per_chunk (`int`): Number of frames to sample per chunk. """ - video_duration = len(video) / fps + video_duration = video.duration # EncodedVideoDecord obj if video_duration < chunk_duration: logger.warning_once( "Chunk duration is greater than audio duration. Chunks will be repeated, consider adjusting either `chunk_duration` or `num_chunks`" @@ -646,9 +670,9 @@ def preprocess( do_chunk (`bool`, *optional*, defaults to `self.do_chunk`): Whether to chunk the video into multiple clips. chunk_duration (`float`, *optional*, defaults to `self.chunk_duration`): - Duration of each chunk in seconds. + Duration of each chunk in seconds(clip duration). num_chunks (`int`, *optional*, defaults to `self.num_chunks`): - Number of chunks to sample. + Number of chunks to sample(number of clips per video). num_frames_per_chunk (`int`, *optional*, defaults to `self.num_frames_per_chunk`): Number of frames to sample per chunk. fps (`int`, *optional*, defaults to `self.fps`): From 53fe0801154466504c7c0ea45bb0e4f080182640 Mon Sep 17 00:00:00 2001 From: RUFFY-369 Date: Sun, 4 Aug 2024 17:07:11 +0530 Subject: [PATCH 03/11] chore:change chunk duration val and type --- .../imagebind/image_processing_imagebind.py | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/imagebind/image_processing_imagebind.py b/src/transformers/models/imagebind/image_processing_imagebind.py index c11896e9e9c2..83e887755008 100644 --- a/src/transformers/models/imagebind/image_processing_imagebind.py +++ b/src/transformers/models/imagebind/image_processing_imagebind.py @@ -106,14 +106,14 @@ def make_batched_videos(videos) -> List[VideoInput]: # Copy from models.imagebind.feature_extraction_imagebind.uniform_chunk_sampling def uniform_chunk_sampling( - total_duration: float, chunk_duration: float, num_chunks: int + total_duration: float, chunk_duration: int, num_chunks: int ) -> List[Tuple[Fraction, Fraction]]: """ Uniformly sample `num_chunks` chunks of duration `chunk_duration` from an audio/video of total duration `total_duration`. Args: total_duration (float): Total duration of the audio/video. - chunk_duration (float): Duration of each chunk(clip duration). + chunk_duration (int): Duration of each chunk(clip duration). 
num_chunks (int): Number of chunks to sample(number of clips per video). Returns: @@ -359,7 +359,7 @@ class ImageBindImageProcessor(BaseImageProcessor): Whether to convert the image to RGB. do_chunk (`bool`, *optional*, defaults to `False`): Whether to chunk the video into multiple clips. - chunk_duration (`float`, *optional*, defaults to 2.0): + chunk_duration (`int`, *optional*, defaults to 2): Duration of each chunk in seconds(clip duration). num_chunks (`int`, *optional*, defaults to 5): Number of chunks to sample(number of clips per video). @@ -385,7 +385,7 @@ def __init__( image_std: Optional[Union[float, List[float]]] = None, do_convert_rgb: bool = True, do_chunk: bool = False, - chunk_duration: float = 2.0, + chunk_duration: int = 2, num_chunks: int = 5, num_frames_per_chunk: int = 2, fps: int = 30, @@ -494,7 +494,7 @@ def resize( ) def chunk( - self, video: VideoInput, fps: int, chunk_duration: float, num_chunks: int, num_frames_per_chunk: int + self, video: VideoInput, fps: int, chunk_duration: int, num_chunks: int, num_frames_per_chunk: int ) -> List[VideoInput]: """ Uniformly sample `num_chunks` chunks of duration `chunk_duration` from a video. @@ -504,7 +504,7 @@ def chunk( Video to chunk. fps (`int`): Frame rate of the video - chunk_duration (`float`): + chunk_duration (`int`): Duration of each chunk(clip duration). num_chunks (`int`): Number of chunks to sample(number of clips per video). @@ -522,7 +522,10 @@ def chunk( all_clips = [] for clip_timepoints in all_clips_timepoints: - video_clip = video[int(clip_timepoints[0] * fps) : int(clip_timepoints[1] * fps)] + # Read the clip, get frames + video_clip = video.get_clip(clip_timepoints[0], clip_timepoints[1]) + if video_clip is None: + raise ValueError("No clip found") video_clip = uniform_temporal_subsample(video_clip, num_samples=num_frames_per_chunk) all_clips.append(video_clip) @@ -621,7 +624,7 @@ def preprocess( image_std: Optional[Union[float, List[float]]] = None, do_convert_rgb: bool = None, do_chunk: bool = None, - chunk_duration: float = None, + chunk_duration: int = None, num_chunks: int = None, num_frames_per_chunk: int = None, fps: int = None, @@ -669,7 +672,7 @@ def preprocess( Whether to convert the image to RGB. do_chunk (`bool`, *optional*, defaults to `self.do_chunk`): Whether to chunk the video into multiple clips. - chunk_duration (`float`, *optional*, defaults to `self.chunk_duration`): + chunk_duration (`int`, *optional*, defaults to `self.chunk_duration`): Duration of each chunk in seconds(clip duration). num_chunks (`int`, *optional*, defaults to `self.num_chunks`): Number of chunks to sample(number of clips per video). From 99306ab091974137eca9055409fdf59c04457312 Mon Sep 17 00:00:00 2001 From: RUFFY-369 Date: Sun, 4 Aug 2024 18:34:50 +0530 Subject: [PATCH 04/11] chore:update uniform_temporal_subsample() --- .../imagebind/image_processing_imagebind.py | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/imagebind/image_processing_imagebind.py b/src/transformers/models/imagebind/image_processing_imagebind.py index 83e887755008..7c3b820394f5 100644 --- a/src/transformers/models/imagebind/image_processing_imagebind.py +++ b/src/transformers/models/imagebind/image_processing_imagebind.py @@ -169,13 +169,21 @@ def uniform_temporal_subsample(video: VideoInput, num_samples: int) -> VideoInpu num_samples (`int`): Number of frames to sample. 
""" - num_frames = len(video) + # num_frames = len(video) - # Sample by nearest neighbor interpolation if num_samples > t. - indices = np.linspace(0, num_frames - 1, num_samples) - indices = np.clip(indices, 0, num_frames - 1).astype(int) + # # Sample by nearest neighbor interpolation if num_samples > t. + # indices = np.linspace(0, num_frames - 1, num_samples) + # indices = np.clip(indices, 0, num_frames - 1).astype(int) - return [video[i] for i in indices] + # return [video[i] for i in indices] + + temporal_dim: int = -3 + num_frames = video.shape[temporal_dim] + assert num_samples > 0 and num_frames > 0 + # Sample by nearest neighbor interpolation if num_samples > num_frames. + indices = torch.linspace(0, num_frames - 1, num_samples) + indices = torch.clamp(indices, 0, num_frames - 1).long() + return torch.index_select(video, temporal_dim, indices) #Adapted from https://github.com/facebookresearch/pytorchvideo/blob/1fadaef40dd393ca09680f55582399f4679fc9b7/pytorchvideo/data/encoded_video_decord.py#L28 class EncodedVideoDecord(): @@ -509,7 +517,7 @@ def chunk( num_chunks (`int`): Number of chunks to sample(number of clips per video). num_frames_per_chunk (`int`): - Number of frames to sample per chunk. + Number of frames to sample per chunk.######(WHY IS IT DEFINED WHEN chunk_duration can fulfill its purpose?)###### """ video_duration = video.duration # EncodedVideoDecord obj if video_duration < chunk_duration: @@ -526,7 +534,8 @@ def chunk( video_clip = video.get_clip(clip_timepoints[0], clip_timepoints[1]) if video_clip is None: raise ValueError("No clip found") - video_clip = uniform_temporal_subsample(video_clip, num_samples=num_frames_per_chunk) + video_clip = uniform_temporal_subsample(video_clip, num_samples=chunk_duration) + video_clip = video_clip / 255.0 # since this is float, need 0-1 all_clips.append(video_clip) return all_clips From 082be8b8c202ef55d732d27485243ed3c035e0f9 Mon Sep 17 00:00:00 2001 From: RUFFY-369 Date: Sun, 4 Aug 2024 22:31:21 +0530 Subject: [PATCH 05/11] chore:update video transforms and few nits --- .../imagebind/image_processing_imagebind.py | 247 +++++++++++++++--- 1 file changed, 215 insertions(+), 32 deletions(-) diff --git a/src/transformers/models/imagebind/image_processing_imagebind.py b/src/transformers/models/imagebind/image_processing_imagebind.py index 7c3b820394f5..920e675b5a57 100644 --- a/src/transformers/models/imagebind/image_processing_imagebind.py +++ b/src/transformers/models/imagebind/image_processing_imagebind.py @@ -21,6 +21,9 @@ import pathlib from pathlib import Path import torch +import torch.nn as nn +from torchvision import transforms +from torchvision.transforms._transforms_video import NormalizeVideo from typing import BinaryIO, Dict, List, Optional, Tuple, Union import numpy as np @@ -185,6 +188,125 @@ def uniform_temporal_subsample(video: VideoInput, num_samples: int) -> VideoInpu indices = torch.clamp(indices, 0, num_frames - 1).long() return torch.index_select(video, temporal_dim, indices) +def crop_boxes(boxes, x_offset, y_offset): + """ + Perform crop on the bounding boxes given the offsets. + Args: + boxes (ndarray or None): bounding boxes to perform crop. The dimension + is `num boxes` x 4. + x_offset (int): cropping offset in the x axis. + y_offset (int): cropping offset in the y axis. + Returns: + cropped_boxes (ndarray or None): the cropped boxes with dimension of + `num boxes` x 4. 
+ """ + cropped_boxes = boxes.copy() + cropped_boxes[:, [0, 2]] = boxes[:, [0, 2]] - x_offset + cropped_boxes[:, [1, 3]] = boxes[:, [1, 3]] - y_offset + + return cropped_boxes + +def uniform_crop(images, size, spatial_idx, boxes=None, scale_size=None): + """ + Perform uniform spatial sampling on the images and corresponding boxes. + Args: + images (tensor): images to perform uniform crop. The dimension is + `num frames` x `channel` x `height` x `width`. + size (int): size of height and weight to crop the images. + spatial_idx (int): 0, 1, or 2 for left, center, and right crop if width + is larger than height. Or 0, 1, or 2 for top, center, and bottom + crop if height is larger than width. + boxes (ndarray or None): optional. Corresponding boxes to images. + Dimension is `num boxes` x 4. + scale_size (int): optinal. If not None, resize the images to scale_size before + performing any crop. + Returns: + cropped (tensor): images with dimension of + `num frames` x `channel` x `size` x `size`. + cropped_boxes (ndarray or None): the cropped boxes with dimension of + `num boxes` x 4. + """ + assert spatial_idx in [0, 1, 2] + ndim = len(images.shape) + if ndim == 3: + images = images.unsqueeze(0) + height = images.shape[2] + width = images.shape[3] + + if scale_size is not None: + if width <= height: + width, height = scale_size, int(height / width * scale_size) + else: + width, height = int(width / height * scale_size), scale_size + images = torch.nn.functional.interpolate( + images, + size=(height, width), + mode="bilinear", + align_corners=False, + ) + + y_offset = int(math.ceil((height - size) / 2)) + x_offset = int(math.ceil((width - size) / 2)) + + if height > width: + if spatial_idx == 0: + y_offset = 0 + elif spatial_idx == 2: + y_offset = height - size + else: + if spatial_idx == 0: + x_offset = 0 + elif spatial_idx == 2: + x_offset = width - size + cropped = images[:, :, y_offset : y_offset + size, x_offset : x_offset + size] + cropped_boxes = crop_boxes(boxes, x_offset, y_offset) if boxes is not None else None + if ndim == 3: + cropped = cropped.squeeze(0) + return cropped, cropped_boxes + + +class SpatialCrop(nn.Module): + """ + Convert the video into 3 smaller clips spatially. Must be used after the + temporal crops to get spatial crops, and should be used with + -2 in the spatial crop at the slowfast augmentation stage (so full + frames are passed in here). Will return a larger list with the + 3x spatial crops as well. + """ + + def __init__(self, crop_size: int = 224, num_crops: int = 3): + super().__init__() + self.crop_size = crop_size + if num_crops == 3: + self.crops_to_ext = [0, 1, 2] + self.flipped_crops_to_ext = [] + elif num_crops == 1: + self.crops_to_ext = [1] + self.flipped_crops_to_ext = [] + else: + raise NotImplementedError("Nothing else supported yet") + + def forward(self, videos): + """ + Args: + videos: A list of C, T, H, W videos. + Returns: + videos: A list with 3x the number of elements. Each video converted + to C, T, H', W' by spatial cropping. 
+ """ + assert isinstance(videos, list), "Must be a list of videos after temporal crops" + assert all([video.ndim == 4 for video in videos]), "Must be (C,T,H,W)" + res = [] + for video in videos: + for spatial_idx in self.crops_to_ext: + res.append(uniform_crop(video, self.crop_size, spatial_idx)[0]) + if not self.flipped_crops_to_ext: + continue + flipped_video = transforms.functional.hflip(video) + for spatial_idx in self.flipped_crops_to_ext: + res.append(uniform_crop(flipped_video, self.crop_size, spatial_idx)[0]) + return res + #Adapted from https://github.com/facebookresearch/pytorchvideo/blob/1fadaef40dd393ca09680f55582399f4679fc9b7/pytorchvideo/data/encoded_video_decord.py#L28 class EncodedVideoDecord(): """ @@ -501,6 +623,47 @@ def resize( **kwargs, ) + #Adapted from https://github.com/facebookresearch/pytorchvideo/blob/1fadaef40dd393ca09680f55582399f4679fc9b7/pytorchvideo/transforms/functional.py#L92 + def short_side_scale( + self, + x: torch.Tensor, + size: int = 224, + interpolation: str = "bilinear", + backend: str = "pytorch", + ) -> torch.Tensor: + """ + Determines the shorter spatial dim of the video (i.e. width or height) and scales + it to the given size. To maintain aspect ratio, the longer side is then scaled + accordingly. + Args: + x (torch.Tensor): A video tensor of shape (C, T, H, W) and type torch.float32. + size (int): The size the shorter side is scaled to. + interpolation (str): Algorithm used for upsampling, + options: nearest' | 'linear' | 'bilinear' | 'bicubic' | 'trilinear' | 'area' + backend (str): backend used to perform interpolation. Options includes + `pytorch` as default, and `opencv`. Note that opencv and pytorch behave + differently on linear interpolation on some versions. + https://discuss.pytorch.org/t/pytorch-linear-interpolation-is-different-from-pil-opencv/71181 + Returns: + An x-like Tensor with scaled spatial dims. + """ # noqa + assert len(x.shape) == 4 + assert x.dtype == torch.float32 + _, _, h, w = x.shape + if w < h: + new_h = int(math.floor((float(h) / w) * size)) + new_w = size + else: + new_h = size + new_w = int(math.floor((float(w) / h) * size)) + if backend == "pytorch": + return torch.nn.functional.interpolate( + x, size=(new_h, new_w), mode=interpolation, align_corners=False + ) + else: + raise NotImplementedError(f"{backend} backend not supported.") + + def chunk( self, video: VideoInput, fps: int, chunk_duration: int, num_chunks: int, num_frames_per_chunk: int ) -> List[VideoInput]: @@ -544,6 +707,7 @@ def chunk( def _preprocess_image( self, images: ImageInput, + is_video: bool = None, do_resize: bool = None, size: Dict[str, int] = None, resample: PILImageResampling = None, @@ -571,48 +735,58 @@ def _preprocess_image( resample=resample, ) - if do_convert_rgb: + if do_convert_rgb and not is_video: images = [convert_to_rgb(image) for image in images] # All transformations expect numpy arrays. - images = [to_numpy_array(image) for image in images] + if not is_video: + images = [to_numpy_array(image) for image in images] - if is_scaled_image(images[0]) and do_rescale: + if is_scaled_image(images[0]) and do_rescale and not is_video: logger.warning_once( "It looks like you are trying to rescale already rescaled images. If the input" " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." ) - if input_data_format is None: + if input_data_format is None and not is_video: # We assume that all images have the same channel dimension format. 
input_data_format = infer_channel_dimension_format(images[0]) - if do_resize: - images = [ - self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) - for image in images - ] - - if do_center_crop: - images = [ - self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) for image in images - ] + if not is_video: + if do_resize: + images = [ + self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + for image in images + ] + + if do_center_crop: + images = [ + self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) for image in images + ] + + if do_rescale: + images = [ + self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + for image in images + ] + + if do_normalize: + images = [ + self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) + for image in images + ] - if do_rescale: images = [ - self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) - for image in images + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images ] - - if do_normalize: - images = [ - self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) - for image in images - ] - - images = [ - to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images - ] + else: + if do_resize: + images = self.short_side_scale(images) + if do_normalize: + images = NormalizeVideo( + mean=image_mean, + std=image_std, + ), return images @@ -734,8 +908,10 @@ def preprocess( fps = fps if fps is not None else self.fps if images is not None: + is_video = True images = make_list_of_images(images) - if videos is not None: + if videos is not None and (not check_for_video_paths(videos)): + is_video = True videos = make_batched_videos(videos) validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys) @@ -749,6 +925,7 @@ def preprocess( if images is not None: pixel_values = self._preprocess_image( images=images, + is_video = is_video, do_resize=do_resize, size=size, resample=resample, @@ -768,7 +945,8 @@ def preprocess( for video in videos: if check_for_video_paths(videos): - video = encoded_video_from_path( + is_video = True + video = encoded_video_from_path( video, ) if do_chunk: @@ -783,6 +961,7 @@ def preprocess( _pixel_values = [ self._preprocess_image( images=clip, + is_video = is_video, do_resize=do_resize, size=size, resample=PILImageResampling.BILINEAR, @@ -803,6 +982,7 @@ def preprocess( _pixel_values = [ self._preprocess_image( images=video, + is_video = is_video, do_resize=do_resize, size=size, resample=resample, @@ -819,11 +999,14 @@ def preprocess( ) ] + _pixel_values = SpatialCrop(224, num_crops=3)(_pixel_values) # Avoid List[List[List[np.ndarray]]] - _pixel_values = np.stack(_pixel_values) - # Make it shape (num_chunks, num_channels, num_frames_per_chunk, height, width) - _pixel_values = np.swapaxes(_pixel_values, 1, 2) + _pixel_values = torch.stack(_pixel_values, dim = 0) pixel_values.append(_pixel_values) + # _pixel_values = np.stack(_pixel_values) + # # Make it shape (num_chunks, num_channels, num_frames_per_chunk, height, width) + # _pixel_values = np.swapaxes(_pixel_values, 1, 2) + # pixel_values.append(_pixel_values) return BatchFeature(data={"pixel_values": pixel_values}, tensor_type=return_tensors) From 
1d6c4ea8cdf8a2ed8725be5d571a8f117c37d73a Mon Sep 17 00:00:00 2001 From: RUFFY-369 Date: Mon, 5 Aug 2024 02:22:00 +0530 Subject: [PATCH 06/11] fix:bug in image processor call on video paths --- .../imagebind/image_processing_imagebind.py | 51 ++++++++++--------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/src/transformers/models/imagebind/image_processing_imagebind.py b/src/transformers/models/imagebind/image_processing_imagebind.py index 920e675b5a57..117bd83e1700 100644 --- a/src/transformers/models/imagebind/image_processing_imagebind.py +++ b/src/transformers/models/imagebind/image_processing_imagebind.py @@ -56,12 +56,13 @@ logger = logging.get_logger(__name__) +decord.bridge.set_bridge("torch") if is_vision_available(): import PIL def check_for_video_paths(videos) -> bool: - return (isinstance(videos, list) and all(isinstance(video, Path) and mimetypes.guess_type(video)[0].startswith('video/') for video in videos)) + return (isinstance(videos, list) and all(isinstance(video, str) and mimetypes.guess_type(video)[0].startswith('video/') for video in videos)) #Adapted from https://github.com/facebookresearch/pytorchvideo/blob/1fadaef40dd393ca09680f55582399f4679fc9b7/pytorchvideo/data/encoded_video.py#L42 def encoded_video_from_path(video_path): @@ -295,14 +296,14 @@ def forward(self, videos): to C, T, H', W' by spatial cropping. """ assert isinstance(videos, list), "Must be a list of videos after temporal crops" - assert all([video.ndim == 4 for video in videos]), "Must be (C,T,H,W)" + assert all([video[0].ndim == 4 for video in videos]), "Must be (C,T,H,W)" res = [] for video in videos: for spatial_idx in self.crops_to_ext: - res.append(uniform_crop(video, self.crop_size, spatial_idx)[0]) + res.append(uniform_crop(video[0], self.crop_size, spatial_idx)[0]) if not self.flipped_crops_to_ext: continue - flipped_video = transforms.functional.hflip(video) + flipped_video = transforms.functional.hflip(video[0]) for spatial_idx in self.flipped_crops_to_ext: res.append(uniform_crop(flipped_video, self.crop_size, spatial_idx)[0]) return res @@ -735,22 +736,23 @@ def _preprocess_image( resample=resample, ) - if do_convert_rgb and not is_video: - images = [convert_to_rgb(image) for image in images] + if not is_video: + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] # All transformations expect numpy arrays. if not is_video: images = [to_numpy_array(image) for image in images] - - if is_scaled_image(images[0]) and do_rescale and not is_video: - logger.warning_once( - "It looks like you are trying to rescale already rescaled images. If the input" - " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." - ) - - if input_data_format is None and not is_video: - # We assume that all images have the same channel dimension format. - input_data_format = infer_channel_dimension_format(images[0]) + if not is_video: + if is_scaled_image(images[0]) and do_rescale: + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + if not is_video: + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
+ input_data_format = infer_channel_dimension_format(images[0]) if not is_video: if do_resize: @@ -786,7 +788,7 @@ def _preprocess_image( images = NormalizeVideo( mean=image_mean, std=image_std, - ), + )(images), return images @@ -908,7 +910,7 @@ def preprocess( fps = fps if fps is not None else self.fps if images is not None: - is_video = True + is_video = False images = make_list_of_images(images) if videos is not None and (not check_for_video_paths(videos)): is_video = True @@ -916,11 +918,12 @@ def preprocess( validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys) - if (videos is not None and not valid_images(videos)) or (images is not None and not valid_images(images)): - raise ValueError( - "Invalid input type. Must be of type PIL.Image.Image, numpy.ndarray, " - "torch.Tensor, tf.Tensor or jax.ndarray." - ) + if not check_for_video_paths(videos): + if (videos is not None and not valid_images(videos)) or (images is not None and not valid_images(images)): + raise ValueError( + "Invalid input type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) if images is not None: pixel_values = self._preprocess_image( @@ -1007,7 +1010,7 @@ def preprocess( # # Make it shape (num_chunks, num_channels, num_frames_per_chunk, height, width) # _pixel_values = np.swapaxes(_pixel_values, 1, 2) # pixel_values.append(_pixel_values) - + pixel_values = torch.stack(pixel_values, dim=0) return BatchFeature(data={"pixel_values": pixel_values}, tensor_type=return_tensors) From 64d6c38b175948c02f0d62c3505fc822c46ce07f Mon Sep 17 00:00:00 2001 From: RUFFY-369 Date: Tue, 6 Aug 2024 16:50:21 +0530 Subject: [PATCH 07/11] chore:revert to original to test for unmatched outputs --- .../imagebind/image_processing_imagebind.py | 1592 ++++++++++++----- 1 file changed, 1107 insertions(+), 485 deletions(-) diff --git a/src/transformers/models/imagebind/image_processing_imagebind.py b/src/transformers/models/imagebind/image_processing_imagebind.py index 117bd83e1700..4b5b4bae053b 100644 --- a/src/transformers/models/imagebind/image_processing_imagebind.py +++ b/src/transformers/models/imagebind/image_processing_imagebind.py @@ -1,3 +1,1031 @@ +# # Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# # +# # Licensed under the Apache License, Version 2.0 (the "License"); +# # you may not use this file except in compliance with the License. +# # You may obtain a copy of the License at +# # +# # http://www.apache.org/licenses/LICENSE-2.0 +# # +# # Unless required by applicable law or agreed to in writing, software +# # distributed under the License is distributed on an "AS IS" BASIS, +# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# # See the License for the specific language governing permissions and +# # limitations under the License. 
+# """Image processor class for ImageBind.""" + +# import decord +# from fractions import Fraction +# import io +# import math +# import mimetypes +# import pathlib +# from pathlib import Path +# import torch +# import torch.nn as nn +# from torchvision import transforms +# from torchvision.transforms._transforms_video import NormalizeVideo +# from typing import BinaryIO, Dict, List, Optional, Tuple, Union + +# import numpy as np + +# from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +# from ...image_transforms import ( +# convert_to_rgb, +# get_resize_output_image_size, +# resize, +# to_channel_dimension_format, +# ) +# from ...image_utils import ( +# OPENAI_CLIP_MEAN, +# OPENAI_CLIP_STD, +# ChannelDimension, +# ImageInput, +# PILImageResampling, +# VideoInput, +# infer_channel_dimension_format, +# is_scaled_image, +# is_valid_image, +# make_list_of_images, +# to_numpy_array, +# valid_images, +# validate_kwargs, +# validate_preprocess_arguments, +# ) +# from ...utils import TensorType, is_vision_available, logging + + +# logger = logging.get_logger(__name__) + +# decord.bridge.set_bridge("torch") + +# if is_vision_available(): +# import PIL + +# # def check_for_video_paths(videos) -> bool: +# # return (isinstance(videos, list) and all(isinstance(video, str) and mimetypes.guess_type(video)[0].startswith('video/') for video in videos)) + +# #Adapted from https://github.com/facebookresearch/pytorchvideo/blob/1fadaef40dd393ca09680f55582399f4679fc9b7/pytorchvideo/data/encoded_video.py#L42 +# def encoded_video_from_path(video_path): +# """ +# Fetches the given video path using PathManager (allowing remote uris to be +# fetched) and constructs the EncodedVideo object. + +# Args: +# file_path (str): a PathManager file-path. +# """ +# video_path = Path(video_path) +# if video_path.is_file(): +# with video_path.open('rb') as file: +# video_file = io.BytesIO(file.read()) +# else: +# raise FileNotFoundError(f"{video_path} does not exist or is not a file") + +# sample_rate=16000 +# video = EncodedVideoDecord( +# file=video_file, +# video_name=pathlib.Path(video_path).name, +# decode_video=True, +# decode_audio=False, +# **{"sample_rate": sample_rate}, +# ) +# return video + + +# # Copy from models.video_llava.image_processing_video_llava.make_batched_videos +# def make_batched_videos(videos) -> List[VideoInput]: +# if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): +# return videos + +# elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): +# if isinstance(videos[0], PIL.Image.Image): +# return [videos] +# elif len(videos[0].shape) == 4: +# return [list(video) for video in videos] + +# elif is_valid_image(videos) and len(videos.shape) == 4: +# return [list(videos)] + +# raise ValueError(f"Could not make batched video from {videos}") + + +# # Copy from models.imagebind.feature_extraction_imagebind.uniform_chunk_sampling +# def uniform_chunk_sampling( +# total_duration: float, chunk_duration: int, num_chunks: int +# ) -> List[Tuple[Fraction, Fraction]]: +# """ +# Uniformly sample `num_chunks` chunks of duration `chunk_duration` from an audio/video of total duration `total_duration`. + +# Args: +# total_duration (float): Total duration of the audio/video. +# chunk_duration (int): Duration of each chunk(clip duration). +# num_chunks (int): Number of chunks to sample(number of clips per video). 
+ +# Returns: +# List[Tuple[float, float]]: List of tuples where each tuple contains the start and end time of a chunk. +# """ +# _current_clip_index = 0 +# _current_aug_index = 0 +# _augs_per_clip: int = 1 + +# chunk_duration_fraction = Fraction(chunk_duration) +# max_possible_clip_start = Fraction(max(total_duration - chunk_duration_fraction, 0)) # Previously chunk_duration was used instead of chunk_duration_fraction so that could be the reason for pixel values not matching +# uniform_clip = Fraction(max_possible_clip_start / max(num_chunks - 1, 1)) + +# result = [] +# is_last_clip = False +# while not is_last_clip: +# clip_start_sec = uniform_clip * _current_clip_index +# _current_aug_index += 1 +# if _current_aug_index >= _augs_per_clip: +# _current_clip_index += 1 +# _current_aug_index = 0 + +# # Last clip is True if sampled self._clips_per_video or if end of video is reached. +# is_last_clip = False +# if ( +# _current_clip_index >= num_chunks +# or uniform_clip * _current_clip_index > max_possible_clip_start +# ): +# _current_clip_index = 0 +# is_last_clip = True + +# # reset +# if is_last_clip: +# _current_clip_index = 0 +# _current_aug_index = 0 + +# clip_end_sec = clip_start_sec + chunk_duration_fraction +# result.append((clip_start_sec, clip_end_sec)) + +# return result + + +# # Adapted from https://github.com/facebookresearch/pytorchvideo/blob/a0a131e/pytorchvideo/transforms/functional.py#L19 +# def uniform_temporal_subsample(video: VideoInput, num_samples: int) -> VideoInput: +# """ +# Uniformly subsamples num_samples indices from the temporal dimension of the video. +# When num_samples is larger than the size of temporal dimension of the video, it +# will sample frames based on nearest neighbor interpolation. + +# Args: +# video (`VideoInput`): +# Video to subsample. +# num_samples (`int`): +# Number of frames to sample. +# """ +# # num_frames = len(video) + +# # # Sample by nearest neighbor interpolation if num_samples > t. +# # indices = np.linspace(0, num_frames - 1, num_samples) +# # indices = np.clip(indices, 0, num_frames - 1).astype(int) + +# # return [video[i] for i in indices] + +# temporal_dim: int = -3 +# num_frames = video.shape[temporal_dim] +# assert num_samples > 0 and num_frames > 0 +# # Sample by nearest neighbor interpolation if num_samples > num_frames. +# indices = torch.linspace(0, num_frames - 1, num_samples) +# indices = torch.clamp(indices, 0, num_frames - 1).long() +# return torch.index_select(video, temporal_dim, indices) + +# def crop_boxes(boxes, x_offset, y_offset): +# """ +# Perform crop on the bounding boxes given the offsets. +# Args: +# boxes (ndarray or None): bounding boxes to perform crop. The dimension +# is `num boxes` x 4. +# x_offset (int): cropping offset in the x axis. +# y_offset (int): cropping offset in the y axis. +# Returns: +# cropped_boxes (ndarray or None): the cropped boxes with dimension of +# `num boxes` x 4. +# """ +# cropped_boxes = boxes.copy() +# cropped_boxes[:, [0, 2]] = boxes[:, [0, 2]] - x_offset +# cropped_boxes[:, [1, 3]] = boxes[:, [1, 3]] - y_offset + +# return cropped_boxes + +# def uniform_crop(images, size, spatial_idx, boxes=None, scale_size=None): +# """ +# Perform uniform spatial sampling on the images and corresponding boxes. +# Args: +# images (tensor): images to perform uniform crop. The dimension is +# `num frames` x `channel` x `height` x `width`. +# size (int): size of height and weight to crop the images. 
+# spatial_idx (int): 0, 1, or 2 for left, center, and right crop if width +# is larger than height. Or 0, 1, or 2 for top, center, and bottom +# crop if height is larger than width. +# boxes (ndarray or None): optional. Corresponding boxes to images. +# Dimension is `num boxes` x 4. +# scale_size (int): optinal. If not None, resize the images to scale_size before +# performing any crop. +# Returns: +# cropped (tensor): images with dimension of +# `num frames` x `channel` x `size` x `size`. +# cropped_boxes (ndarray or None): the cropped boxes with dimension of +# `num boxes` x 4. +# """ +# assert spatial_idx in [0, 1, 2] +# ndim = len(images.shape) +# if ndim == 3: +# images = images.unsqueeze(0) +# height = images.shape[2] +# width = images.shape[3] + +# if scale_size is not None: +# if width <= height: +# width, height = scale_size, int(height / width * scale_size) +# else: +# width, height = int(width / height * scale_size), scale_size +# images = torch.nn.functional.interpolate( +# images, +# size=(height, width), +# mode="bilinear", +# align_corners=False, +# ) + +# y_offset = int(math.ceil((height - size) / 2)) +# x_offset = int(math.ceil((width - size) / 2)) + +# if height > width: +# if spatial_idx == 0: +# y_offset = 0 +# elif spatial_idx == 2: +# y_offset = height - size +# else: +# if spatial_idx == 0: +# x_offset = 0 +# elif spatial_idx == 2: +# x_offset = width - size +# cropped = images[:, :, y_offset : y_offset + size, x_offset : x_offset + size] +# cropped_boxes = crop_boxes(boxes, x_offset, y_offset) if boxes is not None else None +# if ndim == 3: +# cropped = cropped.squeeze(0) +# return cropped, cropped_boxes + + +# class SpatialCrop(nn.Module): +# """ +# Convert the video into 3 smaller clips spatially. Must be used after the +# temporal crops to get spatial crops, and should be used with +# -2 in the spatial crop at the slowfast augmentation stage (so full +# frames are passed in here). Will return a larger list with the +# 3x spatial crops as well. +# """ + +# def __init__(self, crop_size: int = 224, num_crops: int = 3): +# super().__init__() +# self.crop_size = crop_size +# if num_crops == 3: +# self.crops_to_ext = [0, 1, 2] +# self.flipped_crops_to_ext = [] +# elif num_crops == 1: +# self.crops_to_ext = [1] +# self.flipped_crops_to_ext = [] +# else: +# raise NotImplementedError("Nothing else supported yet") + +# def forward(self, videos): +# """ +# Args: +# videos: A list of C, T, H, W videos. +# Returns: +# videos: A list with 3x the number of elements. Each video converted +# to C, T, H', W' by spatial cropping. +# """ +# assert isinstance(videos, list), "Must be a list of videos after temporal crops" +# assert all([video[0].ndim == 4 for video in videos]), "Must be (C,T,H,W)" +# res = [] +# for video in videos: +# for spatial_idx in self.crops_to_ext: +# res.append(uniform_crop(video[0], self.crop_size, spatial_idx)[0]) +# if not self.flipped_crops_to_ext: +# continue +# flipped_video = transforms.functional.hflip(video[0]) +# for spatial_idx in self.flipped_crops_to_ext: +# res.append(uniform_crop(flipped_video, self.crop_size, spatial_idx)[0]) +# return res + +# #Adapted from https://github.com/facebookresearch/pytorchvideo/blob/1fadaef40dd393ca09680f55582399f4679fc9b7/pytorchvideo/data/encoded_video_decord.py#L28 +# class EncodedVideoDecord(): +# """ + +# Accessing clips from an encoded video using Decord video reading API +# as the decoding backend. 
For more details, please refer to - +# `Decord ` +# """ + +# def __init__( +# self, +# file: BinaryIO, +# video_name: Optional[str] = None, +# decode_video: bool = True, +# decode_audio: bool = False, +# sample_rate: int = 44100, +# mono: bool = True, +# width: int = -1, +# height: int = -1, +# num_threads: int = 0, +# fault_tol: int = -1, +# ) -> None: +# """ +# Args: +# file (BinaryIO): a file-like object (e.g. io.BytesIO or io.StringIO) that +# contains the encoded video. +# video_name (str): An optional name assigned to the video. +# decode_video (bool): If disabled, video is not decoded. +# decode_audio (bool): If disabled, audio is not decoded. +# sample_rate: int, default is -1 +# Desired output sample rate of the audio, unchanged if `-1` is specified. +# mono: bool, default is True +# Desired output channel layout of the audio. `True` is mono layout. `False` +# is unchanged. +# width : int, default is -1 +# Desired output width of the video, unchanged if `-1` is specified. +# height : int, default is -1 +# Desired output height of the video, unchanged if `-1` is specified. +# num_threads : int, default is 0 +# Number of decoding thread, auto if `0` is specified. +# fault_tol : int, default is -1 +# The threshold of corrupted and recovered frames. This is to prevent silent fault +# tolerance when for example 50% frames of a video cannot be decoded and duplicate +# frames are returned. You may find the fault tolerant feature sweet in many +# cases, but not for training models. Say `N = # recovered frames` +# If `fault_tol` < 0, nothing will happen. +# If 0 < `fault_tol` < 1.0, if N > `fault_tol * len(video)`, +# raise `DECORDLimitReachedError`. +# If 1 < `fault_tol`, if N > `fault_tol`, raise `DECORDLimitReachedError`. +# """ +# if not decode_video: +# raise NotImplementedError() + +# self._video_name = video_name + +# try: +# self._av_reader = decord.VideoReader( +# uri=file, +# ctx=decord.cpu(0), +# width=width, +# height=height, +# num_threads=num_threads, +# fault_tol=fault_tol, +# ) +# except Exception as e: +# raise RuntimeError(f"Failed to open video {video_name} with Decord. {e}") + +# self._fps = self._av_reader.get_avg_fps() + +# self._duration = float(len(self._av_reader)) / float(self._fps) + +# @property +# def name(self) -> Optional[str]: +# """ +# Returns: +# name: the name of the stored video if set. +# """ +# return self._video_name + +# @property +# def duration(self) -> float: +# """ +# Returns: +# duration: the video's duration/end-time in seconds. +# """ +# return self._duration + +# def close(self): +# if self._av_reader is not None: +# del self._av_reader +# self._av_reader = None + +# def get_clip( +# self, start_sec: float, end_sec: float +# ) -> Dict[str, Optional[torch.Tensor]]: +# """ +# Retrieves frames from the encoded video at the specified start and end times +# in seconds (the video always starts at 0 seconds). + +# Args: +# start_sec (float): the clip start time in seconds +# end_sec (float): the clip end time in seconds +# Returns: +# clip_data: +# A dictionary mapping the entries at "video" and "audio" to a tensors. + +# "video": A tensor of the clip's RGB frames with shape: +# (channel, time, height, width). The frames are of type torch.float32 and +# in the range [0 - 255]. + +# "audio": A tensor of the clip's audio samples with shape: +# (samples). The samples are of type torch.float32 and +# in the range [0 - 255]. + +# Returns None if no video or audio found within time range. 
+ +# """ +# if start_sec > end_sec or start_sec > self._duration: +# raise RuntimeError( +# f"Incorrect time window for Decord decoding for video: {self._video_name}." +# ) + +# start_idx = math.ceil(self._fps * start_sec) +# end_idx = math.ceil(self._fps * end_sec) +# end_idx = min(end_idx, len(self._av_reader)) +# frame_idxs = list(range(start_idx, end_idx)) + +# try: +# outputs = self._av_reader.get_batch(frame_idxs) +# except Exception as e: +# logger.debug(f"Failed to decode video with Decord: {self._video_name}. {e}") +# raise e + +# video = outputs + +# if video is not None: +# video = video.to(torch.float32) +# #Permute tensor from (time, height, weight, channel) to (channel, height, width, time). +# video = video.permute(3, 0, 1, 2) + + +# return video + +# class ImageBindImageProcessor(BaseImageProcessor): +# r""" +# Constructs an ImageBind image processor. + +# Args: +# do_resize (`bool`, *optional*, defaults to `True`): +# Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by +# `do_resize` in the `preprocess` method. +# size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`): +# Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with +# the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess` +# method. +# resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): +# Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method. +# do_center_crop (`bool`, *optional*, defaults to `True`): +# Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the +# `preprocess` method. +# crop_size (`Dict[str, int]` *optional*, defaults to 224): +# Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess` +# method. +# do_rescale (`bool`, *optional*, defaults to `True`): +# Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in +# the `preprocess` method. +# rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): +# Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess` +# method. +# do_normalize (`bool`, *optional*, defaults to `True`): +# Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method. +# image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`): +# Mean to use if normalizing the image. This is a float or list of floats the length of the number of +# channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. +# image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`): +# Standard deviation to use if normalizing the image. This is a float or list of floats the length of the +# number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. +# Can be overridden by the `image_std` parameter in the `preprocess` method. +# do_convert_rgb (`bool`, *optional*, defaults to `True`): +# Whether to convert the image to RGB. +# do_chunk (`bool`, *optional*, defaults to `False`): +# Whether to chunk the video into multiple clips. +# chunk_duration (`int`, *optional*, defaults to 2): +# Duration of each chunk in seconds(clip duration). 
+# num_chunks (`int`, *optional*, defaults to 5): +# Number of chunks to sample(number of clips per video). +# num_frames_per_chunk (`int`, *optional*, defaults to 2): +# Number of frames to sample per chunk. +# fps (`int`, *optional*, defaults to 30): +# Frame rate of the video. It's assumed that all videos have the same frame rate. +# """ + +# model_input_names = ["pixel_values"] + +# def __init__( +# self, +# do_resize: bool = True, +# size: Dict[str, int] = None, +# resample: PILImageResampling = PILImageResampling.BICUBIC, +# do_center_crop: bool = True, +# crop_size: Dict[str, int] = None, +# do_rescale: bool = True, +# rescale_factor: Union[int, float] = 1 / 255, +# do_normalize: bool = True, +# image_mean: Optional[Union[float, List[float]]] = None, +# image_std: Optional[Union[float, List[float]]] = None, +# do_convert_rgb: bool = True, +# do_chunk: bool = False, +# chunk_duration: int = 2, +# num_chunks: int = 5, +# num_frames_per_chunk: int = 2, +# fps: int = 30, +# **kwargs, +# ) -> None: +# super().__init__(**kwargs) +# size = size if size is not None else {"shortest_edge": 224} +# size = get_size_dict(size, default_to_square=False) +# crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224} +# crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size") + +# self.do_resize = do_resize +# self.size = size +# self.resample = resample +# self.do_center_crop = do_center_crop +# self.crop_size = crop_size +# self.do_rescale = do_rescale +# self.rescale_factor = rescale_factor +# self.do_normalize = do_normalize +# self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN +# self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD +# self.do_convert_rgb = do_convert_rgb +# self.do_chunk = do_chunk +# self.chunk_duration = chunk_duration +# self.num_chunks = num_chunks +# self.num_frames_per_chunk = num_frames_per_chunk +# self.fps = fps +# self._valid_processor_keys = [ +# "images", +# "do_resize", +# "size", +# "resample", +# "do_center_crop", +# "crop_size", +# "do_rescale", +# "rescale_factor", +# "do_normalize", +# "image_mean", +# "image_std", +# "do_convert_rgb", +# "do_chunk", +# "chunk_duration", +# "num_chunks", +# "fps", +# "return_tensors", +# "data_format", +# "input_data_format", +# ] + +# # for backwards compatibility of KOSMOS-2 +# if "use_square_size" in kwargs and kwargs["use_square_size"]: +# self.size = {"height": size["shortest_edge"], "width": size["shortest_edge"]} +# # Let's remove `use_square_size` (as it is removed from #27690), so the future Kosmos-2 image processors +# # won't have this attr. being saved. (otherwise, it will enter this if branch while there is no more +# # `shortest_edge` key. +# delattr(self, "use_square_size") + +# # Copied from models.clip.image_processing_clip.CLIPImageProcessor.resize +# def resize( +# self, +# image: np.ndarray, +# size: Dict[str, int], +# resample: PILImageResampling = PILImageResampling.BICUBIC, +# data_format: Optional[Union[str, ChannelDimension]] = None, +# input_data_format: Optional[Union[str, ChannelDimension]] = None, +# **kwargs, +# ) -> np.ndarray: +# """ +# Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge +# resized to keep the input aspect ratio. + +# Args: +# image (`np.ndarray`): +# Image to resize. +# size (`Dict[str, int]`): +# Size of the output image. 
+# resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): +# Resampling filter to use when resiizing the image. +# data_format (`str` or `ChannelDimension`, *optional*): +# The channel dimension format of the image. If not provided, it will be the same as the input image. +# input_data_format (`ChannelDimension` or `str`, *optional*): +# The channel dimension format of the input image. If not provided, it will be inferred. +# """ +# default_to_square = True +# if "shortest_edge" in size: +# size = size["shortest_edge"] +# default_to_square = False +# elif "height" in size and "width" in size: +# size = (size["height"], size["width"]) +# else: +# raise ValueError("Size must contain either 'shortest_edge' or 'height' and 'width'.") + +# output_size = get_resize_output_image_size( +# image, +# size=size, +# default_to_square=default_to_square, +# input_data_format=input_data_format, +# ) +# return resize( +# image, +# size=output_size, +# resample=resample, +# data_format=data_format, +# input_data_format=input_data_format, +# **kwargs, +# ) + +# #Adapted from https://github.com/facebookresearch/pytorchvideo/blob/1fadaef40dd393ca09680f55582399f4679fc9b7/pytorchvideo/transforms/functional.py#L92 +# def short_side_scale( +# self, +# x: torch.Tensor, +# size: int = 224, +# interpolation: str = "bilinear", +# backend: str = "pytorch", +# ) -> torch.Tensor: +# """ +# Determines the shorter spatial dim of the video (i.e. width or height) and scales +# it to the given size. To maintain aspect ratio, the longer side is then scaled +# accordingly. +# Args: +# x (torch.Tensor): A video tensor of shape (C, T, H, W) and type torch.float32. +# size (int): The size the shorter side is scaled to. +# interpolation (str): Algorithm used for upsampling, +# options: nearest' | 'linear' | 'bilinear' | 'bicubic' | 'trilinear' | 'area' +# backend (str): backend used to perform interpolation. Options includes +# `pytorch` as default, and `opencv`. Note that opencv and pytorch behave +# differently on linear interpolation on some versions. +# https://discuss.pytorch.org/t/pytorch-linear-interpolation-is-different-from-pil-opencv/71181 +# Returns: +# An x-like Tensor with scaled spatial dims. +# """ # noqa +# assert len(x.shape) == 4 +# assert x.dtype == torch.float32 +# _, _, h, w = x.shape +# if w < h: +# new_h = int(math.floor((float(h) / w) * size)) +# new_w = size +# else: +# new_h = size +# new_w = int(math.floor((float(w) / h) * size)) +# if backend == "pytorch": +# return torch.nn.functional.interpolate( +# x, size=(new_h, new_w), mode=interpolation, align_corners=False +# ) +# else: +# raise NotImplementedError(f"{backend} backend not supported.") + + +# def chunk( +# self, video: VideoInput, fps: int, chunk_duration: int, num_chunks: int, num_frames_per_chunk: int +# ) -> List[VideoInput]: +# """ +# Uniformly sample `num_chunks` chunks of duration `chunk_duration` from a video. + +# Args: +# video (`VideoInput`): +# Video to chunk. +# fps (`int`): +# Frame rate of the video +# chunk_duration (`int`): +# Duration of each chunk(clip duration). +# num_chunks (`int`): +# Number of chunks to sample(number of clips per video). +# num_frames_per_chunk (`int`): +# Number of frames to sample per chunk.######(WHY IS IT DEFINED WHEN chunk_duration can fulfill its purpose?)###### +# """ +# video_duration = video.duration # EncodedVideoDecord obj +# if video_duration < chunk_duration: +# logger.warning_once( +# "Chunk duration is greater than audio duration. 
Chunks will be repeated, consider adjusting either `chunk_duration` or `num_chunks`" +# "to avoid unnecessary memory/compute usage." +# ) + +# all_clips_timepoints = uniform_chunk_sampling(video_duration, chunk_duration, num_chunks) + +# all_clips = [] +# for clip_timepoints in all_clips_timepoints: +# # Read the clip, get frames +# video_clip = video.get_clip(clip_timepoints[0], clip_timepoints[1]) +# if video_clip is None: +# raise ValueError("No clip found") +# video_clip = uniform_temporal_subsample(video_clip, num_samples=chunk_duration) +# video_clip = video_clip / 255.0 # since this is float, need 0-1 +# all_clips.append(video_clip) + +# return all_clips + +# # Copied from models.clip.image_processing_clip.CLIPImageProcessor.preprocess with preprocess->_preprocess_image +# def _preprocess_image( +# self, +# images: ImageInput, +# is_video: bool = None, +# do_resize: bool = None, +# size: Dict[str, int] = None, +# resample: PILImageResampling = None, +# do_center_crop: bool = None, +# crop_size: int = None, +# do_rescale: bool = None, +# rescale_factor: float = None, +# do_normalize: bool = None, +# image_mean: Optional[Union[float, List[float]]] = None, +# image_std: Optional[Union[float, List[float]]] = None, +# do_convert_rgb: bool = None, +# data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, +# input_data_format: Optional[Union[str, ChannelDimension]] = None, +# ) -> np.ndarray: +# validate_preprocess_arguments( +# do_rescale=do_rescale, +# rescale_factor=rescale_factor, +# do_normalize=do_normalize, +# image_mean=image_mean, +# image_std=image_std, +# do_center_crop=do_center_crop, +# crop_size=crop_size, +# do_resize=do_resize, +# size=size, +# resample=resample, +# ) + +# if not is_video: +# if do_convert_rgb: +# images = [convert_to_rgb(image) for image in images] + +# # All transformations expect numpy arrays. +# if not is_video: +# images = [to_numpy_array(image) for image in images] +# if not is_video: +# if is_scaled_image(images[0]) and do_rescale: +# logger.warning_once( +# "It looks like you are trying to rescale already rescaled images. If the input" +# " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." +# ) +# if not is_video: +# if input_data_format is None: +# # We assume that all images have the same channel dimension format. 
+# input_data_format = infer_channel_dimension_format(images[0]) + +# if not is_video: +# if do_resize: +# images = [ +# self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) +# for image in images +# ] + +# if do_center_crop: +# images = [ +# self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) for image in images +# ] + +# if do_rescale: +# images = [ +# self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) +# for image in images +# ] + +# if do_normalize: +# images = [ +# self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) +# for image in images +# ] + +# images = [ +# to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images +# ] +# else: +# if do_resize: +# images = self.short_side_scale(images) +# if do_normalize: +# images = NormalizeVideo( +# mean=image_mean, +# std=image_std, +# )(images), + +# return images + +# # Ignore copy +# def preprocess( +# self, +# images: Optional[ImageInput] = None, +# videos: Optional[VideoInput] = None, +# do_resize: bool = None, +# size: Dict[str, int] = None, +# resample: PILImageResampling = None, +# do_center_crop: bool = None, +# crop_size: int = None, +# do_rescale: bool = None, +# rescale_factor: float = None, +# do_normalize: bool = None, +# image_mean: Optional[Union[float, List[float]]] = None, +# image_std: Optional[Union[float, List[float]]] = None, +# do_convert_rgb: bool = None, +# do_chunk: bool = None, +# chunk_duration: int = None, +# num_chunks: int = None, +# num_frames_per_chunk: int = None, +# fps: int = None, +# return_tensors: Optional[Union[str, TensorType]] = None, +# data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, +# input_data_format: Optional[Union[str, ChannelDimension]] = None, +# **kwargs, +# ) -> PIL.Image.Image: +# """ +# Preprocess an image or batch of images. + +# Args: +# images (`ImageInput`, *optional*): +# Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If +# passing in images with pixel values between 0 and 1, set `do_rescale=False`. Either `images` or +# `videos` must be provided. +# videos (`VideoInput`, *optional*): +# Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If +# passing in videos with pixel values between 0 and 1, set `do_rescale=False`. Either `images` or +# `videos` must be provided. +# do_resize (`bool`, *optional*, defaults to `self.do_resize`): +# Whether to resize the image. +# size (`Dict[str, int]`, *optional*, defaults to `self.size`): +# Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with +# the longest edge resized to keep the input aspect ratio. +# resample (`int`, *optional*, defaults to `self.resample`): +# Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only +# has an effect if `do_resize` is set to `True`. +# do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): +# Whether to center crop the image. +# crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`): +# Size of the center crop. Only has an effect if `do_center_crop` is set to `True`. +# do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): +# Whether to rescale the image. 
+# rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): +# Rescale factor to rescale the image by if `do_rescale` is set to `True`. +# do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): +# Whether to normalize the image. +# image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): +# Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. +# image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): +# Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to +# `True`. +# do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): +# Whether to convert the image to RGB. +# do_chunk (`bool`, *optional*, defaults to `self.do_chunk`): +# Whether to chunk the video into multiple clips. +# chunk_duration (`int`, *optional*, defaults to `self.chunk_duration`): +# Duration of each chunk in seconds(clip duration). +# num_chunks (`int`, *optional*, defaults to `self.num_chunks`): +# Number of chunks to sample(number of clips per video). +# num_frames_per_chunk (`int`, *optional*, defaults to `self.num_frames_per_chunk`): +# Number of frames to sample per chunk. +# fps (`int`, *optional*, defaults to `self.fps`): +# Frame rate of the video. It's assumed that all videos have the same frame rate. +# return_tensors (`str` or `TensorType`, *optional*): +# The type of tensors to return. Can be one of: +# - Unset: Return a list of `np.ndarray`. +# - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. +# - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. +# - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. +# - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. +# data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): +# The channel dimension format for the output image. Can be one of: +# - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. +# - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. +# - Unset: Use the channel dimension format of the input image. +# input_data_format (`ChannelDimension` or `str`, *optional*): +# The channel dimension format for the input image. If unset, the channel dimension format is inferred +# from the input image. Can be one of: +# - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. +# - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. +# - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. 
+# """ +# if images is None and videos is None: +# raise ValueError("Either `images` or `videos` must be provided.") + +# if images is not None and videos is not None: +# raise ValueError("Only one of `images` or `videos` can be provided.") + +# do_resize = do_resize if do_resize is not None else self.do_resize +# size = size if size is not None else self.size +# size = get_size_dict(size, param_name="size", default_to_square=False) +# resample = resample if resample is not None else self.resample +# do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop +# crop_size = crop_size if crop_size is not None else self.crop_size +# crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True) +# do_rescale = do_rescale if do_rescale is not None else self.do_rescale +# rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor +# do_normalize = do_normalize if do_normalize is not None else self.do_normalize +# image_mean = image_mean if image_mean is not None else self.image_mean +# image_std = image_std if image_std is not None else self.image_std +# do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb +# do_chunk = do_chunk if do_chunk is not None else self.do_chunk +# chunk_duration = chunk_duration if chunk_duration is not None else self.chunk_duration +# num_chunks = num_chunks if num_chunks is not None else self.num_chunks +# num_frames_per_chunk = num_frames_per_chunk if num_frames_per_chunk is not None else self.num_frames_per_chunk +# fps = fps if fps is not None else self.fps + +# if images is not None: +# is_video = False +# images = make_list_of_images(images) +# if videos is not None: +# is_video = True +# videos = make_batched_videos(videos) + +# validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys) + +# if (videos is not None and not valid_images(videos)) or (images is not None and not valid_images(images)): +# raise ValueError( +# "Invalid input type. Must be of type PIL.Image.Image, numpy.ndarray, " +# "torch.Tensor, tf.Tensor or jax.ndarray." 
+# ) + +# if images is not None: +# pixel_values = self._preprocess_image( +# images=images, +# is_video = is_video, +# do_resize=do_resize, +# size=size, +# resample=resample, +# do_center_crop=do_center_crop, +# crop_size=crop_size, +# do_rescale=do_rescale, +# rescale_factor=rescale_factor, +# do_normalize=do_normalize, +# image_mean=image_mean, +# image_std=image_std, +# do_convert_rgb=do_convert_rgb, +# data_format=data_format, +# input_data_format=input_data_format, +# ) +# else: +# pixel_values = [] + +# for video in videos: +# # if check_for_video_paths(videos): +# # is_video = True +# # video = encoded_video_from_path( +# # video, +# # ) +# if do_chunk: +# clips = self.chunk( +# video=video, +# fps=fps, +# chunk_duration=chunk_duration, +# num_chunks=num_chunks, +# num_frames_per_chunk=num_frames_per_chunk, +# ) + +# _pixel_values = [ +# self._preprocess_image( +# images=clip, +# is_video = is_video, +# do_resize=do_resize, +# size=size, +# resample=PILImageResampling.BILINEAR, +# do_center_crop=do_center_crop, +# crop_size=crop_size, +# do_rescale=do_rescale, +# rescale_factor=rescale_factor, +# do_normalize=do_normalize, +# image_mean=image_mean, +# image_std=image_std, +# do_convert_rgb=do_convert_rgb, +# data_format=data_format, +# input_data_format=input_data_format, +# ) +# for clip in clips +# ] +# else: +# _pixel_values = [ +# self._preprocess_image( +# images=video, +# is_video = is_video, +# do_resize=do_resize, +# size=size, +# resample=resample, +# do_center_crop=do_center_crop, +# crop_size=crop_size, +# do_rescale=do_rescale, +# rescale_factor=rescale_factor, +# do_normalize=do_normalize, +# image_mean=image_mean, +# image_std=image_std, +# do_convert_rgb=do_convert_rgb, +# data_format=data_format, +# input_data_format=input_data_format, +# ) +# ] + +# _pixel_values = SpatialCrop(224, num_crops=3)(_pixel_values) +# # Avoid List[List[List[np.ndarray]]] +# _pixel_values = torch.stack(_pixel_values, dim = 0) +# pixel_values.append(_pixel_values) +# # _pixel_values = np.stack(_pixel_values) +# # # Make it shape (num_chunks, num_channels, num_frames_per_chunk, height, width) +# # _pixel_values = np.swapaxes(_pixel_values, 1, 2) +# # pixel_values.append(_pixel_values) +# pixel_values = torch.stack(pixel_values, dim=0) +# return BatchFeature(data={"pixel_values": pixel_values}, tensor_type=return_tensors) + + + + + + + + + + + + + + + # Copyright 2024 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,18 +1041,9 @@ # limitations under the License. 
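For orientation, the reference implementation kept above (and the slimmer rewrite in the hunks below) reduces video handling to a small piece of clip-sampling arithmetic. The following is a minimal illustrative sketch of that arithmetic; the helper name `clip_timepoints` and the 10 s / 2 s / 5-clip / 30 fps numbers are assumptions chosen for the example rather than values taken from this patch.

import math
from fractions import Fraction

import numpy as np


def clip_timepoints(total_duration, chunk_duration, num_chunks):
    # Spread `num_chunks` clip start times evenly over [0, total_duration - chunk_duration],
    # mirroring what uniform_chunk_sampling computes with Fractions.
    chunk_duration = Fraction(chunk_duration)
    max_start = max(Fraction(total_duration) - chunk_duration, Fraction(0))
    step = max_start / max(num_chunks - 1, 1)
    return [(step * i, step * i + chunk_duration) for i in range(num_chunks)]


# A 10 s video cut into five 2 s clips starts at 0, 2, 4, 6 and 8 seconds.
timepoints = clip_timepoints(10, 2, 5)

# Each clip is then mapped to frame indices (here assuming 30 fps) and reduced to a fixed
# number of frames by nearest-neighbour selection, as uniform_temporal_subsample does per clip.
fps, num_frames_per_chunk = 30, 2
for start, end in timepoints:
    start_idx, end_idx = math.ceil(fps * start), math.ceil(fps * end)
    frame_idxs = np.linspace(start_idx, end_idx - 1, num_frames_per_chunk).astype(int)
    print((start_idx, end_idx), frame_idxs.tolist())  # (0, 60) [0, 59], (60, 120) [60, 119], ...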
"""Image processor class for ImageBind.""" -import decord -from fractions import Fraction -import io import math -import mimetypes -import pathlib -from pathlib import Path -import torch -import torch.nn as nn -from torchvision import transforms -from torchvision.transforms._transforms_video import NormalizeVideo -from typing import BinaryIO, Dict, List, Optional, Tuple, Union +from fractions import Fraction +from typing import Dict, List, Optional, Tuple, Union import numpy as np @@ -56,40 +1075,10 @@ logger = logging.get_logger(__name__) -decord.bridge.set_bridge("torch") if is_vision_available(): import PIL -def check_for_video_paths(videos) -> bool: - return (isinstance(videos, list) and all(isinstance(video, str) and mimetypes.guess_type(video)[0].startswith('video/') for video in videos)) - -#Adapted from https://github.com/facebookresearch/pytorchvideo/blob/1fadaef40dd393ca09680f55582399f4679fc9b7/pytorchvideo/data/encoded_video.py#L42 -def encoded_video_from_path(video_path): - """ - Fetches the given video path using PathManager (allowing remote uris to be - fetched) and constructs the EncodedVideo object. - - Args: - file_path (str): a PathManager file-path. - """ - video_path = Path(video_path) - if video_path.is_file(): - with video_path.open('rb') as file: - video_file = io.BytesIO(file.read()) - else: - raise FileNotFoundError(f"{video_path} does not exist or is not a file") - - sample_rate=16000 - video = EncodedVideoDecord( - file=video_file, - video_name=pathlib.Path(video_path).name, - decode_video=True, - decode_audio=False, - **{"sample_rate": sample_rate}, - ) - return video - # Copy from models.video_llava.image_processing_video_llava.make_batched_videos def make_batched_videos(videos) -> List[VideoInput]: @@ -110,53 +1099,29 @@ def make_batched_videos(videos) -> List[VideoInput]: # Copy from models.imagebind.feature_extraction_imagebind.uniform_chunk_sampling def uniform_chunk_sampling( - total_duration: float, chunk_duration: int, num_chunks: int + total_duration: float, chunk_duration: float, num_chunks: int ) -> List[Tuple[Fraction, Fraction]]: """ Uniformly sample `num_chunks` chunks of duration `chunk_duration` from an audio/video of total duration `total_duration`. Args: total_duration (float): Total duration of the audio/video. - chunk_duration (int): Duration of each chunk(clip duration). - num_chunks (int): Number of chunks to sample(number of clips per video). + chunk_duration (float): Duration of each chunk. + num_chunks (int): Number of chunks to sample. Returns: List[Tuple[float, float]]: List of tuples where each tuple contains the start and end time of a chunk. """ - _current_clip_index = 0 - _current_aug_index = 0 - _augs_per_clip: int = 1 - chunk_duration_fraction = Fraction(chunk_duration) - max_possible_clip_start = Fraction(max(total_duration - chunk_duration_fraction, 0)) # Previously chunk_duration was used instead of chunk_duration_fraction so that could be the reason for pixel values not matching + max_possible_clip_start = Fraction(max(total_duration - chunk_duration, 0)) uniform_clip = Fraction(max_possible_clip_start / max(num_chunks - 1, 1)) result = [] - is_last_clip = False - while not is_last_clip: - clip_start_sec = uniform_clip * _current_clip_index - _current_aug_index += 1 - if _current_aug_index >= _augs_per_clip: - _current_clip_index += 1 - _current_aug_index = 0 - - # Last clip is True if sampled self._clips_per_video or if end of video is reached. 
- is_last_clip = False - if ( - _current_clip_index >= num_chunks - or uniform_clip * _current_clip_index > max_possible_clip_start - ): - _current_clip_index = 0 - is_last_clip = True - - # reset - if is_last_clip: - _current_clip_index = 0 - _current_aug_index = 0 - + for clip_index in range(num_chunks): + clip_start_sec = uniform_clip * clip_index clip_end_sec = clip_start_sec + chunk_duration_fraction result.append((clip_start_sec, clip_end_sec)) - + return result @@ -173,283 +1138,14 @@ def uniform_temporal_subsample(video: VideoInput, num_samples: int) -> VideoInpu num_samples (`int`): Number of frames to sample. """ - # num_frames = len(video) - - # # Sample by nearest neighbor interpolation if num_samples > t. - # indices = np.linspace(0, num_frames - 1, num_samples) - # indices = np.clip(indices, 0, num_frames - 1).astype(int) - - # return [video[i] for i in indices] - - temporal_dim: int = -3 - num_frames = video.shape[temporal_dim] - assert num_samples > 0 and num_frames > 0 - # Sample by nearest neighbor interpolation if num_samples > num_frames. - indices = torch.linspace(0, num_frames - 1, num_samples) - indices = torch.clamp(indices, 0, num_frames - 1).long() - return torch.index_select(video, temporal_dim, indices) - -def crop_boxes(boxes, x_offset, y_offset): - """ - Perform crop on the bounding boxes given the offsets. - Args: - boxes (ndarray or None): bounding boxes to perform crop. The dimension - is `num boxes` x 4. - x_offset (int): cropping offset in the x axis. - y_offset (int): cropping offset in the y axis. - Returns: - cropped_boxes (ndarray or None): the cropped boxes with dimension of - `num boxes` x 4. - """ - cropped_boxes = boxes.copy() - cropped_boxes[:, [0, 2]] = boxes[:, [0, 2]] - x_offset - cropped_boxes[:, [1, 3]] = boxes[:, [1, 3]] - y_offset - - return cropped_boxes - -def uniform_crop(images, size, spatial_idx, boxes=None, scale_size=None): - """ - Perform uniform spatial sampling on the images and corresponding boxes. - Args: - images (tensor): images to perform uniform crop. The dimension is - `num frames` x `channel` x `height` x `width`. - size (int): size of height and weight to crop the images. - spatial_idx (int): 0, 1, or 2 for left, center, and right crop if width - is larger than height. Or 0, 1, or 2 for top, center, and bottom - crop if height is larger than width. - boxes (ndarray or None): optional. Corresponding boxes to images. - Dimension is `num boxes` x 4. - scale_size (int): optinal. If not None, resize the images to scale_size before - performing any crop. - Returns: - cropped (tensor): images with dimension of - `num frames` x `channel` x `size` x `size`. - cropped_boxes (ndarray or None): the cropped boxes with dimension of - `num boxes` x 4. 
- """ - assert spatial_idx in [0, 1, 2] - ndim = len(images.shape) - if ndim == 3: - images = images.unsqueeze(0) - height = images.shape[2] - width = images.shape[3] - - if scale_size is not None: - if width <= height: - width, height = scale_size, int(height / width * scale_size) - else: - width, height = int(width / height * scale_size), scale_size - images = torch.nn.functional.interpolate( - images, - size=(height, width), - mode="bilinear", - align_corners=False, - ) - - y_offset = int(math.ceil((height - size) / 2)) - x_offset = int(math.ceil((width - size) / 2)) - - if height > width: - if spatial_idx == 0: - y_offset = 0 - elif spatial_idx == 2: - y_offset = height - size - else: - if spatial_idx == 0: - x_offset = 0 - elif spatial_idx == 2: - x_offset = width - size - cropped = images[:, :, y_offset : y_offset + size, x_offset : x_offset + size] - cropped_boxes = crop_boxes(boxes, x_offset, y_offset) if boxes is not None else None - if ndim == 3: - cropped = cropped.squeeze(0) - return cropped, cropped_boxes - - -class SpatialCrop(nn.Module): - """ - Convert the video into 3 smaller clips spatially. Must be used after the - temporal crops to get spatial crops, and should be used with - -2 in the spatial crop at the slowfast augmentation stage (so full - frames are passed in here). Will return a larger list with the - 3x spatial crops as well. - """ - - def __init__(self, crop_size: int = 224, num_crops: int = 3): - super().__init__() - self.crop_size = crop_size - if num_crops == 3: - self.crops_to_ext = [0, 1, 2] - self.flipped_crops_to_ext = [] - elif num_crops == 1: - self.crops_to_ext = [1] - self.flipped_crops_to_ext = [] - else: - raise NotImplementedError("Nothing else supported yet") - - def forward(self, videos): - """ - Args: - videos: A list of C, T, H, W videos. - Returns: - videos: A list with 3x the number of elements. Each video converted - to C, T, H', W' by spatial cropping. - """ - assert isinstance(videos, list), "Must be a list of videos after temporal crops" - assert all([video[0].ndim == 4 for video in videos]), "Must be (C,T,H,W)" - res = [] - for video in videos: - for spatial_idx in self.crops_to_ext: - res.append(uniform_crop(video[0], self.crop_size, spatial_idx)[0]) - if not self.flipped_crops_to_ext: - continue - flipped_video = transforms.functional.hflip(video[0]) - for spatial_idx in self.flipped_crops_to_ext: - res.append(uniform_crop(flipped_video, self.crop_size, spatial_idx)[0]) - return res - -#Adapted from https://github.com/facebookresearch/pytorchvideo/blob/1fadaef40dd393ca09680f55582399f4679fc9b7/pytorchvideo/data/encoded_video_decord.py#L28 -class EncodedVideoDecord(): - """ + num_frames = len(video) - Accessing clips from an encoded video using Decord video reading API - as the decoding backend. For more details, please refer to - - `Decord ` - """ - - def __init__( - self, - file: BinaryIO, - video_name: Optional[str] = None, - decode_video: bool = True, - decode_audio: bool = False, - sample_rate: int = 44100, - mono: bool = True, - width: int = -1, - height: int = -1, - num_threads: int = 0, - fault_tol: int = -1, - ) -> None: - """ - Args: - file (BinaryIO): a file-like object (e.g. io.BytesIO or io.StringIO) that - contains the encoded video. - video_name (str): An optional name assigned to the video. - decode_video (bool): If disabled, video is not decoded. - decode_audio (bool): If disabled, audio is not decoded. - sample_rate: int, default is -1 - Desired output sample rate of the audio, unchanged if `-1` is specified. 
- mono: bool, default is True - Desired output channel layout of the audio. `True` is mono layout. `False` - is unchanged. - width : int, default is -1 - Desired output width of the video, unchanged if `-1` is specified. - height : int, default is -1 - Desired output height of the video, unchanged if `-1` is specified. - num_threads : int, default is 0 - Number of decoding thread, auto if `0` is specified. - fault_tol : int, default is -1 - The threshold of corrupted and recovered frames. This is to prevent silent fault - tolerance when for example 50% frames of a video cannot be decoded and duplicate - frames are returned. You may find the fault tolerant feature sweet in many - cases, but not for training models. Say `N = # recovered frames` - If `fault_tol` < 0, nothing will happen. - If 0 < `fault_tol` < 1.0, if N > `fault_tol * len(video)`, - raise `DECORDLimitReachedError`. - If 1 < `fault_tol`, if N > `fault_tol`, raise `DECORDLimitReachedError`. - """ - if not decode_video: - raise NotImplementedError() - - self._video_name = video_name - - try: - self._av_reader = decord.VideoReader( - uri=file, - ctx=decord.cpu(0), - width=width, - height=height, - num_threads=num_threads, - fault_tol=fault_tol, - ) - except Exception as e: - raise RuntimeError(f"Failed to open video {video_name} with Decord. {e}") - - self._fps = self._av_reader.get_avg_fps() - - self._duration = float(len(self._av_reader)) / float(self._fps) - - @property - def name(self) -> Optional[str]: - """ - Returns: - name: the name of the stored video if set. - """ - return self._video_name - - @property - def duration(self) -> float: - """ - Returns: - duration: the video's duration/end-time in seconds. - """ - return self._duration - - def close(self): - if self._av_reader is not None: - del self._av_reader - self._av_reader = None - - def get_clip( - self, start_sec: float, end_sec: float - ) -> Dict[str, Optional[torch.Tensor]]: - """ - Retrieves frames from the encoded video at the specified start and end times - in seconds (the video always starts at 0 seconds). - - Args: - start_sec (float): the clip start time in seconds - end_sec (float): the clip end time in seconds - Returns: - clip_data: - A dictionary mapping the entries at "video" and "audio" to a tensors. - - "video": A tensor of the clip's RGB frames with shape: - (channel, time, height, width). The frames are of type torch.float32 and - in the range [0 - 255]. - - "audio": A tensor of the clip's audio samples with shape: - (samples). The samples are of type torch.float32 and - in the range [0 - 255]. - - Returns None if no video or audio found within time range. - - """ - if start_sec > end_sec or start_sec > self._duration: - raise RuntimeError( - f"Incorrect time window for Decord decoding for video: {self._video_name}." - ) - - start_idx = math.ceil(self._fps * start_sec) - end_idx = math.ceil(self._fps * end_sec) - end_idx = min(end_idx, len(self._av_reader)) - frame_idxs = list(range(start_idx, end_idx)) - - try: - outputs = self._av_reader.get_batch(frame_idxs) - except Exception as e: - logger.debug(f"Failed to decode video with Decord: {self._video_name}. {e}") - raise e - - video = outputs - - if video is not None: - video = video.to(torch.float32) - #Permute tensor from (time, height, weight, channel) to (channel, height, width, time). - video = video.permute(3, 0, 1, 2) + # Sample by nearest neighbor interpolation if num_samples > t. 
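# Illustrative note on the nearest-neighbour sampling implemented by the two added lines
# just below: with num_frames = 30 and num_samples = 4, np.linspace(0, 29, 4) gives
# [0., 9.67, 19.33, 29.], which truncates to frame indices [0, 9, 19, 29]; when num_samples
# exceeds num_frames the indices simply repeat (num_frames = 2, num_samples = 4 yields
# [0, 0, 0, 1]).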
+ indices = np.linspace(0, num_frames - 1, num_samples) + indices = np.clip(indices, 0, num_frames - 1).astype(int) + return [video[i] for i in indices] - return video class ImageBindImageProcessor(BaseImageProcessor): r""" @@ -488,12 +1184,12 @@ class ImageBindImageProcessor(BaseImageProcessor): Can be overridden by the `image_std` parameter in the `preprocess` method. do_convert_rgb (`bool`, *optional*, defaults to `True`): Whether to convert the image to RGB. - do_chunk (`bool`, *optional*, defaults to `False`): + do_chunk (`bool`, *optional*, defaults to `True`): Whether to chunk the video into multiple clips. - chunk_duration (`int`, *optional*, defaults to 2): - Duration of each chunk in seconds(clip duration). + chunk_duration (`float`, *optional*, defaults to 2.0): + Duration of each chunk in seconds. num_chunks (`int`, *optional*, defaults to 5): - Number of chunks to sample(number of clips per video). + Number of chunks to sample. num_frames_per_chunk (`int`, *optional*, defaults to 2): Number of frames to sample per chunk. fps (`int`, *optional*, defaults to 30): @@ -515,8 +1211,8 @@ def __init__( image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, do_convert_rgb: bool = True, - do_chunk: bool = False, - chunk_duration: int = 2, + do_chunk: bool = True, + chunk_duration: float = 2.0, num_chunks: int = 5, num_frames_per_chunk: int = 2, fps: int = 30, @@ -624,49 +1320,8 @@ def resize( **kwargs, ) - #Adapted from https://github.com/facebookresearch/pytorchvideo/blob/1fadaef40dd393ca09680f55582399f4679fc9b7/pytorchvideo/transforms/functional.py#L92 - def short_side_scale( - self, - x: torch.Tensor, - size: int = 224, - interpolation: str = "bilinear", - backend: str = "pytorch", - ) -> torch.Tensor: - """ - Determines the shorter spatial dim of the video (i.e. width or height) and scales - it to the given size. To maintain aspect ratio, the longer side is then scaled - accordingly. - Args: - x (torch.Tensor): A video tensor of shape (C, T, H, W) and type torch.float32. - size (int): The size the shorter side is scaled to. - interpolation (str): Algorithm used for upsampling, - options: nearest' | 'linear' | 'bilinear' | 'bicubic' | 'trilinear' | 'area' - backend (str): backend used to perform interpolation. Options includes - `pytorch` as default, and `opencv`. Note that opencv and pytorch behave - differently on linear interpolation on some versions. - https://discuss.pytorch.org/t/pytorch-linear-interpolation-is-different-from-pil-opencv/71181 - Returns: - An x-like Tensor with scaled spatial dims. - """ # noqa - assert len(x.shape) == 4 - assert x.dtype == torch.float32 - _, _, h, w = x.shape - if w < h: - new_h = int(math.floor((float(h) / w) * size)) - new_w = size - else: - new_h = size - new_w = int(math.floor((float(w) / h) * size)) - if backend == "pytorch": - return torch.nn.functional.interpolate( - x, size=(new_h, new_w), mode=interpolation, align_corners=False - ) - else: - raise NotImplementedError(f"{backend} backend not supported.") - - def chunk( - self, video: VideoInput, fps: int, chunk_duration: int, num_chunks: int, num_frames_per_chunk: int + self, video: VideoInput, fps: int, chunk_duration: float, num_chunks: int, num_frames_per_chunk: int ) -> List[VideoInput]: """ Uniformly sample `num_chunks` chunks of duration `chunk_duration` from a video. @@ -676,14 +1331,14 @@ def chunk( Video to chunk. fps (`int`): Frame rate of the video - chunk_duration (`int`): - Duration of each chunk(clip duration). 
+ chunk_duration (`float`): + Duration of each chunk. num_chunks (`int`): - Number of chunks to sample(number of clips per video). + Number of chunks to sample. num_frames_per_chunk (`int`): - Number of frames to sample per chunk.######(WHY IS IT DEFINED WHEN chunk_duration can fulfill its purpose?)###### + Number of frames to sample per chunk. """ - video_duration = video.duration # EncodedVideoDecord obj + video_duration = len(video) / fps if video_duration < chunk_duration: logger.warning_once( "Chunk duration is greater than audio duration. Chunks will be repeated, consider adjusting either `chunk_duration` or `num_chunks`" @@ -694,12 +1349,8 @@ def chunk( all_clips = [] for clip_timepoints in all_clips_timepoints: - # Read the clip, get frames - video_clip = video.get_clip(clip_timepoints[0], clip_timepoints[1]) - if video_clip is None: - raise ValueError("No clip found") - video_clip = uniform_temporal_subsample(video_clip, num_samples=chunk_duration) - video_clip = video_clip / 255.0 # since this is float, need 0-1 + video_clip = video[math.ceil(clip_timepoints[0] * fps) : math.ceil(clip_timepoints[1] * fps)] + video_clip = uniform_temporal_subsample(video_clip, num_samples=num_frames_per_chunk) all_clips.append(video_clip) return all_clips @@ -708,7 +1359,6 @@ def chunk( def _preprocess_image( self, images: ImageInput, - is_video: bool = None, do_resize: bool = None, size: Dict[str, int] = None, resample: PILImageResampling = None, @@ -736,59 +1386,48 @@ def _preprocess_image( resample=resample, ) - if not is_video: - if do_convert_rgb: - images = [convert_to_rgb(image) for image in images] + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] # All transformations expect numpy arrays. - if not is_video: - images = [to_numpy_array(image) for image in images] - if not is_video: - if is_scaled_image(images[0]) and do_rescale: - logger.warning_once( - "It looks like you are trying to rescale already rescaled images. If the input" - " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." - ) - if not is_video: - if input_data_format is None: - # We assume that all images have the same channel dimension format. - input_data_format = infer_channel_dimension_format(images[0]) - - if not is_video: - if do_resize: - images = [ - self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) - for image in images - ] - - if do_center_crop: - images = [ - self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) for image in images - ] - - if do_rescale: - images = [ - self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) - for image in images - ] - - if do_normalize: - images = [ - self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) - for image in images - ] + images = [to_numpy_array(image) for image in images] + + if is_scaled_image(images[0]) and do_rescale: + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
+ input_data_format = infer_channel_dimension_format(images[0]) + + if do_resize: images = [ - to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images + self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + for image in images ] - else: - if do_resize: - images = self.short_side_scale(images) - if do_normalize: - images = NormalizeVideo( - mean=image_mean, - std=image_std, - )(images), + + if do_center_crop: + images = [ + self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) for image in images + ] + + if do_rescale: + images = [ + self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + for image in images + ] + + if do_normalize: + images = [ + self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) + for image in images + ] + + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images + ] return images @@ -809,7 +1448,7 @@ def preprocess( image_std: Optional[Union[float, List[float]]] = None, do_convert_rgb: bool = None, do_chunk: bool = None, - chunk_duration: int = None, + chunk_duration: float = None, num_chunks: int = None, num_frames_per_chunk: int = None, fps: int = None, @@ -857,10 +1496,10 @@ def preprocess( Whether to convert the image to RGB. do_chunk (`bool`, *optional*, defaults to `self.do_chunk`): Whether to chunk the video into multiple clips. - chunk_duration (`int`, *optional*, defaults to `self.chunk_duration`): - Duration of each chunk in seconds(clip duration). + chunk_duration (`float`, *optional*, defaults to `self.chunk_duration`): + Duration of each chunk in seconds. num_chunks (`int`, *optional*, defaults to `self.num_chunks`): - Number of chunks to sample(number of clips per video). + Number of chunks to sample. num_frames_per_chunk (`int`, *optional*, defaults to `self.num_frames_per_chunk`): Number of frames to sample per chunk. fps (`int`, *optional*, defaults to `self.fps`): @@ -910,25 +1549,21 @@ def preprocess( fps = fps if fps is not None else self.fps if images is not None: - is_video = False images = make_list_of_images(images) - if videos is not None and (not check_for_video_paths(videos)): - is_video = True + if videos is not None: videos = make_batched_videos(videos) validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys) - if not check_for_video_paths(videos): - if (videos is not None and not valid_images(videos)) or (images is not None and not valid_images(images)): - raise ValueError( - "Invalid input type. Must be of type PIL.Image.Image, numpy.ndarray, " - "torch.Tensor, tf.Tensor or jax.ndarray." - ) + if (videos is not None and not valid_images(videos)) or (images is not None and not valid_images(images)): + raise ValueError( + "Invalid input type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." 
+ ) if images is not None: pixel_values = self._preprocess_image( images=images, - is_video = is_video, do_resize=do_resize, size=size, resample=resample, @@ -945,13 +1580,7 @@ def preprocess( ) else: pixel_values = [] - for video in videos: - if check_for_video_paths(videos): - is_video = True - video = encoded_video_from_path( - video, - ) if do_chunk: clips = self.chunk( video=video, @@ -964,7 +1593,6 @@ def preprocess( _pixel_values = [ self._preprocess_image( images=clip, - is_video = is_video, do_resize=do_resize, size=size, resample=PILImageResampling.BILINEAR, @@ -985,7 +1613,6 @@ def preprocess( _pixel_values = [ self._preprocess_image( images=video, - is_video = is_video, do_resize=do_resize, size=size, resample=resample, @@ -1002,16 +1629,11 @@ def preprocess( ) ] - _pixel_values = SpatialCrop(224, num_crops=3)(_pixel_values) # Avoid List[List[List[np.ndarray]]] - _pixel_values = torch.stack(_pixel_values, dim = 0) + _pixel_values = np.stack(_pixel_values) + # Make it shape (num_chunks, num_channels, num_frames_per_chunk, height, width) + _pixel_values = np.swapaxes(_pixel_values, 1, 2) pixel_values.append(_pixel_values) - # _pixel_values = np.stack(_pixel_values) - # # Make it shape (num_chunks, num_channels, num_frames_per_chunk, height, width) - # _pixel_values = np.swapaxes(_pixel_values, 1, 2) - # pixel_values.append(_pixel_values) - pixel_values = torch.stack(pixel_values, dim=0) - return BatchFeature(data={"pixel_values": pixel_values}, tensor_type=return_tensors) - + return BatchFeature(data={"pixel_values": pixel_values}, tensor_type=return_tensors) From 558f5447c933bb4be3ce0875993b2e14c2c8fd9c Mon Sep 17 00:00:00 2001 From: RUFFY-369 Date: Wed, 7 Aug 2024 22:35:24 +0530 Subject: [PATCH 08/11] chore:make transformers compliant and few nits --- .../imagebind/image_processing_imagebind.py | 1325 +++-------------- 1 file changed, 237 insertions(+), 1088 deletions(-) diff --git a/src/transformers/models/imagebind/image_processing_imagebind.py b/src/transformers/models/imagebind/image_processing_imagebind.py index 4b5b4bae053b..005b20f4e943 100644 --- a/src/transformers/models/imagebind/image_processing_imagebind.py +++ b/src/transformers/models/imagebind/image_processing_imagebind.py @@ -1,1031 +1,3 @@ -# # Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# # -# # Licensed under the Apache License, Version 2.0 (the "License"); -# # you may not use this file except in compliance with the License. -# # You may obtain a copy of the License at -# # -# # http://www.apache.org/licenses/LICENSE-2.0 -# # -# # Unless required by applicable law or agreed to in writing, software -# # distributed under the License is distributed on an "AS IS" BASIS, -# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# # See the License for the specific language governing permissions and -# # limitations under the License. 
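For reviewer orientation, here is a hypothetical end-to-end sketch of how the processor defined above might be exercised once this lands; the module path and the printed shape follow from the defaults visible in the diff (do_chunk=True, chunk_duration=2.0, num_chunks=5, num_frames_per_chunk=2, fps=30) and are assumptions, not behaviour the patch itself asserts.

import numpy as np

# Assumed module path for the new model; adjust if the final layout differs.
from transformers.models.imagebind.image_processing_imagebind import ImageBindImageProcessor

processor = ImageBindImageProcessor()
# A fake 10 s clip: 300 RGB frames of 360x640 at the assumed default rate of 30 fps.
video = np.random.randint(0, 256, size=(300, 360, 640, 3), dtype=np.uint8)
inputs = processor.preprocess(videos=video, return_tensors="np")
# Expected layout: (batch, num_chunks, channels, num_frames_per_chunk, height, width),
# i.e. (1, 5, 3, 2, 224, 224) with the defaults above.
print(inputs["pixel_values"].shape)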
-# """Image processor class for ImageBind.""" - -# import decord -# from fractions import Fraction -# import io -# import math -# import mimetypes -# import pathlib -# from pathlib import Path -# import torch -# import torch.nn as nn -# from torchvision import transforms -# from torchvision.transforms._transforms_video import NormalizeVideo -# from typing import BinaryIO, Dict, List, Optional, Tuple, Union - -# import numpy as np - -# from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict -# from ...image_transforms import ( -# convert_to_rgb, -# get_resize_output_image_size, -# resize, -# to_channel_dimension_format, -# ) -# from ...image_utils import ( -# OPENAI_CLIP_MEAN, -# OPENAI_CLIP_STD, -# ChannelDimension, -# ImageInput, -# PILImageResampling, -# VideoInput, -# infer_channel_dimension_format, -# is_scaled_image, -# is_valid_image, -# make_list_of_images, -# to_numpy_array, -# valid_images, -# validate_kwargs, -# validate_preprocess_arguments, -# ) -# from ...utils import TensorType, is_vision_available, logging - - -# logger = logging.get_logger(__name__) - -# decord.bridge.set_bridge("torch") - -# if is_vision_available(): -# import PIL - -# # def check_for_video_paths(videos) -> bool: -# # return (isinstance(videos, list) and all(isinstance(video, str) and mimetypes.guess_type(video)[0].startswith('video/') for video in videos)) - -# #Adapted from https://github.com/facebookresearch/pytorchvideo/blob/1fadaef40dd393ca09680f55582399f4679fc9b7/pytorchvideo/data/encoded_video.py#L42 -# def encoded_video_from_path(video_path): -# """ -# Fetches the given video path using PathManager (allowing remote uris to be -# fetched) and constructs the EncodedVideo object. - -# Args: -# file_path (str): a PathManager file-path. -# """ -# video_path = Path(video_path) -# if video_path.is_file(): -# with video_path.open('rb') as file: -# video_file = io.BytesIO(file.read()) -# else: -# raise FileNotFoundError(f"{video_path} does not exist or is not a file") - -# sample_rate=16000 -# video = EncodedVideoDecord( -# file=video_file, -# video_name=pathlib.Path(video_path).name, -# decode_video=True, -# decode_audio=False, -# **{"sample_rate": sample_rate}, -# ) -# return video - - -# # Copy from models.video_llava.image_processing_video_llava.make_batched_videos -# def make_batched_videos(videos) -> List[VideoInput]: -# if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): -# return videos - -# elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): -# if isinstance(videos[0], PIL.Image.Image): -# return [videos] -# elif len(videos[0].shape) == 4: -# return [list(video) for video in videos] - -# elif is_valid_image(videos) and len(videos.shape) == 4: -# return [list(videos)] - -# raise ValueError(f"Could not make batched video from {videos}") - - -# # Copy from models.imagebind.feature_extraction_imagebind.uniform_chunk_sampling -# def uniform_chunk_sampling( -# total_duration: float, chunk_duration: int, num_chunks: int -# ) -> List[Tuple[Fraction, Fraction]]: -# """ -# Uniformly sample `num_chunks` chunks of duration `chunk_duration` from an audio/video of total duration `total_duration`. - -# Args: -# total_duration (float): Total duration of the audio/video. -# chunk_duration (int): Duration of each chunk(clip duration). -# num_chunks (int): Number of chunks to sample(number of clips per video). 
- -# Returns: -# List[Tuple[float, float]]: List of tuples where each tuple contains the start and end time of a chunk. -# """ -# _current_clip_index = 0 -# _current_aug_index = 0 -# _augs_per_clip: int = 1 - -# chunk_duration_fraction = Fraction(chunk_duration) -# max_possible_clip_start = Fraction(max(total_duration - chunk_duration_fraction, 0)) # Previously chunk_duration was used instead of chunk_duration_fraction so that could be the reason for pixel values not matching -# uniform_clip = Fraction(max_possible_clip_start / max(num_chunks - 1, 1)) - -# result = [] -# is_last_clip = False -# while not is_last_clip: -# clip_start_sec = uniform_clip * _current_clip_index -# _current_aug_index += 1 -# if _current_aug_index >= _augs_per_clip: -# _current_clip_index += 1 -# _current_aug_index = 0 - -# # Last clip is True if sampled self._clips_per_video or if end of video is reached. -# is_last_clip = False -# if ( -# _current_clip_index >= num_chunks -# or uniform_clip * _current_clip_index > max_possible_clip_start -# ): -# _current_clip_index = 0 -# is_last_clip = True - -# # reset -# if is_last_clip: -# _current_clip_index = 0 -# _current_aug_index = 0 - -# clip_end_sec = clip_start_sec + chunk_duration_fraction -# result.append((clip_start_sec, clip_end_sec)) - -# return result - - -# # Adapted from https://github.com/facebookresearch/pytorchvideo/blob/a0a131e/pytorchvideo/transforms/functional.py#L19 -# def uniform_temporal_subsample(video: VideoInput, num_samples: int) -> VideoInput: -# """ -# Uniformly subsamples num_samples indices from the temporal dimension of the video. -# When num_samples is larger than the size of temporal dimension of the video, it -# will sample frames based on nearest neighbor interpolation. - -# Args: -# video (`VideoInput`): -# Video to subsample. -# num_samples (`int`): -# Number of frames to sample. -# """ -# # num_frames = len(video) - -# # # Sample by nearest neighbor interpolation if num_samples > t. -# # indices = np.linspace(0, num_frames - 1, num_samples) -# # indices = np.clip(indices, 0, num_frames - 1).astype(int) - -# # return [video[i] for i in indices] - -# temporal_dim: int = -3 -# num_frames = video.shape[temporal_dim] -# assert num_samples > 0 and num_frames > 0 -# # Sample by nearest neighbor interpolation if num_samples > num_frames. -# indices = torch.linspace(0, num_frames - 1, num_samples) -# indices = torch.clamp(indices, 0, num_frames - 1).long() -# return torch.index_select(video, temporal_dim, indices) - -# def crop_boxes(boxes, x_offset, y_offset): -# """ -# Perform crop on the bounding boxes given the offsets. -# Args: -# boxes (ndarray or None): bounding boxes to perform crop. The dimension -# is `num boxes` x 4. -# x_offset (int): cropping offset in the x axis. -# y_offset (int): cropping offset in the y axis. -# Returns: -# cropped_boxes (ndarray or None): the cropped boxes with dimension of -# `num boxes` x 4. -# """ -# cropped_boxes = boxes.copy() -# cropped_boxes[:, [0, 2]] = boxes[:, [0, 2]] - x_offset -# cropped_boxes[:, [1, 3]] = boxes[:, [1, 3]] - y_offset - -# return cropped_boxes - -# def uniform_crop(images, size, spatial_idx, boxes=None, scale_size=None): -# """ -# Perform uniform spatial sampling on the images and corresponding boxes. -# Args: -# images (tensor): images to perform uniform crop. The dimension is -# `num frames` x `channel` x `height` x `width`. -# size (int): size of height and weight to crop the images. 
-# spatial_idx (int): 0, 1, or 2 for left, center, and right crop if width -# is larger than height. Or 0, 1, or 2 for top, center, and bottom -# crop if height is larger than width. -# boxes (ndarray or None): optional. Corresponding boxes to images. -# Dimension is `num boxes` x 4. -# scale_size (int): optinal. If not None, resize the images to scale_size before -# performing any crop. -# Returns: -# cropped (tensor): images with dimension of -# `num frames` x `channel` x `size` x `size`. -# cropped_boxes (ndarray or None): the cropped boxes with dimension of -# `num boxes` x 4. -# """ -# assert spatial_idx in [0, 1, 2] -# ndim = len(images.shape) -# if ndim == 3: -# images = images.unsqueeze(0) -# height = images.shape[2] -# width = images.shape[3] - -# if scale_size is not None: -# if width <= height: -# width, height = scale_size, int(height / width * scale_size) -# else: -# width, height = int(width / height * scale_size), scale_size -# images = torch.nn.functional.interpolate( -# images, -# size=(height, width), -# mode="bilinear", -# align_corners=False, -# ) - -# y_offset = int(math.ceil((height - size) / 2)) -# x_offset = int(math.ceil((width - size) / 2)) - -# if height > width: -# if spatial_idx == 0: -# y_offset = 0 -# elif spatial_idx == 2: -# y_offset = height - size -# else: -# if spatial_idx == 0: -# x_offset = 0 -# elif spatial_idx == 2: -# x_offset = width - size -# cropped = images[:, :, y_offset : y_offset + size, x_offset : x_offset + size] -# cropped_boxes = crop_boxes(boxes, x_offset, y_offset) if boxes is not None else None -# if ndim == 3: -# cropped = cropped.squeeze(0) -# return cropped, cropped_boxes - - -# class SpatialCrop(nn.Module): -# """ -# Convert the video into 3 smaller clips spatially. Must be used after the -# temporal crops to get spatial crops, and should be used with -# -2 in the spatial crop at the slowfast augmentation stage (so full -# frames are passed in here). Will return a larger list with the -# 3x spatial crops as well. -# """ - -# def __init__(self, crop_size: int = 224, num_crops: int = 3): -# super().__init__() -# self.crop_size = crop_size -# if num_crops == 3: -# self.crops_to_ext = [0, 1, 2] -# self.flipped_crops_to_ext = [] -# elif num_crops == 1: -# self.crops_to_ext = [1] -# self.flipped_crops_to_ext = [] -# else: -# raise NotImplementedError("Nothing else supported yet") - -# def forward(self, videos): -# """ -# Args: -# videos: A list of C, T, H, W videos. -# Returns: -# videos: A list with 3x the number of elements. Each video converted -# to C, T, H', W' by spatial cropping. -# """ -# assert isinstance(videos, list), "Must be a list of videos after temporal crops" -# assert all([video[0].ndim == 4 for video in videos]), "Must be (C,T,H,W)" -# res = [] -# for video in videos: -# for spatial_idx in self.crops_to_ext: -# res.append(uniform_crop(video[0], self.crop_size, spatial_idx)[0]) -# if not self.flipped_crops_to_ext: -# continue -# flipped_video = transforms.functional.hflip(video[0]) -# for spatial_idx in self.flipped_crops_to_ext: -# res.append(uniform_crop(flipped_video, self.crop_size, spatial_idx)[0]) -# return res - -# #Adapted from https://github.com/facebookresearch/pytorchvideo/blob/1fadaef40dd393ca09680f55582399f4679fc9b7/pytorchvideo/data/encoded_video_decord.py#L28 -# class EncodedVideoDecord(): -# """ - -# Accessing clips from an encoded video using Decord video reading API -# as the decoding backend. 
For more details, please refer to - -# `Decord ` -# """ - -# def __init__( -# self, -# file: BinaryIO, -# video_name: Optional[str] = None, -# decode_video: bool = True, -# decode_audio: bool = False, -# sample_rate: int = 44100, -# mono: bool = True, -# width: int = -1, -# height: int = -1, -# num_threads: int = 0, -# fault_tol: int = -1, -# ) -> None: -# """ -# Args: -# file (BinaryIO): a file-like object (e.g. io.BytesIO or io.StringIO) that -# contains the encoded video. -# video_name (str): An optional name assigned to the video. -# decode_video (bool): If disabled, video is not decoded. -# decode_audio (bool): If disabled, audio is not decoded. -# sample_rate: int, default is -1 -# Desired output sample rate of the audio, unchanged if `-1` is specified. -# mono: bool, default is True -# Desired output channel layout of the audio. `True` is mono layout. `False` -# is unchanged. -# width : int, default is -1 -# Desired output width of the video, unchanged if `-1` is specified. -# height : int, default is -1 -# Desired output height of the video, unchanged if `-1` is specified. -# num_threads : int, default is 0 -# Number of decoding thread, auto if `0` is specified. -# fault_tol : int, default is -1 -# The threshold of corrupted and recovered frames. This is to prevent silent fault -# tolerance when for example 50% frames of a video cannot be decoded and duplicate -# frames are returned. You may find the fault tolerant feature sweet in many -# cases, but not for training models. Say `N = # recovered frames` -# If `fault_tol` < 0, nothing will happen. -# If 0 < `fault_tol` < 1.0, if N > `fault_tol * len(video)`, -# raise `DECORDLimitReachedError`. -# If 1 < `fault_tol`, if N > `fault_tol`, raise `DECORDLimitReachedError`. -# """ -# if not decode_video: -# raise NotImplementedError() - -# self._video_name = video_name - -# try: -# self._av_reader = decord.VideoReader( -# uri=file, -# ctx=decord.cpu(0), -# width=width, -# height=height, -# num_threads=num_threads, -# fault_tol=fault_tol, -# ) -# except Exception as e: -# raise RuntimeError(f"Failed to open video {video_name} with Decord. {e}") - -# self._fps = self._av_reader.get_avg_fps() - -# self._duration = float(len(self._av_reader)) / float(self._fps) - -# @property -# def name(self) -> Optional[str]: -# """ -# Returns: -# name: the name of the stored video if set. -# """ -# return self._video_name - -# @property -# def duration(self) -> float: -# """ -# Returns: -# duration: the video's duration/end-time in seconds. -# """ -# return self._duration - -# def close(self): -# if self._av_reader is not None: -# del self._av_reader -# self._av_reader = None - -# def get_clip( -# self, start_sec: float, end_sec: float -# ) -> Dict[str, Optional[torch.Tensor]]: -# """ -# Retrieves frames from the encoded video at the specified start and end times -# in seconds (the video always starts at 0 seconds). - -# Args: -# start_sec (float): the clip start time in seconds -# end_sec (float): the clip end time in seconds -# Returns: -# clip_data: -# A dictionary mapping the entries at "video" and "audio" to a tensors. - -# "video": A tensor of the clip's RGB frames with shape: -# (channel, time, height, width). The frames are of type torch.float32 and -# in the range [0 - 255]. - -# "audio": A tensor of the clip's audio samples with shape: -# (samples). The samples are of type torch.float32 and -# in the range [0 - 255]. - -# Returns None if no video or audio found within time range. 
- -# """ -# if start_sec > end_sec or start_sec > self._duration: -# raise RuntimeError( -# f"Incorrect time window for Decord decoding for video: {self._video_name}." -# ) - -# start_idx = math.ceil(self._fps * start_sec) -# end_idx = math.ceil(self._fps * end_sec) -# end_idx = min(end_idx, len(self._av_reader)) -# frame_idxs = list(range(start_idx, end_idx)) - -# try: -# outputs = self._av_reader.get_batch(frame_idxs) -# except Exception as e: -# logger.debug(f"Failed to decode video with Decord: {self._video_name}. {e}") -# raise e - -# video = outputs - -# if video is not None: -# video = video.to(torch.float32) -# #Permute tensor from (time, height, weight, channel) to (channel, height, width, time). -# video = video.permute(3, 0, 1, 2) - - -# return video - -# class ImageBindImageProcessor(BaseImageProcessor): -# r""" -# Constructs an ImageBind image processor. - -# Args: -# do_resize (`bool`, *optional*, defaults to `True`): -# Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by -# `do_resize` in the `preprocess` method. -# size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`): -# Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with -# the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess` -# method. -# resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): -# Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method. -# do_center_crop (`bool`, *optional*, defaults to `True`): -# Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the -# `preprocess` method. -# crop_size (`Dict[str, int]` *optional*, defaults to 224): -# Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess` -# method. -# do_rescale (`bool`, *optional*, defaults to `True`): -# Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in -# the `preprocess` method. -# rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): -# Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess` -# method. -# do_normalize (`bool`, *optional*, defaults to `True`): -# Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method. -# image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`): -# Mean to use if normalizing the image. This is a float or list of floats the length of the number of -# channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. -# image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`): -# Standard deviation to use if normalizing the image. This is a float or list of floats the length of the -# number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. -# Can be overridden by the `image_std` parameter in the `preprocess` method. -# do_convert_rgb (`bool`, *optional*, defaults to `True`): -# Whether to convert the image to RGB. -# do_chunk (`bool`, *optional*, defaults to `False`): -# Whether to chunk the video into multiple clips. -# chunk_duration (`int`, *optional*, defaults to 2): -# Duration of each chunk in seconds(clip duration). 
-# num_chunks (`int`, *optional*, defaults to 5): -# Number of chunks to sample(number of clips per video). -# num_frames_per_chunk (`int`, *optional*, defaults to 2): -# Number of frames to sample per chunk. -# fps (`int`, *optional*, defaults to 30): -# Frame rate of the video. It's assumed that all videos have the same frame rate. -# """ - -# model_input_names = ["pixel_values"] - -# def __init__( -# self, -# do_resize: bool = True, -# size: Dict[str, int] = None, -# resample: PILImageResampling = PILImageResampling.BICUBIC, -# do_center_crop: bool = True, -# crop_size: Dict[str, int] = None, -# do_rescale: bool = True, -# rescale_factor: Union[int, float] = 1 / 255, -# do_normalize: bool = True, -# image_mean: Optional[Union[float, List[float]]] = None, -# image_std: Optional[Union[float, List[float]]] = None, -# do_convert_rgb: bool = True, -# do_chunk: bool = False, -# chunk_duration: int = 2, -# num_chunks: int = 5, -# num_frames_per_chunk: int = 2, -# fps: int = 30, -# **kwargs, -# ) -> None: -# super().__init__(**kwargs) -# size = size if size is not None else {"shortest_edge": 224} -# size = get_size_dict(size, default_to_square=False) -# crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224} -# crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size") - -# self.do_resize = do_resize -# self.size = size -# self.resample = resample -# self.do_center_crop = do_center_crop -# self.crop_size = crop_size -# self.do_rescale = do_rescale -# self.rescale_factor = rescale_factor -# self.do_normalize = do_normalize -# self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN -# self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD -# self.do_convert_rgb = do_convert_rgb -# self.do_chunk = do_chunk -# self.chunk_duration = chunk_duration -# self.num_chunks = num_chunks -# self.num_frames_per_chunk = num_frames_per_chunk -# self.fps = fps -# self._valid_processor_keys = [ -# "images", -# "do_resize", -# "size", -# "resample", -# "do_center_crop", -# "crop_size", -# "do_rescale", -# "rescale_factor", -# "do_normalize", -# "image_mean", -# "image_std", -# "do_convert_rgb", -# "do_chunk", -# "chunk_duration", -# "num_chunks", -# "fps", -# "return_tensors", -# "data_format", -# "input_data_format", -# ] - -# # for backwards compatibility of KOSMOS-2 -# if "use_square_size" in kwargs and kwargs["use_square_size"]: -# self.size = {"height": size["shortest_edge"], "width": size["shortest_edge"]} -# # Let's remove `use_square_size` (as it is removed from #27690), so the future Kosmos-2 image processors -# # won't have this attr. being saved. (otherwise, it will enter this if branch while there is no more -# # `shortest_edge` key. -# delattr(self, "use_square_size") - -# # Copied from models.clip.image_processing_clip.CLIPImageProcessor.resize -# def resize( -# self, -# image: np.ndarray, -# size: Dict[str, int], -# resample: PILImageResampling = PILImageResampling.BICUBIC, -# data_format: Optional[Union[str, ChannelDimension]] = None, -# input_data_format: Optional[Union[str, ChannelDimension]] = None, -# **kwargs, -# ) -> np.ndarray: -# """ -# Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge -# resized to keep the input aspect ratio. - -# Args: -# image (`np.ndarray`): -# Image to resize. -# size (`Dict[str, int]`): -# Size of the output image. 
-# resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): -# Resampling filter to use when resiizing the image. -# data_format (`str` or `ChannelDimension`, *optional*): -# The channel dimension format of the image. If not provided, it will be the same as the input image. -# input_data_format (`ChannelDimension` or `str`, *optional*): -# The channel dimension format of the input image. If not provided, it will be inferred. -# """ -# default_to_square = True -# if "shortest_edge" in size: -# size = size["shortest_edge"] -# default_to_square = False -# elif "height" in size and "width" in size: -# size = (size["height"], size["width"]) -# else: -# raise ValueError("Size must contain either 'shortest_edge' or 'height' and 'width'.") - -# output_size = get_resize_output_image_size( -# image, -# size=size, -# default_to_square=default_to_square, -# input_data_format=input_data_format, -# ) -# return resize( -# image, -# size=output_size, -# resample=resample, -# data_format=data_format, -# input_data_format=input_data_format, -# **kwargs, -# ) - -# #Adapted from https://github.com/facebookresearch/pytorchvideo/blob/1fadaef40dd393ca09680f55582399f4679fc9b7/pytorchvideo/transforms/functional.py#L92 -# def short_side_scale( -# self, -# x: torch.Tensor, -# size: int = 224, -# interpolation: str = "bilinear", -# backend: str = "pytorch", -# ) -> torch.Tensor: -# """ -# Determines the shorter spatial dim of the video (i.e. width or height) and scales -# it to the given size. To maintain aspect ratio, the longer side is then scaled -# accordingly. -# Args: -# x (torch.Tensor): A video tensor of shape (C, T, H, W) and type torch.float32. -# size (int): The size the shorter side is scaled to. -# interpolation (str): Algorithm used for upsampling, -# options: nearest' | 'linear' | 'bilinear' | 'bicubic' | 'trilinear' | 'area' -# backend (str): backend used to perform interpolation. Options includes -# `pytorch` as default, and `opencv`. Note that opencv and pytorch behave -# differently on linear interpolation on some versions. -# https://discuss.pytorch.org/t/pytorch-linear-interpolation-is-different-from-pil-opencv/71181 -# Returns: -# An x-like Tensor with scaled spatial dims. -# """ # noqa -# assert len(x.shape) == 4 -# assert x.dtype == torch.float32 -# _, _, h, w = x.shape -# if w < h: -# new_h = int(math.floor((float(h) / w) * size)) -# new_w = size -# else: -# new_h = size -# new_w = int(math.floor((float(w) / h) * size)) -# if backend == "pytorch": -# return torch.nn.functional.interpolate( -# x, size=(new_h, new_w), mode=interpolation, align_corners=False -# ) -# else: -# raise NotImplementedError(f"{backend} backend not supported.") - - -# def chunk( -# self, video: VideoInput, fps: int, chunk_duration: int, num_chunks: int, num_frames_per_chunk: int -# ) -> List[VideoInput]: -# """ -# Uniformly sample `num_chunks` chunks of duration `chunk_duration` from a video. - -# Args: -# video (`VideoInput`): -# Video to chunk. -# fps (`int`): -# Frame rate of the video -# chunk_duration (`int`): -# Duration of each chunk(clip duration). -# num_chunks (`int`): -# Number of chunks to sample(number of clips per video). -# num_frames_per_chunk (`int`): -# Number of frames to sample per chunk.######(WHY IS IT DEFINED WHEN chunk_duration can fulfill its purpose?)###### -# """ -# video_duration = video.duration # EncodedVideoDecord obj -# if video_duration < chunk_duration: -# logger.warning_once( -# "Chunk duration is greater than audio duration. 
Chunks will be repeated, consider adjusting either `chunk_duration` or `num_chunks`" -# "to avoid unnecessary memory/compute usage." -# ) - -# all_clips_timepoints = uniform_chunk_sampling(video_duration, chunk_duration, num_chunks) - -# all_clips = [] -# for clip_timepoints in all_clips_timepoints: -# # Read the clip, get frames -# video_clip = video.get_clip(clip_timepoints[0], clip_timepoints[1]) -# if video_clip is None: -# raise ValueError("No clip found") -# video_clip = uniform_temporal_subsample(video_clip, num_samples=chunk_duration) -# video_clip = video_clip / 255.0 # since this is float, need 0-1 -# all_clips.append(video_clip) - -# return all_clips - -# # Copied from models.clip.image_processing_clip.CLIPImageProcessor.preprocess with preprocess->_preprocess_image -# def _preprocess_image( -# self, -# images: ImageInput, -# is_video: bool = None, -# do_resize: bool = None, -# size: Dict[str, int] = None, -# resample: PILImageResampling = None, -# do_center_crop: bool = None, -# crop_size: int = None, -# do_rescale: bool = None, -# rescale_factor: float = None, -# do_normalize: bool = None, -# image_mean: Optional[Union[float, List[float]]] = None, -# image_std: Optional[Union[float, List[float]]] = None, -# do_convert_rgb: bool = None, -# data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, -# input_data_format: Optional[Union[str, ChannelDimension]] = None, -# ) -> np.ndarray: -# validate_preprocess_arguments( -# do_rescale=do_rescale, -# rescale_factor=rescale_factor, -# do_normalize=do_normalize, -# image_mean=image_mean, -# image_std=image_std, -# do_center_crop=do_center_crop, -# crop_size=crop_size, -# do_resize=do_resize, -# size=size, -# resample=resample, -# ) - -# if not is_video: -# if do_convert_rgb: -# images = [convert_to_rgb(image) for image in images] - -# # All transformations expect numpy arrays. -# if not is_video: -# images = [to_numpy_array(image) for image in images] -# if not is_video: -# if is_scaled_image(images[0]) and do_rescale: -# logger.warning_once( -# "It looks like you are trying to rescale already rescaled images. If the input" -# " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." -# ) -# if not is_video: -# if input_data_format is None: -# # We assume that all images have the same channel dimension format. 
-# input_data_format = infer_channel_dimension_format(images[0]) - -# if not is_video: -# if do_resize: -# images = [ -# self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) -# for image in images -# ] - -# if do_center_crop: -# images = [ -# self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) for image in images -# ] - -# if do_rescale: -# images = [ -# self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) -# for image in images -# ] - -# if do_normalize: -# images = [ -# self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) -# for image in images -# ] - -# images = [ -# to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images -# ] -# else: -# if do_resize: -# images = self.short_side_scale(images) -# if do_normalize: -# images = NormalizeVideo( -# mean=image_mean, -# std=image_std, -# )(images), - -# return images - -# # Ignore copy -# def preprocess( -# self, -# images: Optional[ImageInput] = None, -# videos: Optional[VideoInput] = None, -# do_resize: bool = None, -# size: Dict[str, int] = None, -# resample: PILImageResampling = None, -# do_center_crop: bool = None, -# crop_size: int = None, -# do_rescale: bool = None, -# rescale_factor: float = None, -# do_normalize: bool = None, -# image_mean: Optional[Union[float, List[float]]] = None, -# image_std: Optional[Union[float, List[float]]] = None, -# do_convert_rgb: bool = None, -# do_chunk: bool = None, -# chunk_duration: int = None, -# num_chunks: int = None, -# num_frames_per_chunk: int = None, -# fps: int = None, -# return_tensors: Optional[Union[str, TensorType]] = None, -# data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, -# input_data_format: Optional[Union[str, ChannelDimension]] = None, -# **kwargs, -# ) -> PIL.Image.Image: -# """ -# Preprocess an image or batch of images. - -# Args: -# images (`ImageInput`, *optional*): -# Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If -# passing in images with pixel values between 0 and 1, set `do_rescale=False`. Either `images` or -# `videos` must be provided. -# videos (`VideoInput`, *optional*): -# Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If -# passing in videos with pixel values between 0 and 1, set `do_rescale=False`. Either `images` or -# `videos` must be provided. -# do_resize (`bool`, *optional*, defaults to `self.do_resize`): -# Whether to resize the image. -# size (`Dict[str, int]`, *optional*, defaults to `self.size`): -# Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with -# the longest edge resized to keep the input aspect ratio. -# resample (`int`, *optional*, defaults to `self.resample`): -# Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only -# has an effect if `do_resize` is set to `True`. -# do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): -# Whether to center crop the image. -# crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`): -# Size of the center crop. Only has an effect if `do_center_crop` is set to `True`. -# do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): -# Whether to rescale the image. 
-# rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): -# Rescale factor to rescale the image by if `do_rescale` is set to `True`. -# do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): -# Whether to normalize the image. -# image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): -# Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. -# image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): -# Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to -# `True`. -# do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): -# Whether to convert the image to RGB. -# do_chunk (`bool`, *optional*, defaults to `self.do_chunk`): -# Whether to chunk the video into multiple clips. -# chunk_duration (`int`, *optional*, defaults to `self.chunk_duration`): -# Duration of each chunk in seconds(clip duration). -# num_chunks (`int`, *optional*, defaults to `self.num_chunks`): -# Number of chunks to sample(number of clips per video). -# num_frames_per_chunk (`int`, *optional*, defaults to `self.num_frames_per_chunk`): -# Number of frames to sample per chunk. -# fps (`int`, *optional*, defaults to `self.fps`): -# Frame rate of the video. It's assumed that all videos have the same frame rate. -# return_tensors (`str` or `TensorType`, *optional*): -# The type of tensors to return. Can be one of: -# - Unset: Return a list of `np.ndarray`. -# - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. -# - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. -# - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. -# - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. -# data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): -# The channel dimension format for the output image. Can be one of: -# - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. -# - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. -# - Unset: Use the channel dimension format of the input image. -# input_data_format (`ChannelDimension` or `str`, *optional*): -# The channel dimension format for the input image. If unset, the channel dimension format is inferred -# from the input image. Can be one of: -# - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. -# - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. -# - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. 
-# """ -# if images is None and videos is None: -# raise ValueError("Either `images` or `videos` must be provided.") - -# if images is not None and videos is not None: -# raise ValueError("Only one of `images` or `videos` can be provided.") - -# do_resize = do_resize if do_resize is not None else self.do_resize -# size = size if size is not None else self.size -# size = get_size_dict(size, param_name="size", default_to_square=False) -# resample = resample if resample is not None else self.resample -# do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop -# crop_size = crop_size if crop_size is not None else self.crop_size -# crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True) -# do_rescale = do_rescale if do_rescale is not None else self.do_rescale -# rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor -# do_normalize = do_normalize if do_normalize is not None else self.do_normalize -# image_mean = image_mean if image_mean is not None else self.image_mean -# image_std = image_std if image_std is not None else self.image_std -# do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb -# do_chunk = do_chunk if do_chunk is not None else self.do_chunk -# chunk_duration = chunk_duration if chunk_duration is not None else self.chunk_duration -# num_chunks = num_chunks if num_chunks is not None else self.num_chunks -# num_frames_per_chunk = num_frames_per_chunk if num_frames_per_chunk is not None else self.num_frames_per_chunk -# fps = fps if fps is not None else self.fps - -# if images is not None: -# is_video = False -# images = make_list_of_images(images) -# if videos is not None: -# is_video = True -# videos = make_batched_videos(videos) - -# validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys) - -# if (videos is not None and not valid_images(videos)) or (images is not None and not valid_images(images)): -# raise ValueError( -# "Invalid input type. Must be of type PIL.Image.Image, numpy.ndarray, " -# "torch.Tensor, tf.Tensor or jax.ndarray." 
-# ) - -# if images is not None: -# pixel_values = self._preprocess_image( -# images=images, -# is_video = is_video, -# do_resize=do_resize, -# size=size, -# resample=resample, -# do_center_crop=do_center_crop, -# crop_size=crop_size, -# do_rescale=do_rescale, -# rescale_factor=rescale_factor, -# do_normalize=do_normalize, -# image_mean=image_mean, -# image_std=image_std, -# do_convert_rgb=do_convert_rgb, -# data_format=data_format, -# input_data_format=input_data_format, -# ) -# else: -# pixel_values = [] - -# for video in videos: -# # if check_for_video_paths(videos): -# # is_video = True -# # video = encoded_video_from_path( -# # video, -# # ) -# if do_chunk: -# clips = self.chunk( -# video=video, -# fps=fps, -# chunk_duration=chunk_duration, -# num_chunks=num_chunks, -# num_frames_per_chunk=num_frames_per_chunk, -# ) - -# _pixel_values = [ -# self._preprocess_image( -# images=clip, -# is_video = is_video, -# do_resize=do_resize, -# size=size, -# resample=PILImageResampling.BILINEAR, -# do_center_crop=do_center_crop, -# crop_size=crop_size, -# do_rescale=do_rescale, -# rescale_factor=rescale_factor, -# do_normalize=do_normalize, -# image_mean=image_mean, -# image_std=image_std, -# do_convert_rgb=do_convert_rgb, -# data_format=data_format, -# input_data_format=input_data_format, -# ) -# for clip in clips -# ] -# else: -# _pixel_values = [ -# self._preprocess_image( -# images=video, -# is_video = is_video, -# do_resize=do_resize, -# size=size, -# resample=resample, -# do_center_crop=do_center_crop, -# crop_size=crop_size, -# do_rescale=do_rescale, -# rescale_factor=rescale_factor, -# do_normalize=do_normalize, -# image_mean=image_mean, -# image_std=image_std, -# do_convert_rgb=do_convert_rgb, -# data_format=data_format, -# input_data_format=input_data_format, -# ) -# ] - -# _pixel_values = SpatialCrop(224, num_crops=3)(_pixel_values) -# # Avoid List[List[List[np.ndarray]]] -# _pixel_values = torch.stack(_pixel_values, dim = 0) -# pixel_values.append(_pixel_values) -# # _pixel_values = np.stack(_pixel_values) -# # # Make it shape (num_chunks, num_channels, num_frames_per_chunk, height, width) -# # _pixel_values = np.swapaxes(_pixel_values, 1, 2) -# # pixel_values.append(_pixel_values) -# pixel_values = torch.stack(pixel_values, dim=0) -# return BatchFeature(data={"pixel_values": pixel_values}, tensor_type=return_tensors) - - - - - - - - - - - - - - - # Copyright 2024 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -1041,8 +13,11 @@ # limitations under the License. 
"""Image processor class for ImageBind.""" -import math from fractions import Fraction +import math +import torch +import torch.nn as nn +from torchvision import transforms from typing import Dict, List, Optional, Tuple, Union import numpy as np @@ -1075,11 +50,9 @@ logger = logging.get_logger(__name__) - if is_vision_available(): import PIL - # Copy from models.video_llava.image_processing_video_llava.make_batched_videos def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): @@ -1099,29 +72,53 @@ def make_batched_videos(videos) -> List[VideoInput]: # Copy from models.imagebind.feature_extraction_imagebind.uniform_chunk_sampling def uniform_chunk_sampling( - total_duration: float, chunk_duration: float, num_chunks: int + total_duration: float, chunk_duration: int, num_chunks: int ) -> List[Tuple[Fraction, Fraction]]: """ Uniformly sample `num_chunks` chunks of duration `chunk_duration` from an audio/video of total duration `total_duration`. Args: total_duration (float): Total duration of the audio/video. - chunk_duration (float): Duration of each chunk. - num_chunks (int): Number of chunks to sample. + chunk_duration (int): Duration of each chunk(clip duration). + num_chunks (int): Number of chunks to sample(number of clips per video). Returns: List[Tuple[float, float]]: List of tuples where each tuple contains the start and end time of a chunk. """ + _current_clip_index = 0 + _current_aug_index = 0 + _augs_per_clip: int = 1 + chunk_duration_fraction = Fraction(chunk_duration) - max_possible_clip_start = Fraction(max(total_duration - chunk_duration, 0)) + max_possible_clip_start = Fraction(max(total_duration - chunk_duration_fraction, 0)) # Previously chunk_duration was used instead of chunk_duration_fraction so that could be the reason for pixel values not matching uniform_clip = Fraction(max_possible_clip_start / max(num_chunks - 1, 1)) result = [] - for clip_index in range(num_chunks): - clip_start_sec = uniform_clip * clip_index + is_last_clip = False + while not is_last_clip: + clip_start_sec = uniform_clip * _current_clip_index + _current_aug_index += 1 + if _current_aug_index >= _augs_per_clip: + _current_clip_index += 1 + _current_aug_index = 0 + + # Last clip is True if sampled self._clips_per_video or if end of video is reached. + is_last_clip = False + if ( + _current_clip_index >= num_chunks + or uniform_clip * _current_clip_index > max_possible_clip_start + ): + _current_clip_index = 0 + is_last_clip = True + + # reset + if is_last_clip: + _current_clip_index = 0 + _current_aug_index = 0 + clip_end_sec = clip_start_sec + chunk_duration_fraction result.append((clip_start_sec, clip_end_sec)) - + return result @@ -1138,13 +135,13 @@ def uniform_temporal_subsample(video: VideoInput, num_samples: int) -> VideoInpu num_samples (`int`): Number of frames to sample. """ - num_frames = len(video) - + num_frames = video.shape[-3]#len(video) gives first element of size tensor which is channels instead of frames + assert num_samples > 0 and num_frames > 0 # Sample by nearest neighbor interpolation if num_samples > t. 
indices = np.linspace(0, num_frames - 1, num_samples) indices = np.clip(indices, 0, num_frames - 1).astype(int) - return [video[i] for i in indices] + return video[:, indices, :, :]#second index has frames(slicing instead of looping) class ImageBindImageProcessor(BaseImageProcessor): @@ -1184,16 +181,18 @@ class ImageBindImageProcessor(BaseImageProcessor): Can be overridden by the `image_std` parameter in the `preprocess` method. do_convert_rgb (`bool`, *optional*, defaults to `True`): Whether to convert the image to RGB. - do_chunk (`bool`, *optional*, defaults to `True`): + do_chunk (`bool`, *optional*, defaults to `False`): Whether to chunk the video into multiple clips. - chunk_duration (`float`, *optional*, defaults to 2.0): - Duration of each chunk in seconds. + chunk_duration (`int`, *optional*, defaults to 2): + Duration of each chunk in seconds(clip duration). num_chunks (`int`, *optional*, defaults to 5): - Number of chunks to sample. + Number of chunks to sample(number of clips per video). num_frames_per_chunk (`int`, *optional*, defaults to 2): Number of frames to sample per chunk. - fps (`int`, *optional*, defaults to 30): + fps (`List[int]`, *optional*, defaults to [30]): Frame rate of the video. It's assumed that all videos have the same frame rate. + duration('List[float]', *optional*, defaults to [10.0]): + Durations of videos """ model_input_names = ["pixel_values"] @@ -1202,7 +201,7 @@ def __init__( self, do_resize: bool = True, size: Dict[str, int] = None, - resample: PILImageResampling = PILImageResampling.BICUBIC, + resample: PILImageResampling = PILImageResampling.BILINEAR, do_center_crop: bool = True, crop_size: Dict[str, int] = None, do_rescale: bool = True, @@ -1211,11 +210,12 @@ def __init__( image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, do_convert_rgb: bool = True, - do_chunk: bool = True, - chunk_duration: float = 2.0, + do_chunk: bool = False, + chunk_duration: int = 2, num_chunks: int = 5, num_frames_per_chunk: int = 2, - fps: int = 30, + fps: List[int] = [30], + duration: List[float] = [10.0], **kwargs, ) -> None: super().__init__(**kwargs) @@ -1240,6 +240,7 @@ def __init__( self.num_chunks = num_chunks self.num_frames_per_chunk = num_frames_per_chunk self.fps = fps + self.duration = duration self._valid_processor_keys = [ "images", "do_resize", @@ -1257,6 +258,7 @@ def __init__( "chunk_duration", "num_chunks", "fps", + "duration", "return_tensors", "data_format", "input_data_format", @@ -1275,7 +277,7 @@ def resize( self, image: np.ndarray, size: Dict[str, int], - resample: PILImageResampling = PILImageResampling.BICUBIC, + resample: PILImageResampling = PILImageResampling.BILINEAR, data_format: Optional[Union[str, ChannelDimension]] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, **kwargs, @@ -1289,7 +291,7 @@ def resize( Image to resize. size (`Dict[str, int]`): Size of the output image. - resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): Resampling filter to use when resiizing the image. data_format (`str` or `ChannelDimension`, *optional*): The channel dimension format of the image. If not provided, it will be the same as the input image. 
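`uniform_temporal_subsample` now indexes the frame axis of a (channels, frames, height, width) array in a single slice instead of looping over frames. A small NumPy sketch under that layout assumption:

import numpy as np

def subsample_frames(video_cthw: np.ndarray, num_samples: int) -> np.ndarray:
    num_frames = video_cthw.shape[-3]  # frame axis of a (C, T, H, W) clip
    # Evenly spaced (nearest-neighbour) frame indices over the clip.
    indices = np.linspace(0, num_frames - 1, num_samples)
    indices = np.clip(indices, 0, num_frames - 1).astype(int)
    return video_cthw[:, indices, :, :]

clip = np.zeros((3, 60, 224, 224), dtype=np.float32)  # 60 decoded frames
print(subsample_frames(clip, 2).shape)  # (3, 2, 224, 224): first and last frame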
@@ -1320,8 +322,134 @@ def resize( **kwargs, ) + #Adapted from https://github.com/facebookresearch/pytorchvideo/blob/1fadaef40dd393ca09680f55582399f4679fc9b7/pytorchvideo/transforms/functional.py#L92 + def short_side_scale( + self, + image: np.ndarray, + size: int = 224, + resample: str = "bilinear", + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """ + Determines the shorter spatial dim of the video (i.e. width or height) and scales + it to the given size. To maintain aspect ratio, the longer side is then scaled + accordingly. + Args: + image (np.ndarray): A video tensor of shape (C, T, H, W) and type numpy.float32. + size (int): The size the shorter side is scaled to. + resample (str): Algorithm used for upsampling, + options: nearest' | 'linear' | 'bilinear' | 'bicubic' | 'trilinear' | 'area' + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + Returns: + An image-like numpy array with scaled spatial dims. + """ # noqa + assert len(image.shape) == 4 + assert image.dtype == np.float32 + _, _, h, w = image.shape + if w < h: + new_h = int(math.floor((float(h) / w) * size)) + new_w = size + else: + new_h = size + new_w = int(math.floor((float(w) / h) * size)) + + data_format = input_data_format if data_format is None else data_format + resized_image = torch.nn.functional.interpolate( + torch.tensor(image).contiguous(), size=(new_h, new_w), mode=resample, align_corners=False + ).numpy() + #input image in always in FIRST channel dim + resized_image = np.array([to_channel_dimension_format( + img, data_format, input_channel_dim=ChannelDimension.FIRST + ) for img in resized_image]) + return resized_image + + def uniform_crop( + self, + images: np.ndarray, + crop_size: int = 224, + num_crops: int = 3, + scale_size=None, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> List[np.ndarray]: + """ + Perform uniform spatial sampling on the images and corresponding boxes. + Args: + images (np.ndarray): images to perform uniform crop. The dimension is + `num frames` x `channel` x `height` x `width`. + crop_size (int): size of height/weight to crop the images. + spatial_idx (int): 0, 1, or 2 for left, center, and right crop if width + is larger than height. Or 0, 1, or 2 for top, center, and bottom + crop if height is larger than width. + scale_size (int): optional. If not None, resize the images to scale_size before + performing any crop. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + Returns: + cropped (List[np.ndarray]): images with dimension of + `num frames` x `channel` x `size` x `size`. 
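The `short_side_scale` added above resizes whichever spatial side is shorter to `size` and scales the other side to preserve aspect ratio. A sketch of just that arithmetic and the interpolation call, assuming a float32 4D input whose last two dimensions are height and width:

import math

import numpy as np
import torch

def scale_short_side(frames: np.ndarray, size: int = 224) -> np.ndarray:
    _, _, h, w = frames.shape
    if w < h:
        new_h, new_w = int(math.floor(h / w * size)), size
    else:
        new_h, new_w = size, int(math.floor(w / h * size))
    resized = torch.nn.functional.interpolate(
        torch.from_numpy(frames), size=(new_h, new_w), mode="bilinear", align_corners=False
    )
    return resized.numpy()

frames = np.zeros((2, 3, 360, 640), dtype=np.float32)
print(scale_short_side(frames).shape)  # (2, 3, 224, 398): short side scaled to 224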
+ """ + data_format = input_data_format if data_format is None else data_format + + crop_size = crop_size["height"] + uniform_cropped = [] + if num_crops == 3: + crops_to_ext = [0, 1, 2] + elif num_crops == 1: + crops_to_ext = [1] + for spatial_idx in crops_to_ext: + assert spatial_idx in [0, 1, 2] + ndim = len(images.shape) + if ndim == 3: + images = images.unsqueeze(0) + height = images.shape[2] + width = images.shape[3] + + if scale_size is not None: + if width <= height: + width, height = scale_size, int(height / width * scale_size) + else: + width, height = int(width / height * scale_size), scale_size + images = torch.nn.functional.interpolate( + images, + size=(height, width), + mode="bilinear", + align_corners=False, + ) + + y_offset = int(math.ceil((height - crop_size) / 2)) + x_offset = int(math.ceil((width - crop_size) / 2)) + + if height > width: + if spatial_idx == 0: + y_offset = 0 + elif spatial_idx == 2: + y_offset = height - crop_size + else: + if spatial_idx == 0: + x_offset = 0 + elif spatial_idx == 2: + x_offset = width - crop_size + cropped = images[:, :, y_offset : y_offset + crop_size, x_offset : x_offset + crop_size] + if ndim == 3: + cropped = cropped.squeeze(0) + #input image in always in FIRST channel dim + cropped = np.array([to_channel_dimension_format( + img, data_format, input_channel_dim=ChannelDimension.FIRST + ) for img in cropped]) + + uniform_cropped.append(cropped) + + return uniform_cropped + def chunk( - self, video: VideoInput, fps: int, chunk_duration: float, num_chunks: int, num_frames_per_chunk: int + self, video: VideoInput, fps: int, duration: float, chunk_duration: int, num_chunks: int, num_frames_per_chunk: int ) -> List[VideoInput]: """ Uniformly sample `num_chunks` chunks of duration `chunk_duration` from a video. @@ -1331,14 +459,17 @@ def chunk( Video to chunk. fps (`int`): Frame rate of the video - chunk_duration (`float`): - Duration of each chunk. + duration('float', *optional*, defaults to 10.0): + Durations of videos + chunk_duration (`int`): + Duration of each chunk(clip duration). num_chunks (`int`): - Number of chunks to sample. + Number of chunks to sample(number of clips per video). num_frames_per_chunk (`int`): - Number of frames to sample per chunk. + Number of frames to sample per chunk.######(WHY IS IT DEFINED WHEN chunk_duration can fulfill its purpose?)###### """ - video_duration = len(video) / fps + fps = float(fps) + video_duration = duration if video_duration < chunk_duration: logger.warning_once( "Chunk duration is greater than audio duration. 
Chunks will be repeated, consider adjusting either `chunk_duration` or `num_chunks`" @@ -1349,8 +480,18 @@ def chunk( all_clips = [] for clip_timepoints in all_clips_timepoints: - video_clip = video[math.ceil(clip_timepoints[0] * fps) : math.ceil(clip_timepoints[1] * fps)] - video_clip = uniform_temporal_subsample(video_clip, num_samples=num_frames_per_chunk) + #shape of video tensor is (Channel X Frames X Height X Width) so frames dim is accessed at 1 index + + start_idx = math.ceil(fps * clip_timepoints[0]) + end_idx = math.ceil(fps * clip_timepoints[1]) + end_idx = min(end_idx, int(duration*fps)) + frame_idxs = list(range(start_idx, end_idx)) + frame_idxs = torch.tensor(frame_idxs).contiguous() + video_clip = video[:, frame_idxs, :, :] + if video_clip is None: + raise ValueError("No clip found") + video_clip = uniform_temporal_subsample(video_clip.numpy(), num_samples=chunk_duration) + video_clip = video_clip / 255.0 # since this is float, need 0-1 all_clips.append(video_clip) return all_clips @@ -1359,6 +500,7 @@ def chunk( def _preprocess_image( self, images: ImageInput, + is_video: bool = None, do_resize: bool = None, size: Dict[str, int] = None, resample: PILImageResampling = None, @@ -1391,40 +533,32 @@ def _preprocess_image( # All transformations expect numpy arrays. images = [to_numpy_array(image) for image in images] - if is_scaled_image(images[0]) and do_rescale: logger.warning_once( "It looks like you are trying to rescale already rescaled images. If the input" " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." ) - if input_data_format is None: # We assume that all images have the same channel dimension format. input_data_format = infer_channel_dimension_format(images[0]) - if do_resize: - images = [ - self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) - for image in images - ] - - if do_center_crop: - images = [ - self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) for image in images - ] + images = self.short_side_scale(image = np.array(images), input_data_format=input_data_format) if do_rescale: images = [ self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) for image in images ] - + images = torch.tensor(images).permute(1,0,2,3).numpy()#to interchange channel and frame dim for normalize func as mean and std have shape 3 if do_normalize: images = [ self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) for image in images ] + if do_center_crop: + images = self.uniform_crop(np.array(images), crop_size, num_crops =3,input_data_format=input_data_format) + images = [ to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images ] @@ -1448,10 +582,11 @@ def preprocess( image_std: Optional[Union[float, List[float]]] = None, do_convert_rgb: bool = None, do_chunk: bool = None, - chunk_duration: float = None, + chunk_duration: int = None, num_chunks: int = None, num_frames_per_chunk: int = None, - fps: int = None, + fps: List[int] = None, + duration: List[float] = None, return_tensors: Optional[Union[str, TensorType]] = None, data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, @@ -1496,14 +631,16 @@ def preprocess( Whether to convert the image to RGB. do_chunk (`bool`, *optional*, defaults to `self.do_chunk`): Whether to chunk the video into multiple clips. 
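The `uniform_crop` helper above takes three spatial crops per clip: along the longer side it uses the two ends plus the centre, along the shorter side it always centres. A sketch of the offset arithmetic for hypothetical frame sizes:

import math

def three_crop_offsets(height: int, width: int, crop_size: int):
    # Centre offsets, as in uniform_crop above.
    y_center = int(math.ceil((height - crop_size) / 2))
    x_center = int(math.ceil((width - crop_size) / 2))
    offsets = []
    for spatial_idx in (0, 1, 2):  # left/top, centre, right/bottom crop
        y, x = y_center, x_center
        if height > width:
            if spatial_idx == 0:
                y = 0
            elif spatial_idx == 2:
                y = height - crop_size
        else:
            if spatial_idx == 0:
                x = 0
            elif spatial_idx == 2:
                x = width - crop_size
        offsets.append((y, x))
    return offsets

# A 224x398 frame (short side already scaled to 224) with 224x224 crops:
print(three_crop_offsets(224, 398, 224))  # [(0, 0), (0, 87), (0, 174)]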
- chunk_duration (`float`, *optional*, defaults to `self.chunk_duration`): - Duration of each chunk in seconds. + chunk_duration (`int`, *optional*, defaults to `self.chunk_duration`): + Duration of each chunk in seconds(clip duration). num_chunks (`int`, *optional*, defaults to `self.num_chunks`): - Number of chunks to sample. + Number of chunks to sample(number of clips per video). num_frames_per_chunk (`int`, *optional*, defaults to `self.num_frames_per_chunk`): Number of frames to sample per chunk. - fps (`int`, *optional*, defaults to `self.fps`): + fps (`List[int]`, *optional*, defaults to `self.fps`): Frame rate of the video. It's assumed that all videos have the same frame rate. + duration('List[float]', *optional*, defaults to [10.0]): + Durations of videos return_tensors (`str` or `TensorType`, *optional*): The type of tensors to return. Can be one of: - Unset: Return a list of `np.ndarray`. @@ -1547,10 +684,13 @@ def preprocess( num_chunks = num_chunks if num_chunks is not None else self.num_chunks num_frames_per_chunk = num_frames_per_chunk if num_frames_per_chunk is not None else self.num_frames_per_chunk fps = fps if fps is not None else self.fps + duration = duration if duration is not None else self.duration if images is not None: + is_video = False images = make_list_of_images(images) if videos is not None: + is_video = True videos = make_batched_videos(videos) validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys) @@ -1564,6 +704,7 @@ def preprocess( if images is not None: pixel_values = self._preprocess_image( images=images, + is_video = is_video, do_resize=do_resize, size=size, resample=resample, @@ -1580,11 +721,13 @@ def preprocess( ) else: pixel_values = [] - for video in videos: + + for idx,video in enumerate(videos): if do_chunk: clips = self.chunk( - video=video, - fps=fps, + video=video[0], + fps=fps[idx], + duration= duration[idx], chunk_duration=chunk_duration, num_chunks=num_chunks, num_frames_per_chunk=num_frames_per_chunk, @@ -1593,6 +736,7 @@ def preprocess( _pixel_values = [ self._preprocess_image( images=clip, + is_video = is_video, do_resize=do_resize, size=size, resample=PILImageResampling.BILINEAR, @@ -1613,6 +757,7 @@ def preprocess( _pixel_values = [ self._preprocess_image( images=video, + is_video = is_video, do_resize=do_resize, size=size, resample=resample, @@ -1628,12 +773,16 @@ def preprocess( input_data_format=input_data_format, ) ] - - # Avoid List[List[List[np.ndarray]]] - _pixel_values = np.stack(_pixel_values) - # Make it shape (num_chunks, num_channels, num_frames_per_chunk, height, width) - _pixel_values = np.swapaxes(_pixel_values, 1, 2) + _pixel_values = np.stack(np.array(_pixel_values)) + #Exchange frames and channels dim + _pixel_values = np.swapaxes(_pixel_values, 2, 3) pixel_values.append(_pixel_values) - + pixel_values = np.stack(pixel_values) + # Combine the second and third dimensions for merging num_crops in one dim + pixel_values_shape = pixel_values.shape + pixel_values_shape = (pixel_values_shape[0], pixel_values_shape[1] * pixel_values_shape[2], *pixel_values_shape[3:]) + pixel_values = pixel_values.reshape(pixel_values_shape) return BatchFeature(data={"pixel_values": pixel_values}, tensor_type=return_tensors) + + From 9314a5701cdec35d3fff0b057f8db4fdafaba26b Mon Sep 17 00:00:00 2001 From: RUFFY-369 Date: Wed, 7 Aug 2024 23:05:29 +0530 Subject: [PATCH 09/11] style:make fixup --- .../imagebind/image_processing_imagebind.py | 105 ++++++++++-------- 1 file changed, 59 insertions(+), 46 
deletions(-) diff --git a/src/transformers/models/imagebind/image_processing_imagebind.py b/src/transformers/models/imagebind/image_processing_imagebind.py index 005b20f4e943..a42738f21b9c 100644 --- a/src/transformers/models/imagebind/image_processing_imagebind.py +++ b/src/transformers/models/imagebind/image_processing_imagebind.py @@ -13,14 +13,12 @@ # limitations under the License. """Image processor class for ImageBind.""" -from fractions import Fraction import math -import torch -import torch.nn as nn -from torchvision import transforms +from fractions import Fraction from typing import Dict, List, Optional, Tuple, Union import numpy as np +import torch from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict from ...image_transforms import ( @@ -53,6 +51,7 @@ if is_vision_available(): import PIL + # Copy from models.video_llava.image_processing_video_llava.make_batched_videos def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): @@ -88,9 +87,11 @@ def uniform_chunk_sampling( _current_clip_index = 0 _current_aug_index = 0 _augs_per_clip: int = 1 - + chunk_duration_fraction = Fraction(chunk_duration) - max_possible_clip_start = Fraction(max(total_duration - chunk_duration_fraction, 0)) # Previously chunk_duration was used instead of chunk_duration_fraction so that could be the reason for pixel values not matching + max_possible_clip_start = Fraction( + max(total_duration - chunk_duration_fraction, 0) + ) # Previously chunk_duration was used instead of chunk_duration_fraction so that could be the reason for pixel values not matching uniform_clip = Fraction(max_possible_clip_start / max(num_chunks - 1, 1)) result = [] @@ -101,13 +102,10 @@ def uniform_chunk_sampling( if _current_aug_index >= _augs_per_clip: _current_clip_index += 1 _current_aug_index = 0 - + # Last clip is True if sampled self._clips_per_video or if end of video is reached. is_last_clip = False - if ( - _current_clip_index >= num_chunks - or uniform_clip * _current_clip_index > max_possible_clip_start - ): + if _current_clip_index >= num_chunks or uniform_clip * _current_clip_index > max_possible_clip_start: _current_clip_index = 0 is_last_clip = True @@ -118,7 +116,7 @@ def uniform_chunk_sampling( clip_end_sec = clip_start_sec + chunk_duration_fraction result.append((clip_start_sec, clip_end_sec)) - + return result @@ -135,13 +133,13 @@ def uniform_temporal_subsample(video: VideoInput, num_samples: int) -> VideoInpu num_samples (`int`): Number of frames to sample. """ - num_frames = video.shape[-3]#len(video) gives first element of size tensor which is channels instead of frames + num_frames = video.shape[-3] # len(video) gives first element of size tensor which is channels instead of frames assert num_samples > 0 and num_frames > 0 # Sample by nearest neighbor interpolation if num_samples > t. indices = np.linspace(0, num_frames - 1, num_samples) indices = np.clip(indices, 0, num_frames - 1).astype(int) - return video[:, indices, :, :]#second index has frames(slicing instead of looping) + return video[:, indices, :, :] # second index has frames(slicing instead of looping) class ImageBindImageProcessor(BaseImageProcessor): @@ -192,7 +190,7 @@ class ImageBindImageProcessor(BaseImageProcessor): fps (`List[int]`, *optional*, defaults to [30]): Frame rate of the video. It's assumed that all videos have the same frame rate. 
duration('List[float]', *optional*, defaults to [10.0]): - Durations of videos + Durations of videos """ model_input_names = ["pixel_values"] @@ -322,7 +320,7 @@ def resize( **kwargs, ) - #Adapted from https://github.com/facebookresearch/pytorchvideo/blob/1fadaef40dd393ca09680f55582399f4679fc9b7/pytorchvideo/transforms/functional.py#L92 + # Adapted from https://github.com/facebookresearch/pytorchvideo/blob/1fadaef40dd393ca09680f55582399f4679fc9b7/pytorchvideo/transforms/functional.py#L92 def short_side_scale( self, image: np.ndarray, @@ -356,15 +354,18 @@ def short_side_scale( else: new_h = size new_w = int(math.floor((float(w) / h) * size)) - + data_format = input_data_format if data_format is None else data_format - resized_image = torch.nn.functional.interpolate( + resized_image = torch.nn.functional.interpolate( torch.tensor(image).contiguous(), size=(new_h, new_w), mode=resample, align_corners=False ).numpy() - #input image in always in FIRST channel dim - resized_image = np.array([to_channel_dimension_format( - img, data_format, input_channel_dim=ChannelDimension.FIRST - ) for img in resized_image]) + # input image in always in FIRST channel dim + resized_image = np.array( + [ + to_channel_dimension_format(img, data_format, input_channel_dim=ChannelDimension.FIRST) + for img in resized_image + ] + ) return resized_image def uniform_crop( @@ -439,17 +440,26 @@ def uniform_crop( cropped = images[:, :, y_offset : y_offset + crop_size, x_offset : x_offset + crop_size] if ndim == 3: cropped = cropped.squeeze(0) - #input image in always in FIRST channel dim - cropped = np.array([to_channel_dimension_format( - img, data_format, input_channel_dim=ChannelDimension.FIRST - ) for img in cropped]) + # input image in always in FIRST channel dim + cropped = np.array( + [ + to_channel_dimension_format(img, data_format, input_channel_dim=ChannelDimension.FIRST) + for img in cropped + ] + ) uniform_cropped.append(cropped) return uniform_cropped def chunk( - self, video: VideoInput, fps: int, duration: float, chunk_duration: int, num_chunks: int, num_frames_per_chunk: int + self, + video: VideoInput, + fps: int, + duration: float, + chunk_duration: int, + num_chunks: int, + num_frames_per_chunk: int, ) -> List[VideoInput]: """ Uniformly sample `num_chunks` chunks of duration `chunk_duration` from a video. @@ -460,7 +470,7 @@ def chunk( fps (`int`): Frame rate of the video duration('float', *optional*, defaults to 10.0): - Durations of videos + Durations of videos chunk_duration (`int`): Duration of each chunk(clip duration). num_chunks (`int`): @@ -480,11 +490,11 @@ def chunk( all_clips = [] for clip_timepoints in all_clips_timepoints: - #shape of video tensor is (Channel X Frames X Height X Width) so frames dim is accessed at 1 index - + # shape of video tensor is (Channel X Frames X Height X Width) so frames dim is accessed at 1 index + start_idx = math.ceil(fps * clip_timepoints[0]) end_idx = math.ceil(fps * clip_timepoints[1]) - end_idx = min(end_idx, int(duration*fps)) + end_idx = min(end_idx, int(duration * fps)) frame_idxs = list(range(start_idx, end_idx)) frame_idxs = torch.tensor(frame_idxs).contiguous() video_clip = video[:, frame_idxs, :, :] @@ -542,14 +552,16 @@ def _preprocess_image( # We assume that all images have the same channel dimension format. 
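For every (start, end) timepoint pair, `chunk` above converts seconds to frame indices using the per-video `fps` and `duration` (supplied as lists and indexed per video in `preprocess`), slices the frame axis, keeps `num_frames_per_chunk` frames and rescales to [0, 1]. A sketch of that index arithmetic on a hypothetical 10 s, 30 fps clip tensor, not the processor method itself:

import math

import numpy as np
import torch

def extract_clip(
    video_cthw: torch.Tensor, fps: float, duration: float,
    start_sec: float, end_sec: float, num_frames_per_chunk: int,
) -> np.ndarray:
    start_idx = math.ceil(fps * start_sec)
    end_idx = min(math.ceil(fps * end_sec), int(duration * fps))
    clip = video_cthw[:, torch.arange(start_idx, end_idx), :, :]  # slice the frame axis
    # Keep only num_frames_per_chunk evenly spaced frames, then map [0, 255] -> [0, 1].
    keep = np.linspace(0, clip.shape[1] - 1, num_frames_per_chunk).astype(int)
    return clip.numpy()[:, keep, :, :] / 255.0

video = torch.randint(0, 256, (3, 300, 224, 224)).to(torch.float32)  # 10 s at 30 fps
clip = extract_clip(video, fps=30, duration=10.0, start_sec=2.0, end_sec=4.0, num_frames_per_chunk=2)
print(clip.shape)  # (3, 2, 224, 224)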
input_data_format = infer_channel_dimension_format(images[0]) if do_resize: - images = self.short_side_scale(image = np.array(images), input_data_format=input_data_format) + images = self.short_side_scale(image=np.array(images), input_data_format=input_data_format) if do_rescale: images = [ self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) for image in images ] - images = torch.tensor(images).permute(1,0,2,3).numpy()#to interchange channel and frame dim for normalize func as mean and std have shape 3 + images = ( + torch.tensor(images).permute(1, 0, 2, 3).numpy() + ) # to interchange channel and frame dim for normalize func as mean and std have shape 3 if do_normalize: images = [ self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) @@ -557,7 +569,7 @@ def _preprocess_image( ] if do_center_crop: - images = self.uniform_crop(np.array(images), crop_size, num_crops =3,input_data_format=input_data_format) + images = self.uniform_crop(np.array(images), crop_size, num_crops=3, input_data_format=input_data_format) images = [ to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images @@ -640,7 +652,7 @@ def preprocess( fps (`List[int]`, *optional*, defaults to `self.fps`): Frame rate of the video. It's assumed that all videos have the same frame rate. duration('List[float]', *optional*, defaults to [10.0]): - Durations of videos + Durations of videos return_tensors (`str` or `TensorType`, *optional*): The type of tensors to return. Can be one of: - Unset: Return a list of `np.ndarray`. @@ -704,7 +716,7 @@ def preprocess( if images is not None: pixel_values = self._preprocess_image( images=images, - is_video = is_video, + is_video=is_video, do_resize=do_resize, size=size, resample=resample, @@ -721,13 +733,13 @@ def preprocess( ) else: pixel_values = [] - - for idx,video in enumerate(videos): + + for idx, video in enumerate(videos): if do_chunk: clips = self.chunk( video=video[0], fps=fps[idx], - duration= duration[idx], + duration=duration[idx], chunk_duration=chunk_duration, num_chunks=num_chunks, num_frames_per_chunk=num_frames_per_chunk, @@ -736,7 +748,7 @@ def preprocess( _pixel_values = [ self._preprocess_image( images=clip, - is_video = is_video, + is_video=is_video, do_resize=do_resize, size=size, resample=PILImageResampling.BILINEAR, @@ -757,7 +769,7 @@ def preprocess( _pixel_values = [ self._preprocess_image( images=video, - is_video = is_video, + is_video=is_video, do_resize=do_resize, size=size, resample=resample, @@ -774,15 +786,16 @@ def preprocess( ) ] _pixel_values = np.stack(np.array(_pixel_values)) - #Exchange frames and channels dim + # Exchange frames and channels dim _pixel_values = np.swapaxes(_pixel_values, 2, 3) pixel_values.append(_pixel_values) pixel_values = np.stack(pixel_values) # Combine the second and third dimensions for merging num_crops in one dim pixel_values_shape = pixel_values.shape - pixel_values_shape = (pixel_values_shape[0], pixel_values_shape[1] * pixel_values_shape[2], *pixel_values_shape[3:]) + pixel_values_shape = ( + pixel_values_shape[0], + pixel_values_shape[1] * pixel_values_shape[2], + *pixel_values_shape[3:], + ) pixel_values = pixel_values.reshape(pixel_values_shape) return BatchFeature(data={"pixel_values": pixel_values}, tensor_type=return_tensors) - - - From 79c40897e941fe6dafc90418decca9f52d359f32 Mon Sep 17 00:00:00 2001 From: RUFFY-369 Date: Wed, 7 Aug 2024 23:09:06 +0530 Subject: [PATCH 10/11] fix:make fix copies --- 
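At the end of `preprocess` above, the per-clip crop lists are stacked, the frame and channel axes are exchanged, the per-video results are stacked again, and the chunk and crop dimensions are merged. A shape-only NumPy sketch of that bookkeeping, with hypothetical per-crop arrays of shape (frames, channels, height, width):

import numpy as np

num_videos, num_chunks, num_crops = 1, 5, 3
# Hypothetical per-crop output of _preprocess_image: (frames, channels, height, width).
crops_per_clip = [
    [np.zeros((2, 3, 224, 224), dtype=np.float32) for _ in range(num_crops)]
    for _ in range(num_chunks)
]

pixel_values = []
for clips in [crops_per_clip] * num_videos:
    stacked = np.stack(np.array(clips))   # (chunks, crops, frames, channels, H, W)
    stacked = np.swapaxes(stacked, 2, 3)  # exchange the frame and channel dims
    pixel_values.append(stacked)
pixel_values = np.stack(pixel_values)     # (videos, chunks, crops, channels, frames, H, W)
shape = pixel_values.shape
pixel_values = pixel_values.reshape(shape[0], shape[1] * shape[2], *shape[3:])
print(pixel_values.shape)                 # (1, 15, 3, 2, 224, 224)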
.../models/imagebind/image_processing_imagebind.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/imagebind/image_processing_imagebind.py b/src/transformers/models/imagebind/image_processing_imagebind.py index a42738f21b9c..da480d044272 100644 --- a/src/transformers/models/imagebind/image_processing_imagebind.py +++ b/src/transformers/models/imagebind/image_processing_imagebind.py @@ -154,7 +154,7 @@ class ImageBindImageProcessor(BaseImageProcessor): Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess` method. - resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method. do_center_crop (`bool`, *optional*, defaults to `True`): Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the @@ -187,10 +187,10 @@ class ImageBindImageProcessor(BaseImageProcessor): Number of chunks to sample(number of clips per video). num_frames_per_chunk (`int`, *optional*, defaults to 2): Number of frames to sample per chunk. - fps (`List[int]`, *optional*, defaults to [30]): + fps (`List[int]`, *optional*, defaults to `[30]`): Frame rate of the video. It's assumed that all videos have the same frame rate. - duration('List[float]', *optional*, defaults to [10.0]): Durations of videos + duration (`List`, *optional*, defaults to `[10.0]`): """ model_input_names = ["pixel_values"] From f64778d3a9aed9387065321f22fd5022eeabc100 Mon Sep 17 00:00:00 2001 From: RUFFY-369 Date: Thu, 8 Aug 2024 00:18:33 +0530 Subject: [PATCH 11/11] chore:resolve necessary conflicts --- .../imagebind/image_processing_imagebind.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/imagebind/image_processing_imagebind.py b/src/transformers/models/imagebind/image_processing_imagebind.py index da480d044272..b787f0572697 100644 --- a/src/transformers/models/imagebind/image_processing_imagebind.py +++ b/src/transformers/models/imagebind/image_processing_imagebind.py @@ -71,14 +71,14 @@ def make_batched_videos(videos) -> List[VideoInput]: # Copy from models.imagebind.feature_extraction_imagebind.uniform_chunk_sampling def uniform_chunk_sampling( - total_duration: float, chunk_duration: int, num_chunks: int + total_duration: float, chunk_duration: float, num_chunks: int ) -> List[Tuple[Fraction, Fraction]]: """ Uniformly sample `num_chunks` chunks of duration `chunk_duration` from an audio/video of total duration `total_duration`. Args: total_duration (float): Total duration of the audio/video. - chunk_duration (int): Duration of each chunk(clip duration). + chunk_duration (float): Duration of each chunk(clip duration). num_chunks (int): Number of chunks to sample(number of clips per video). Returns: @@ -181,7 +181,7 @@ class ImageBindImageProcessor(BaseImageProcessor): Whether to convert the image to RGB. do_chunk (`bool`, *optional*, defaults to `False`): Whether to chunk the video into multiple clips. - chunk_duration (`int`, *optional*, defaults to 2): + chunk_duration (`float`, *optional*, defaults to 2.0): Duration of each chunk in seconds(clip duration). 
num_chunks (`int`, *optional*, defaults to 5): Number of chunks to sample(number of clips per video). @@ -209,7 +209,7 @@ def __init__( image_std: Optional[Union[float, List[float]]] = None, do_convert_rgb: bool = True, do_chunk: bool = False, - chunk_duration: int = 2, + chunk_duration: float = 2.0, num_chunks: int = 5, num_frames_per_chunk: int = 2, fps: List[int] = [30], @@ -457,7 +457,7 @@ def chunk( video: VideoInput, fps: int, duration: float, - chunk_duration: int, + chunk_duration: float, num_chunks: int, num_frames_per_chunk: int, ) -> List[VideoInput]: @@ -471,12 +471,12 @@ def chunk( Frame rate of the video duration('float', *optional*, defaults to 10.0): Durations of videos - chunk_duration (`int`): + chunk_duration (`float`): Duration of each chunk(clip duration). num_chunks (`int`): Number of chunks to sample(number of clips per video). num_frames_per_chunk (`int`): - Number of frames to sample per chunk.######(WHY IS IT DEFINED WHEN chunk_duration can fulfill its purpose?)###### + Number of frames to sample per chunk. """ fps = float(fps) video_duration = duration @@ -500,7 +500,7 @@ def chunk( video_clip = video[:, frame_idxs, :, :] if video_clip is None: raise ValueError("No clip found") - video_clip = uniform_temporal_subsample(video_clip.numpy(), num_samples=chunk_duration) + video_clip = uniform_temporal_subsample(video_clip.numpy(), num_samples=num_frames_per_chunk) video_clip = video_clip / 255.0 # since this is float, need 0-1 all_clips.append(video_clip) @@ -594,7 +594,7 @@ def preprocess( image_std: Optional[Union[float, List[float]]] = None, do_convert_rgb: bool = None, do_chunk: bool = None, - chunk_duration: int = None, + chunk_duration: float = None, num_chunks: int = None, num_frames_per_chunk: int = None, fps: List[int] = None,
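The final patch makes `chunk_duration` a float window length in seconds and feeds `num_frames_per_chunk`, rather than the duration, into the temporal subsample. A small numeric illustration of the distinction, assuming the defaults shown above (30 fps, 2.0 s chunks, 2 frames per chunk):

import math

import numpy as np

fps, chunk_duration, num_frames_per_chunk = 30, 2.0, 2

# A 2.0 s window at 30 fps covers 60 decoded frames...
frames_in_window = math.ceil(fps * chunk_duration)  # 60
# ...but only num_frames_per_chunk of them are kept per clip.
kept = np.linspace(0, frames_in_window - 1, num_frames_per_chunk).astype(int)
print(frames_in_window, kept.tolist())  # 60 [0, 59]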