From 10119253314b6e91b90faefad174f533ea25bb2c Mon Sep 17 00:00:00 2001 From: mkorn1 Date: Mon, 24 Nov 2025 18:33:13 -0600 Subject: [PATCH 1/2] video to frames to text output. --- src/transformers/pipelines/__init__.py | 10 + src/transformers/pipelines/video_to_text.py | 238 ++++++++++++++++++ .../pipelines/test_pipelines_video_to_text.py | 127 ++++++++++ 3 files changed, 375 insertions(+) create mode 100644 src/transformers/pipelines/video_to_text.py create mode 100644 tests/pipelines/test_pipelines_video_to_text.py diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py index 5b8e3f6b221c..b39ce1cd2928 100755 --- a/src/transformers/pipelines/__init__.py +++ b/src/transformers/pipelines/__init__.py @@ -85,6 +85,7 @@ TokenClassificationPipeline, ) from .video_classification import VideoClassificationPipeline +from .video_to_text import VideoToTextPipeline from .visual_question_answering import VisualQuestionAnsweringPipeline from .zero_shot_audio_classification import ZeroShotAudioClassificationPipeline from .zero_shot_classification import ZeroShotClassificationArgumentHandler, ZeroShotClassificationPipeline @@ -310,6 +311,12 @@ "default": {"model": ("MCG-NJU/videomae-base-finetuned-kinetics", "488eb9a")}, "type": "video", }, + "video-to-text": { + "impl": VideoToTextPipeline, + "pt": (AutoModelForImageTextToText,) if is_torch_available() else (), + "default": {"model": ("ydshieh/vit-gpt2-coco-en", "5bebf1e")}, + "type": "video", + }, "mask-generation": { "impl": MaskGenerationPipeline, "pt": (AutoModelForMaskGeneration,) if is_torch_available() else (), @@ -483,6 +490,8 @@ def pipeline(task: Literal["translation"], model: Optional[Union[str, "PreTraine @overload def pipeline(task: Literal["video-classification"], model: Optional[Union[str, "PreTrainedModel"]] = None, config: Optional[Union[str, PreTrainedConfig]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, image_processor: Optional[Union[str, BaseImageProcessor]] = None, processor: Optional[Union[str, ProcessorMixin]] = None, revision: Optional[str] = None, use_fast: bool = True, token: Optional[Union[str, bool]] = None, device: Optional[Union[int, str, "torch.device"]] = None, device_map: Optional[Union[str, dict[str, Union[int, str]]]] = None, dtype: Optional[Union[str, "torch.dtype"]] = "auto", trust_remote_code: Optional[bool] = None, model_kwargs: Optional[dict[str, Any]] = None, pipeline_class: Optional[Any] = None, **kwargs: Any) -> VideoClassificationPipeline: ... @overload +def pipeline(task: Literal["video-to-text"], model: Optional[Union[str, "PreTrainedModel"]] = None, config: Optional[Union[str, PreTrainedConfig]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, image_processor: Optional[Union[str, BaseImageProcessor]] = None, processor: Optional[Union[str, ProcessorMixin]] = None, revision: Optional[str] = None, use_fast: bool = True, token: Optional[Union[str, bool]] = None, device: Optional[Union[int, str, "torch.device"]] = None, device_map: Optional[Union[str, dict[str, Union[int, str]]]] = None, dtype: Optional[Union[str, "torch.dtype"]] = "auto", trust_remote_code: Optional[bool] = None, model_kwargs: Optional[dict[str, Any]] = None, pipeline_class: Optional[Any] = None, **kwargs: Any) -> VideoToTextPipeline: ... 
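A quick usage sketch of the task registration above, assuming this patch is applied. The video path is a placeholder, and the pinned default checkpoint is the one from this first commit (the second commit switches the default to `microsoft/git-base`):

```python
from transformers import pipeline

# Resolves to VideoToTextPipeline through the new "video-to-text" entry in SUPPORTED_TASKS
captioner = pipeline("video-to-text")

# Or pin the model and revision explicitly, mirroring this commit's registry default
captioner = pipeline("video-to-text", model="ydshieh/vit-gpt2-coco-en", revision="5bebf1e")

print(captioner("path/to/video.mp4"))
# e.g. [{'generated_text': 'a person is setting a table'}]
```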
+@overload def pipeline(task: Literal["visual-question-answering"], model: Optional[Union[str, "PreTrainedModel"]] = None, config: Optional[Union[str, PreTrainedConfig]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, image_processor: Optional[Union[str, BaseImageProcessor]] = None, processor: Optional[Union[str, ProcessorMixin]] = None, revision: Optional[str] = None, use_fast: bool = True, token: Optional[Union[str, bool]] = None, device: Optional[Union[int, str, "torch.device"]] = None, device_map: Optional[Union[str, dict[str, Union[int, str]]]] = None, dtype: Optional[Union[str, "torch.dtype"]] = "auto", trust_remote_code: Optional[bool] = None, model_kwargs: Optional[dict[str, Any]] = None, pipeline_class: Optional[Any] = None, **kwargs: Any) -> VisualQuestionAnsweringPipeline: ... @overload def pipeline(task: Literal["zero-shot-audio-classification"], model: Optional[Union[str, "PreTrainedModel"]] = None, config: Optional[Union[str, PreTrainedConfig]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, image_processor: Optional[Union[str, BaseImageProcessor]] = None, processor: Optional[Union[str, ProcessorMixin]] = None, revision: Optional[str] = None, use_fast: bool = True, token: Optional[Union[str, bool]] = None, device: Optional[Union[int, str, "torch.device"]] = None, device_map: Optional[Union[str, dict[str, Union[int, str]]]] = None, dtype: Optional[Union[str, "torch.dtype"]] = "auto", trust_remote_code: Optional[bool] = None, model_kwargs: Optional[dict[str, Any]] = None, pipeline_class: Optional[Any] = None, **kwargs: Any) -> ZeroShotAudioClassificationPipeline: ... @@ -567,6 +576,7 @@ def pipeline( - `"translation"`: will return a [`TranslationPipeline`]. - `"translation_xx_to_yy"`: will return a [`TranslationPipeline`]. - `"video-classification"`: will return a [`VideoClassificationPipeline`]. + - `"video-to-text"`: will return a [`VideoToTextPipeline`]. - `"visual-question-answering"`: will return a [`VisualQuestionAnsweringPipeline`]. - `"zero-shot-classification"`: will return a [`ZeroShotClassificationPipeline`]. - `"zero-shot-image-classification"`: will return a [`ZeroShotImageClassificationPipeline`]. diff --git a/src/transformers/pipelines/video_to_text.py b/src/transformers/pipelines/video_to_text.py new file mode 100644 index 000000000000..7fe508ed2e48 --- /dev/null +++ b/src/transformers/pipelines/video_to_text.py @@ -0,0 +1,238 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
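The new module guards its PyAV import behind `is_av_available()` and calls `requires_backends(self, "av")` at construction time. A minimal sketch of checking that optional dependency up front (installing PyAV via `pip install av` is an assumption about the user's environment):

```python
from transformers.utils import is_av_available

if not is_av_available():
    # The pipeline decodes videos with PyAV
    raise ImportError("The video-to-text pipeline needs PyAV: pip install av")
```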
+import warnings +from io import BytesIO +from typing import Any, Union, overload + +import httpx + +from ..generation import GenerationConfig +from ..utils import ( + add_end_docstrings, + is_av_available, + is_torch_available, + logging, + requires_backends, +) +from .base import Pipeline, build_pipeline_init_args + + +if is_av_available(): + import av + import numpy as np + + +if is_torch_available(): + import torch + + from ..models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES + +logger = logging.get_logger(__name__) + + +@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True, has_image_processor=True)) +class VideoToTextPipeline(Pipeline): + """ + Video To Text pipeline using a `AutoModelForImageTextToText`. This pipeline predicts a caption for a given video. + + Unless the model you're using explicitly sets these generation parameters in its configuration files + (`generation_config.json`), the following default values will be used: + - max_new_tokens: 256 + + Example: + + ```python + >>> from transformers import pipeline + + >>> captioner = pipeline("video-to-text", model="ydshieh/vit-gpt2-coco-en") + >>> captioner("path/to/video.mp4") + [{'generated_text': 'a person is setting a table'}] + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + This video to text pipeline can currently be loaded from pipeline() using the following task identifier: + "video-to-text". + + See the list of available models on + [huggingface.co/models](https://huggingface.co/models?pipeline_tag=video-to-text). + """ + + _pipeline_calls_generate = True + _load_processor = False + _load_image_processor = True + _load_feature_extractor = False + _load_tokenizer = True + # Make sure the docstring is updated when the default generation config is changed + _default_generation_config = GenerationConfig( + max_new_tokens=256, + ) + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + requires_backends(self, "av") + self.check_model_type(MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES) + + def _sanitize_parameters(self, max_new_tokens=None, generate_kwargs=None, num_frames=None, frame_sampling_rate=None, timeout=None): + forward_params = {} + preprocess_params = {} + + if timeout is not None: + preprocess_params["timeout"] = timeout + if num_frames is not None: + preprocess_params["num_frames"] = num_frames + if frame_sampling_rate is not None: + preprocess_params["frame_sampling_rate"] = frame_sampling_rate + + if max_new_tokens is not None: + forward_params["max_new_tokens"] = max_new_tokens + if generate_kwargs is not None: + if max_new_tokens is not None and "max_new_tokens" in generate_kwargs: + raise ValueError( + "`max_new_tokens` is defined both as an argument and inside `generate_kwargs` argument, please use" + " only 1 version" + ) + forward_params.update(generate_kwargs) + + if self.assistant_model is not None: + forward_params["assistant_model"] = self.assistant_model + if self.assistant_tokenizer is not None: + forward_params["tokenizer"] = self.tokenizer + forward_params["assistant_tokenizer"] = self.assistant_tokenizer + + return preprocess_params, forward_params, {} + + @overload + def __call__(self, inputs: str, **kwargs: Any) -> list[dict[str, Any]]: ... + + @overload + def __call__(self, inputs: list[str], **kwargs: Any) -> list[list[dict[str, Any]]]: ... 
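A hedged call-level sketch of the arguments that `_sanitize_parameters` splits between `preprocess` and `generate`; the model name and video path are illustrative:

```python
from transformers import pipeline

captioner = pipeline("video-to-text", model="ydshieh/vit-gpt2-coco-en")

out = captioner(
    "path/to/video.mp4",               # local path or http(s) URL
    num_frames=16,                     # -> preprocess
    frame_sampling_rate=2,             # -> preprocess
    timeout=10.0,                      # -> preprocess (only used for http(s) inputs)
    max_new_tokens=32,                 # -> generate
    generate_kwargs={"num_beams": 2},  # -> generate
)
# Passing max_new_tokens both directly and inside generate_kwargs raises a ValueError.
```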
+ + def __call__(self, inputs: str | list[str] | None = None, **kwargs): + """ + Generate text captions for the video(s) passed as inputs. + + Args: + inputs (`str`, `list[str]`): + The pipeline handles two types of videos: + + - A string containing a http link pointing to a video + - A string containing a local path to a video + + The pipeline accepts either a single video or a batch of videos, which must then be passed as a string. + Videos in a batch must all be in the same format: all as http links or all as local paths. + max_new_tokens (`int`, *optional*): + The amount of maximum tokens to generate. By default it will use `generate` default. + num_frames (`int`, *optional*, defaults to `self.model.config.num_frames`): + The number of frames sampled from the video to run the generation on. If not provided, will default + to the number of frames specified in the model configuration. + frame_sampling_rate (`int`, *optional*, defaults to 1): + The sampling rate used to select frames from the video. If not provided, will default to 1, i.e. every + frame will be used. + generate_kwargs (`Dict`, *optional*): + Pass it to send all of these arguments directly to `generate` allowing full control of this function. + timeout (`float`, *optional*, defaults to None): + The maximum time in seconds to wait for fetching videos from the web. If None, no timeout is set and + the call may block forever. + + Return: + A list or a list of list of `dict`: Each result comes as a dictionary with the following key: + + - **generated_text** (`str`) -- The generated text. + """ + if "videos" in kwargs: + warnings.warn( + "The `videos` argument has been renamed to `inputs`. In version 5 of Transformers, `videos` will no longer be accepted", + FutureWarning, + ) + inputs = kwargs.pop("videos") + if inputs is None: + raise ValueError("Cannot call the video-to-text pipeline without an inputs argument!") + return super().__call__(inputs, **kwargs) + + def preprocess(self, video, num_frames=None, frame_sampling_rate=1, timeout=None): + if num_frames is None: + # Try to get from model config, otherwise use a default + if hasattr(self.model.config, "num_frames"): + num_frames = self.model.config.num_frames + else: + num_frames = 8 # Default fallback + + if video.startswith("http://") or video.startswith("https://"): + video = BytesIO(httpx.get(video, follow_redirects=True, timeout=timeout).content) + + container = av.open(video) + + start_idx = 0 + end_idx = num_frames * frame_sampling_rate - 1 + indices = np.linspace(start_idx, end_idx, num=num_frames, dtype=np.int64) + + video_frames = read_video_pyav(container, indices) + video_frames = list(video_frames) + + # Process video frames through image processor + # For models that expect a single image, we'll use the first frame or average frames + # For models that support multiple frames, we'll pass all frames + model_inputs = self.image_processor(video_frames, return_tensors="pt") + model_inputs = model_inputs.to(self.dtype) + + # Some models like GIT need input_ids set to None + if self.model.config.model_type == "git": + model_inputs["input_ids"] = None + + return model_inputs + + def _forward(self, model_inputs, **generate_kwargs): + # Git model sets `model_inputs["input_ids"] = None` in `preprocess`. In batch model, the + # pipeline will group them into a list of `None`, which fail `_forward`. Avoid this by checking it first. 
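A standalone sketch of the frame-index selection done in `preprocess` above: a window of `num_frames * frame_sampling_rate` frames is spanned and `num_frames` evenly spaced indices are taken from it.

```python
import numpy as np

num_frames, frame_sampling_rate = 8, 4
end_idx = num_frames * frame_sampling_rate - 1       # 31
indices = np.linspace(0, end_idx, num=num_frames, dtype=np.int64)
print(indices.tolist())                              # [0, 4, 8, 13, 17, 22, 26, 31]
```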
+ if ( + "input_ids" in model_inputs + and isinstance(model_inputs["input_ids"], list) + and all(x is None for x in model_inputs["input_ids"]) + ): + model_inputs["input_ids"] = None + + # User-defined `generation_config` passed to the pipeline call take precedence + if "generation_config" not in generate_kwargs: + generate_kwargs["generation_config"] = self.generation_config + + inputs = model_inputs.pop(self.model.main_input_name) + model_outputs = self.model.generate(inputs, **model_inputs, **generate_kwargs) + return model_outputs + + def postprocess(self, model_outputs): + records = [] + for output_ids in model_outputs: + record = { + "generated_text": self.tokenizer.decode( + output_ids, + skip_special_tokens=True, + ) + } + records.append(record) + return records + + +def read_video_pyav(container, indices): + frames = [] + container.seek(0) + start_index = indices[0] + end_index = indices[-1] + for i, frame in enumerate(container.decode(video=0)): + if i > end_index: + break + if i >= start_index and i in indices: + frames.append(frame) + return np.stack([x.to_ndarray(format="rgb24") for x in frames]) + diff --git a/tests/pipelines/test_pipelines_video_to_text.py b/tests/pipelines/test_pipelines_video_to_text.py new file mode 100644 index 000000000000..5b321394af75 --- /dev/null +++ b/tests/pipelines/test_pipelines_video_to_text.py @@ -0,0 +1,127 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from huggingface_hub import hf_hub_download + +from transformers import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING, VideoMAEImageProcessor +from transformers.pipelines import VideoToTextPipeline, pipeline +from transformers.testing_utils import ( + is_pipeline_test, + nested_simplify, + require_av, + require_torch, + require_vision, +) + +from .test_pipelines_common import ANY + + +@is_pipeline_test +@require_torch +@require_vision +@require_av +class VideoToTextPipelineTests(unittest.TestCase): + model_mapping = MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING + example_video_filepath = None + + @classmethod + def _load_dataset(cls): + # Lazy loading of the dataset. Because it is a class method, it will only be loaded once per pytest process. 
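A self-contained PyAV sketch mirroring `read_video_pyav` from this first commit: the container is decoded once and only the frames whose index appears in `indices` are kept (the file name is a placeholder):

```python
import av
import numpy as np


def read_selected_frames(path, indices):
    container = av.open(path)
    container.seek(0)
    frames = []
    for i, frame in enumerate(container.decode(video=0)):
        if i > indices[-1]:
            break
        if i in indices:
            frames.append(frame.to_ndarray(format="rgb24"))
    return np.stack(frames)


clip = read_selected_frames("archery.mp4", np.array([0, 4, 8, 12]))
print(clip.shape)  # (4, height, width, 3)
```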
+ if cls.example_video_filepath is None: + cls.example_video_filepath = hf_hub_download( + repo_id="nateraw/video-demo", filename="archery.mp4", repo_type="dataset" + ) + + def get_test_pipeline( + self, + model, + tokenizer=None, + image_processor=None, + feature_extractor=None, + processor=None, + dtype="float32", + ): + self._load_dataset() + video_to_text = VideoToTextPipeline( + model=model, + tokenizer=tokenizer, + feature_extractor=feature_extractor, + image_processor=image_processor, + processor=processor, + dtype=dtype, + max_new_tokens=20, + ) + examples = [ + self.example_video_filepath, + # TODO: re-enable this once we have a stable hub solution for CI + # "https://huggingface.co/datasets/nateraw/video-demo/resolve/main/archery.mp4", + ] + return video_to_text, examples + + def run_pipeline_test(self, video_to_text, examples): + for example in examples: + outputs = video_to_text(example) + + self.assertEqual( + outputs, + [ + {"generated_text": ANY(str)}, + ], + ) + + @require_torch + def test_small_model_pt(self): + small_model = "hf-internal-testing/tiny-random-vit-gpt2" + small_image_processor = VideoMAEImageProcessor( + size={"shortest_edge": 10}, crop_size={"height": 10, "width": 10} + ) + video_to_text = pipeline( + "video-to-text", model=small_model, image_processor=small_image_processor, frame_sampling_rate=4, max_new_tokens=19 + ) + + video_file_path = hf_hub_download(repo_id="nateraw/video-demo", filename="archery.mp4", repo_type="dataset") + output = video_to_text(video_file_path) + self.assertEqual( + nested_simplify(output, decimals=4), + [ + { + "generated_text": "growthgrowthgrowthgrowthgrowthgrowthgrowthgrowthgrowthgrowthgrowthgrowthgrowthgrowthgrowthgrowthgrowthGOGO" + }, + ], + ) + + outputs = video_to_text( + [ + video_file_path, + video_file_path, + ], + ) + self.assertEqual( + nested_simplify(outputs, decimals=4), + [ + [ + { + "generated_text": "growthgrowthgrowthgrowthgrowthgrowthgrowthgrowthgrowthgrowthgrowthgrowthgrowthgrowthgrowthgrowthgrowthGOGO" + } + ], + [ + { + "generated_text": "growthgrowthgrowthgrowthgrowthgrowthgrowthgrowthgrowthgrowthgrowthgrowthgrowthgrowthgrowthgrowthgrowthGOGO" + } + ], + ], + ) + From 1b706be2a5a7a6f4ef674d74758ee46a862f56f8 Mon Sep 17 00:00:00 2001 From: mkorn1 Date: Wed, 26 Nov 2025 12:32:26 -0600 Subject: [PATCH 2/2] Add video-to-text pipeline with enhanced features --- src/transformers/pipelines/__init__.py | 2 +- src/transformers/pipelines/video_to_text.py | 172 ++++++++++++++---- .../pipelines/test_pipelines_video_to_text.py | 117 ++++++++++++ 3 files changed, 257 insertions(+), 34 deletions(-) diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py index b39ce1cd2928..5a718e82100b 100755 --- a/src/transformers/pipelines/__init__.py +++ b/src/transformers/pipelines/__init__.py @@ -314,7 +314,7 @@ "video-to-text": { "impl": VideoToTextPipeline, "pt": (AutoModelForImageTextToText,) if is_torch_available() else (), - "default": {"model": ("ydshieh/vit-gpt2-coco-en", "5bebf1e")}, + "default": {"model": ("microsoft/git-base", "main")}, "type": "video", }, "mask-generation": { diff --git a/src/transformers/pipelines/video_to_text.py b/src/transformers/pipelines/video_to_text.py index 7fe508ed2e48..2a05955ad243 100644 --- a/src/transformers/pipelines/video_to_text.py +++ b/src/transformers/pipelines/video_to_text.py @@ -32,7 +32,6 @@ import av import numpy as np - if is_torch_available(): import torch @@ -84,7 +83,7 @@ def __init__(self, *args, **kwargs): requires_backends(self, "av") 
self.check_model_type(MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES) - def _sanitize_parameters(self, max_new_tokens=None, generate_kwargs=None, num_frames=None, frame_sampling_rate=None, timeout=None): + def _sanitize_parameters(self, max_new_tokens=None, generate_kwargs=None, num_frames=None, frame_sampling_rate=None, timeout=None, system_prompt=None): forward_params = {} preprocess_params = {} @@ -97,6 +96,8 @@ def _sanitize_parameters(self, max_new_tokens=None, generate_kwargs=None, num_fr if max_new_tokens is not None: forward_params["max_new_tokens"] = max_new_tokens + if system_prompt is not None: + forward_params["system_prompt"] = system_prompt if generate_kwargs is not None: if max_new_tokens is not None and "max_new_tokens" in generate_kwargs: raise ValueError( @@ -134,14 +135,17 @@ def __call__(self, inputs: str | list[str] | None = None, **kwargs): Videos in a batch must all be in the same format: all as http links or all as local paths. max_new_tokens (`int`, *optional*): The amount of maximum tokens to generate. By default it will use `generate` default. - num_frames (`int`, *optional*, defaults to `self.model.config.num_frames`): - The number of frames sampled from the video to run the generation on. If not provided, will default - to the number of frames specified in the model configuration. + num_frames (`int`, *optional*): + The number of frames sampled from the video to run the generation on. If not provided, will be + calculated as a function of video duration (1 frame per second, min 8, max 128). If video duration + is unavailable, will default to the number of frames specified in the model configuration. frame_sampling_rate (`int`, *optional*, defaults to 1): - The sampling rate used to select frames from the video. If not provided, will default to 1, i.e. every - frame will be used. + Currently unused - frames are time-spaced based on video duration. generate_kwargs (`Dict`, *optional*): Pass it to send all of these arguments directly to `generate` allowing full control of this function. + system_prompt (`str`, *optional*): + A system prompt to guide the model's generation. This will be tokenized and passed to the model + to influence the style and detail of the generated description. timeout (`float`, *optional*, defaults to None): The maximum time in seconds to wait for fetching videos from the web. If None, no timeout is set and the call may block forever. 
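The duration-based default described above reduces to a one-line rule. A short sketch of that rule together with the new `system_prompt` call pattern; the model name and video path are illustrative:

```python
from transformers import pipeline


def default_num_frames(duration_s: float) -> int:
    # 1 frame per second, clamped to [8, 128]
    return max(8, min(128, int(duration_s)))


print(default_num_frames(3.0), default_num_frames(45.0), default_num_frames(600.0))  # 8 45 128

captioner = pipeline("video-to-text", model="microsoft/git-base")
out = captioner(
    "path/to/video.mp4",                             # placeholder path
    system_prompt="Describe this video in detail.",  # steers the generated caption
    max_new_tokens=64,
)
print(out[0]["generated_text"])
```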
@@ -162,29 +166,72 @@ def __call__(self, inputs: str | list[str] | None = None, **kwargs): return super().__call__(inputs, **kwargs) def preprocess(self, video, num_frames=None, frame_sampling_rate=1, timeout=None): - if num_frames is None: - # Try to get from model config, otherwise use a default - if hasattr(self.model.config, "num_frames"): - num_frames = self.model.config.num_frames - else: - num_frames = 8 # Default fallback - if video.startswith("http://") or video.startswith("https://"): video = BytesIO(httpx.get(video, follow_redirects=True, timeout=timeout).content) container = av.open(video) - - start_idx = 0 - end_idx = num_frames * frame_sampling_rate - 1 - indices = np.linspace(start_idx, end_idx, num=num_frames, dtype=np.int64) + + # Get video metadata for logging + video_stream = container.streams.video[0] + total_frames = video_stream.frames if video_stream.frames else 0 + fps = float(video_stream.average_rate) if video_stream.average_rate else 0 + duration = container.duration / av.time_base if container.duration else 0 + + # Calculate num_frames as a function of video length + # Default: 1 frame per second, minimum 8, maximum 128 + if num_frames is None: + if duration > 0: + # 1 frame per second, with min/max bounds + num_frames = max(8, min(128, int(duration))) + else: + # Fallback: try to get from model config, otherwise use default + if hasattr(self.model.config, "num_frames"): + num_frames = self.model.config.num_frames + else: + num_frames = 64 # Default fallback + + logger.info(f"Video metadata: duration={duration:.2f}s, fps={fps:.2f}, total_frames={total_frames}") + logger.info(f"Frame selection: num_frames={num_frames} (calculated from duration)") + + # Use time-spaced frames (time-based sampling instead of frame-based) + # Sample frames evenly spaced in time + if duration > 0 and fps > 0: + # Calculate time points evenly spaced across the video duration + # Use endpoint=True to include the last frame + time_points = np.linspace(0, duration, num=num_frames, endpoint=True) + + # Convert time points to frame indices + indices = (time_points * fps).astype(np.int64) + # Ensure indices don't exceed total frames + if total_frames > 0: + indices = np.clip(indices, 0, total_frames - 1) + # Remove duplicates and sort to maintain temporal order + indices = np.unique(indices) + logger.info(f"Time-spaced sampling selected {len(indices)} frame indices: {indices.tolist()}") + else: + # Fallback to frame-based linear sampling if duration/fps unavailable + start_idx = 0 + end_idx = total_frames - 1 if total_frames > 0 else num_frames - 1 + indices = np.linspace(start_idx, end_idx, num=num_frames, dtype=np.int64) + logger.info(f"Frame-based linear sampling selected {len(indices)} frame indices: {indices.tolist()}") + + # Log temporal gaps between selected frames + if len(indices) > 1 and fps > 0: + gaps = [] + for i in range(len(indices) - 1): + gap_frames = indices[i + 1] - indices[i] + gap_seconds = gap_frames / fps if fps > 0 else 0 + gaps.append(f"{gap_frames} frames ({gap_seconds:.2f}s)") + logger.info(f"Temporal gaps between selected frames: {gaps}") video_frames = read_video_pyav(container, indices) video_frames = list(video_frames) + logger.info(f"Extracted {len(video_frames)} frames") # Process video frames through image processor - # For models that expect a single image, we'll use the first frame or average frames - # For models that support multiple frames, we'll pass all frames + logger.info(f"Processing {len(video_frames)} individual frames") model_inputs = 
self.image_processor(video_frames, return_tensors="pt") + model_inputs = model_inputs.to(self.dtype) # Some models like GIT need input_ids set to None @@ -203,6 +250,33 @@ def _forward(self, model_inputs, **generate_kwargs): ): model_inputs["input_ids"] = None + # Handle system prompt if provided + system_prompt = generate_kwargs.pop("system_prompt", None) + if system_prompt is not None: + # Tokenize the system prompt + if self.model.config.model_type == "git": + # For GIT models, we can pass the prompt as input_ids + # Tokenize and add to model_inputs + prompt_ids = self.tokenizer(system_prompt, return_tensors="pt", add_special_tokens=True) + prompt_ids = prompt_ids["input_ids"].to(self.device) + # If input_ids is None, set it to the prompt; otherwise prepend + if model_inputs.get("input_ids") is None: + model_inputs["input_ids"] = prompt_ids + else: + # Prepend system prompt to existing input_ids + if isinstance(model_inputs["input_ids"], torch.Tensor): + model_inputs["input_ids"] = torch.cat([prompt_ids, model_inputs["input_ids"]], dim=1) + else: + # For other models, add as input_ids or pass through generate_kwargs + prompt_ids = self.tokenizer(system_prompt, return_tensors="pt", add_special_tokens=True) + prompt_ids = prompt_ids["input_ids"].to(self.device) + if "input_ids" not in model_inputs or model_inputs["input_ids"] is None: + model_inputs["input_ids"] = prompt_ids + else: + # Prepend system prompt to existing input_ids + if isinstance(model_inputs["input_ids"], torch.Tensor): + model_inputs["input_ids"] = torch.cat([prompt_ids, model_inputs["input_ids"]], dim=1) + # User-defined `generation_config` passed to the pipeline call take precedence if "generation_config" not in generate_kwargs: generate_kwargs["generation_config"] = self.generation_config @@ -213,26 +287,58 @@ def _forward(self, model_inputs, **generate_kwargs): def postprocess(self, model_outputs): records = [] - for output_ids in model_outputs: - record = { - "generated_text": self.tokenizer.decode( - output_ids, - skip_special_tokens=True, - ) - } - records.append(record) + seen_texts = set() + all_texts = [] + + logger.info(f"Postprocessing {len(model_outputs)} model outputs") + + for idx, output_ids in enumerate(model_outputs): + text = self.tokenizer.decode(output_ids, skip_special_tokens=True) + all_texts.append(text) + logger.info(f"Generated text #{idx + 1}: '{text}'") + + # Deduplicate: only add if we haven't seen this text before + if text not in seen_texts: + seen_texts.add(text) + record = {"generated_text": text} + records.append(record) + logger.debug(f"Added unique text: '{text}'") + else: + logger.debug(f"Deduplicated duplicate text: '{text}'") + + logger.info(f"Total generated texts: {len(all_texts)}, Unique texts after deduplication: {len(records)}") + if len(all_texts) > len(records): + duplicates = [t for t in all_texts if all_texts.count(t) > 1] + logger.info(f"Duplicated texts: {set(duplicates)}") + return records def read_video_pyav(container, indices): + """ + Read frames from video container in the order specified by indices. + Maintains temporal order by reading frames in the exact order of the indices array. 
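A plain-Python sketch of the deduplication added to `postprocess`: when several decoded outputs produce the same caption, only one record survives (the caption strings are invented for illustration):

```python
texts = [
    "a person shoots an arrow at a target",
    "a person shoots an arrow at a target",
    "a target standing in a field",
]

seen, records = set(), []
for text in texts:
    if text not in seen:
        seen.add(text)
        records.append({"generated_text": text})

print(records)
# [{'generated_text': 'a person shoots an arrow at a target'},
#  {'generated_text': 'a target standing in a field'}]
```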
+ """ + # Ensure indices are sorted to maintain temporal order + sorted_indices = np.sort(indices) frames = [] container.seek(0) - start_index = indices[0] - end_index = indices[-1] + + # Create a set for fast lookup, but iterate in sorted order + indices_set = set(sorted_indices) + frame_dict = {} + + # Read all needed frames in one pass for i, frame in enumerate(container.decode(video=0)): - if i > end_index: + if i > sorted_indices[-1]: break - if i >= start_index and i in indices: - frames.append(frame) + if i in indices_set: + frame_dict[i] = frame + + # Extract frames in the order specified by sorted_indices + for idx in sorted_indices: + if idx in frame_dict: + frames.append(frame_dict[idx]) + return np.stack([x.to_ndarray(format="rgb24") for x in frames]) diff --git a/tests/pipelines/test_pipelines_video_to_text.py b/tests/pipelines/test_pipelines_video_to_text.py index 5b321394af75..0932beb9bcf1 100644 --- a/tests/pipelines/test_pipelines_video_to_text.py +++ b/tests/pipelines/test_pipelines_video_to_text.py @@ -125,3 +125,120 @@ def test_small_model_pt(self): ], ) + @require_torch + def test_small_model_pt_with_num_frames(self): + """Test that num_frames parameter works correctly.""" + small_model = "hf-internal-testing/tiny-random-vit-gpt2" + small_image_processor = VideoMAEImageProcessor( + size={"shortest_edge": 10}, crop_size={"height": 10, "width": 10} + ) + video_to_text = pipeline( + "video-to-text", model=small_model, image_processor=small_image_processor, max_new_tokens=19 + ) + + video_file_path = hf_hub_download(repo_id="nateraw/video-demo", filename="archery.mp4", repo_type="dataset") + + # Test with explicit num_frames + output = video_to_text(video_file_path, num_frames=16) + self.assertIsInstance(output, list) + self.assertGreater(len(output), 0) + self.assertIn("generated_text", output[0]) + + @require_torch + def test_small_model_pt_with_system_prompt(self): + """Test that system_prompt parameter works correctly.""" + small_model = "hf-internal-testing/tiny-random-vit-gpt2" + small_image_processor = VideoMAEImageProcessor( + size={"shortest_edge": 10}, crop_size={"height": 10, "width": 10} + ) + video_to_text = pipeline( + "video-to-text", model=small_model, image_processor=small_image_processor, max_new_tokens=19 + ) + + video_file_path = hf_hub_download(repo_id="nateraw/video-demo", filename="archery.mp4", repo_type="dataset") + + # Test with system_prompt + system_prompt = "Describe this video in detail." 
+ output = video_to_text(video_file_path, system_prompt=system_prompt) + self.assertIsInstance(output, list) + self.assertGreater(len(output), 0) + self.assertIn("generated_text", output[0]) + self.assertIsInstance(output[0]["generated_text"], str) + + @require_torch + def test_small_model_pt_batch_processing(self): + """Test batch processing with multiple videos.""" + small_model = "hf-internal-testing/tiny-random-vit-gpt2" + small_image_processor = VideoMAEImageProcessor( + size={"shortest_edge": 10}, crop_size={"height": 10, "width": 10} + ) + video_to_text = pipeline( + "video-to-text", model=small_model, image_processor=small_image_processor, max_new_tokens=19 + ) + + video_file_path = hf_hub_download(repo_id="nateraw/video-demo", filename="archery.mp4", repo_type="dataset") + + # Test batch processing + outputs = video_to_text([video_file_path, video_file_path]) + self.assertIsInstance(outputs, list) + self.assertEqual(len(outputs), 2) + self.assertIsInstance(outputs[0], list) + self.assertIsInstance(outputs[1], list) + self.assertGreater(len(outputs[0]), 0) + self.assertGreater(len(outputs[1]), 0) + + @require_torch + def test_small_model_pt_with_generate_kwargs(self): + """Test that generate_kwargs parameter works correctly.""" + small_model = "hf-internal-testing/tiny-random-vit-gpt2" + small_image_processor = VideoMAEImageProcessor( + size={"shortest_edge": 10}, crop_size={"height": 10, "width": 10} + ) + video_to_text = pipeline( + "video-to-text", model=small_model, image_processor=small_image_processor + ) + + video_file_path = hf_hub_download(repo_id="nateraw/video-demo", filename="archery.mp4", repo_type="dataset") + + # Test with generate_kwargs + output = video_to_text(video_file_path, generate_kwargs={"max_new_tokens": 10}) + self.assertIsInstance(output, list) + self.assertGreater(len(output), 0) + self.assertIn("generated_text", output[0]) + + @require_torch + def test_small_model_pt_max_new_tokens_conflict(self): + """Test that providing max_new_tokens both as argument and in generate_kwargs raises an error.""" + small_model = "hf-internal-testing/tiny-random-vit-gpt2" + small_image_processor = VideoMAEImageProcessor( + size={"shortest_edge": 10}, crop_size={"height": 10, "width": 10} + ) + video_to_text = pipeline( + "video-to-text", model=small_model, image_processor=small_image_processor + ) + + video_file_path = hf_hub_download(repo_id="nateraw/video-demo", filename="archery.mp4", repo_type="dataset") + + # Test that providing max_new_tokens in both places raises ValueError + with self.assertRaises(ValueError): + video_to_text(video_file_path, max_new_tokens=10, generate_kwargs={"max_new_tokens": 20}) + + @require_torch + def test_small_model_pt_frame_sampling_rate(self): + """Test that frame_sampling_rate parameter is accepted (even if currently unused).""" + small_model = "hf-internal-testing/tiny-random-vit-gpt2" + small_image_processor = VideoMAEImageProcessor( + size={"shortest_edge": 10}, crop_size={"height": 10, "width": 10} + ) + video_to_text = pipeline( + "video-to-text", model=small_model, image_processor=small_image_processor, max_new_tokens=19 + ) + + video_file_path = hf_hub_download(repo_id="nateraw/video-demo", filename="archery.mp4", repo_type="dataset") + + # Test that frame_sampling_rate doesn't cause errors + output = video_to_text(video_file_path, frame_sampling_rate=2) + self.assertIsInstance(output, list) + self.assertGreater(len(output), 0) + self.assertIn("generated_text", output[0]) +
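A hedged sketch of exercising the new tests locally. It assumes `pytest`, `torch`, `av`, and Hub access for the tiny checkpoint and the demo clip, and mirrors `test_small_model_pt_with_num_frames`:

```python
# Command-line form (assumption: run from the repository root):
#   pytest tests/pipelines/test_pipelines_video_to_text.py -q
#
# Equivalent inline check:
from huggingface_hub import hf_hub_download
from transformers import VideoMAEImageProcessor, pipeline

video = hf_hub_download(repo_id="nateraw/video-demo", filename="archery.mp4", repo_type="dataset")
image_processor = VideoMAEImageProcessor(size={"shortest_edge": 10}, crop_size={"height": 10, "width": 10})

captioner = pipeline(
    "video-to-text",
    model="hf-internal-testing/tiny-random-vit-gpt2",
    image_processor=image_processor,
    max_new_tokens=19,
)

out = captioner(video, num_frames=16)
assert isinstance(out[0]["generated_text"], str)
```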