From 7203626970ee4197b37a59d15933eb6545e7f68a Mon Sep 17 00:00:00 2001
From: Jaseel Muhammad <jaseel.muhammad@mbzuai.ac.ae>
Date: Thu, 16 Apr 2026 08:01:20 +0000
Subject: [PATCH 1/7] [Core] Add PyAV video backend for concurrent video
 decoding

Signed-off-by: Jaseel Muhammad <jaseel.muhammad@mbzuai.ac.ae>
---
 tests/multimodal/test_video.py | 109 ++++++++++++++++
 vllm/envs.py                   |   5 +
 vllm/multimodal/video.py       | 221 +++++++++++++++++++++++++++++++++
 3 files changed, 335 insertions(+)

diff --git a/tests/multimodal/test_video.py b/tests/multimodal/test_video.py
index 3ece384348bc..10a87c7bcc09 100644
--- a/tests/multimodal/test_video.py
+++ b/tests/multimodal/test_video.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import importlib.util
 from pathlib import Path
 
 import numpy as np
@@ -15,6 +16,8 @@
 
 from .utils import create_video_from_image
 
+_has_pyav = importlib.util.find_spec("av") is not None
+
 pytestmark = pytest.mark.cpu_test
 
 ASSETS_DIR = Path(__file__).parent / "assets"
@@ -310,6 +313,75 @@ def dummy_video_path(tmp_path):
     return video_path
 
 
+# ============================================================================
+# PyAV Backend Tests
+# ============================================================================
+
+
+@pytest.mark.skipif(not _has_pyav, reason="PyAV not installed")
+def test_pyav_backend_loads_frames(dummy_video_path, monkeypatch: pytest.MonkeyPatch):
+    """Test that the pyav backend can load frames from a valid video."""
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "pyav")
+
+        with open(dummy_video_path, "rb") as f:
+            video_data = f.read()
+
+        loader = VIDEO_LOADER_REGISTRY.load("pyav")
+        frames, metadata = loader.load_bytes(video_data, num_frames=8)
+
+        assert frames.ndim == 4
+        assert frames.shape[3] == 3  # RGB
+        assert frames.shape[0] == 8
+        assert frames.shape[0] == len(metadata["frames_indices"])
+        assert metadata["video_backend"] == "pyav"
+        assert "total_num_frames" in metadata
+        assert "fps" in metadata
+        assert "duration" in metadata
+
+
+@pytest.mark.skipif(not _has_pyav, reason="PyAV not installed")
+def test_pyav_backend_seek_path(dummy_video_path, monkeypatch: pytest.MonkeyPatch):
+    """Test that the PyAV seek path works by lowering the threshold."""
+    import vllm.multimodal.video as video_mod
+
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "pyav")
+        m.setattr(video_mod, "_PYAV_SEEK_FRAME_THRESHOLD", 0)
+
+        with open(dummy_video_path, "rb") as f:
+            video_data = f.read()
+
+        loader = VIDEO_LOADER_REGISTRY.load("pyav")
+        frames, metadata = loader.load_bytes(video_data, num_frames=8)
+
+        assert frames.ndim == 4
+        assert frames.shape[3] == 3  # RGB
+        assert frames.shape[0] == 8
+        assert frames.shape[0] == len(metadata["frames_indices"])
+
+
+@pytest.mark.skipif(not _has_pyav, reason="PyAV not installed")
+def test_pyav_dynamic_backend_loads_frames(
+    dummy_video_path, monkeypatch: pytest.MonkeyPatch
+):
+    """Test that the pyav_dynamic backend can load frames."""
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "pyav_dynamic")
+
+        with open(dummy_video_path, "rb") as f:
+            video_data = f.read()
+
+        loader = VIDEO_LOADER_REGISTRY.load("pyav_dynamic")
+        frames, metadata = loader.load_bytes(video_data, fps=2, max_duration=10)
+
+        assert frames.ndim == 4
+        assert frames.shape[3] == 3  # RGB
+        assert frames.shape[0] > 0
+        assert frames.shape[0] == len(metadata["frames_indices"])
+        assert metadata["video_backend"] == "pyav_dynamic"
+
+
 @pytest.mark.parametrize(
     "backend, kwargs, expected_num_frames",
     [
@@ -349,6 +421,43 @@ def dummy_video_path(tmp_path):
             119,
             id="molmo2-fps",
         ),
+        # pyav: same sampling logic as opencv
+        pytest.param(
+            "pyav",
+            {"num_frames": 32},
+            32,
+            id="pyav-num_frames",
+            marks=pytest.mark.skipif(not _has_pyav, reason="PyAV not installed"),
+        ),
+        pytest.param(
+            "pyav",
+            {"fps": 2},
+            120,
+            id="pyav-fps",
+            marks=pytest.mark.skipif(not _has_pyav, reason="PyAV not installed"),
+        ),
+        pytest.param(
+            "pyav",
+            {"num_frames": 500, "fps": 2},
+            120,
+            id="pyav-num_frames_wins_fps",
+            marks=pytest.mark.skipif(not _has_pyav, reason="PyAV not installed"),
+        ),
+        # pyav_dynamic: same sampling logic as opencv_dynamic
+        pytest.param(
+            "pyav_dynamic",
+            {"fps": 1, "max_duration": 60},
+            60,
+            id="pyav_dynamic-within_max_duration",
+            marks=pytest.mark.skipif(not _has_pyav, reason="PyAV not installed"),
+        ),
+        pytest.param(
+            "pyav_dynamic",
+            {"fps": 2, "max_duration": 30},
+            60,
+            id="pyav_dynamic-exceeds_max_duration",
+            marks=pytest.mark.skipif(not _has_pyav, reason="PyAV not installed"),
+        ),
     ],
 )
 def test_video_loader_frames_sampling(
diff --git a/vllm/envs.py b/vllm/envs.py
index 8ed1d33434cb..e8f47f7b13b5 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -831,6 +831,11 @@ def _get_or_set_default() -> str:
     ),
     # Backend for Video IO
     # - "opencv": Default backend that uses OpenCV stream buffered backend.
+    # - "pyav": PyAV backend using in-process FFmpeg bindings.
+    #   Uses adaptive scan/seek: sequential scan for short videos,
+    #   per-frame seeking for long videos to reduce GIL hold time.
+    # - "pyav_dynamic": Dynamic-sampling variant of the PyAV backend
+    #   (mirrors "opencv_dynamic" sampling logic).
     # - "identity": Returns raw video bytes for model processor to handle.
     #
     # Custom backend implementations can be registered
diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index 90102151423f..264c6b5ac97c 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -19,6 +19,11 @@
     cv2 = PlaceholderModule("cv2")
     vr = PlaceholderModule("cv2").placeholder_attr("videoio_registry")
 
+try:
+    import av
+except ImportError:
+    av = PlaceholderModule("av")  # type: ignore[assignment]
+
 
 logger = init_logger(__name__)
 
@@ -959,3 +964,219 @@ def load_bytes(
             valid_frame_indices=valid_frame_indices,
         )
         return frames, metadata
+
+
+# Threshold for switching from sequential scan to per-frame seeking
+# in PyAV backends.  At 30 fps, 5000 frames ≈ 3 minutes.  Scan holds
+# the GIL continuously; seek releases it between frames, which is
+# critical for concurrent serving with long videos.
+_PYAV_SEEK_FRAME_THRESHOLD = 5000
+
+
+class PyAVVideoBackendMixin:
+    """Shared utilities for PyAV-based video backends.
+
+    Decodes video using the PyAV library (Python bindings for FFmpeg).
+    Short videos are decoded via sequential scan; long videos use
+    per-frame seeking to release the GIL between frames and allow
+    concurrent request processing.
+    """
+
+    _video_backend_name: str
+
+    @staticmethod
+    def _get_metadata(
+        container: "av.container.InputContainer",
+    ) -> VideoSourceMetadata:
+        """Extract metadata from an open PyAV container."""
+        if not container.streams.video:
+            raise ValueError("No video streams found in container")
+        stream = container.streams.video[0]
+        total_frames = stream.frames or 0
+        fps = float(stream.average_rate) if stream.average_rate else 0.0
+        duration = float(stream.duration * stream.time_base) if stream.duration else 0.0
+
+        if total_frames == 0 and duration > 0 and fps > 0:
+            total_frames = int(duration * fps)
+
+        return VideoSourceMetadata(total_frames, fps, duration)
+
+    @staticmethod
+    def _decode_frames_scan(
+        container: "av.container.InputContainer",
+        frame_indices: list[int],
+    ) -> tuple[npt.NDArray, list[int]]:
+        """Decode frames by sequential scan (best for short videos)."""
+        target_set = set(frame_indices)
+        max_idx = max(frame_indices)
+        frames_list: list[npt.NDArray] = []
+        valid_indices: list[int] = []
+
+        container.streams.video[0].thread_type = "AUTO"
+        for idx, frame in enumerate(container.decode(video=0)):
+            if idx > max_idx:
+                break
+            if idx in target_set:
+                frames_list.append(frame.to_ndarray(format="rgb24"))
+                valid_indices.append(idx)
+
+        return np.stack(frames_list), valid_indices
+
+    @staticmethod
+    def _decode_frames_seek(
+        container: "av.container.InputContainer",
+        frame_indices: list[int],
+        fps: float,
+        duration: float,
+    ) -> tuple[npt.NDArray, list[int]]:
+        """Decode frames by seeking to timestamps (best for long videos).
+
+        Releases the GIL between frames (via seek + decode boundaries),
+        allowing other requests to make progress under concurrency.
+        """
+        stream = container.streams.video[0]
+        # SLICE parallelizes within a single frame without the
+        # one-frame-per-thread latency penalty of FRAME threading.
+        stream.thread_type = "SLICE"
+        time_base = stream.time_base
+        frames_list: list[npt.NDArray] = []
+        valid_indices: list[int] = []
+
+        frame_interval = 1.0 / fps if fps > 0 else 0.1
+        max_ts = max(0.0, duration - frame_interval) if duration > 0 else float("inf")
+
+        for idx in frame_indices:
+            ts = min(idx / fps, max_ts) if fps > 0 else 0.0
+            pts = int(ts / time_base)
+            container.seek(pts, stream=stream)
+            frame = next(container.decode(video=0), None)
+            if frame is not None:
+                frames_list.append(frame.to_ndarray(format="rgb24"))
+                valid_indices.append(idx)
+
+        return np.stack(frames_list), valid_indices
+
+    @classmethod
+    def _decode_frames(
+        cls,
+        container: "av.container.InputContainer",
+        frame_indices: list[int],
+        source: VideoSourceMetadata,
+        seek_threshold: int = _PYAV_SEEK_FRAME_THRESHOLD,
+    ) -> tuple[npt.NDArray, list[int]]:
+        """Decode frames, choosing scan or seek based on video length.
+
+        Scan holds the GIL continuously — fine for short videos.
+        Seek releases the GIL between frames, critical for concurrency
+        with long videos.
+        """
+        if source.total_frames_num >= seek_threshold and source.original_fps > 0:
+            return cls._decode_frames_seek(
+                container, frame_indices, source.original_fps, source.duration
+            )
+        return cls._decode_frames_scan(container, frame_indices)
+
+    @classmethod
+    def _prepare_source(
+        cls,
+        source: VideoSourceMetadata,
+    ) -> VideoSourceMetadata:
+        return source
+
+    @classmethod
+    def _load_bytes_impl(
+        cls,
+        data: bytes,
+        num_frames: int,
+        fps: int,
+        max_duration: int,
+        seek_threshold: int = _PYAV_SEEK_FRAME_THRESHOLD,
+    ) -> tuple[npt.NDArray, dict[str, Any]]:
+        """Shared implementation for all PyAV-based load_bytes methods."""
+        with av.open(BytesIO(data)) as container:
+            raw_source = cls._get_metadata(container)
+            source = cls._prepare_source(raw_source)
+
+            frame_idx = cls.compute_frames_index_to_sample(  # type: ignore[attr-defined]
+                source=source,
+                target=VideoTargetMetadata(num_frames, fps, max_duration),
+            )
+            frames, valid_frame_indices = cls._decode_frames(
+                container, frame_idx, source, seek_threshold
+            )
+
+        if len(valid_frame_indices) < len(frame_idx):
+            logger.warning(
+                "pyav video loading: expected %d frames but got %d.",
+                len(frame_idx),
+                len(valid_frame_indices),
+            )
+        metadata = cls.create_hf_metadata(  # type: ignore[attr-defined]
+            source=source,
+            video_backend=cls._video_backend_name,
+            valid_frame_indices=valid_frame_indices,
+        )
+        return frames, metadata
+
+
+@VIDEO_LOADER_REGISTRY.register("pyav")
+class PyAVVideoBackend(VideoLoader, PyAVVideoBackendMixin):
+    """Video backend using PyAV (in-process FFmpeg bindings)."""
+
+    _video_backend_name = "pyav"
+
+    compute_frames_index_to_sample = OpenCVVideoBackend.compute_frames_index_to_sample
+
+    @classmethod
+    def load_bytes(
+        cls,
+        data: bytes,
+        num_frames: int = -1,
+        fps: int = -1,
+        max_duration: int = 300,
+        seek_threshold: int = _PYAV_SEEK_FRAME_THRESHOLD,
+        **kwargs,
+    ) -> tuple[npt.NDArray, dict[str, Any]]:
+        return cls._load_bytes_impl(data, num_frames, fps, max_duration, seek_threshold)
+
+
+@VIDEO_LOADER_REGISTRY.register("pyav_dynamic")
+class PyAVDynamicVideoBackend(VideoLoader, PyAVVideoBackendMixin):
+    """Dynamic-sampling PyAV backend (mirrors OpenCVDynamicVideoBackend)."""
+
+    _video_backend_name = "pyav_dynamic"
+
+    @classmethod
+    def _prepare_source(
+        cls,
+        source: VideoSourceMetadata,
+    ) -> VideoSourceMetadata:
+        """Estimate duration from frame count and fps when not available."""
+        if source.duration:
+            return source
+        if source.original_fps > 0:
+            max_frame_idx = source.total_frames_num - 1
+            estimated_duration = round(max_frame_idx / source.original_fps) + 1
+        else:
+            estimated_duration = 0
+        return VideoSourceMetadata(
+            source.total_frames_num,
+            source.original_fps,
+            estimated_duration,
+        )
+
+    compute_frames_index_to_sample = (
+        OpenCVDynamicVideoBackend.compute_frames_index_to_sample
+    )
+
+    @classmethod
+    def load_bytes(
+        cls,
+        data: bytes,
+        num_frames: int = -1,
+        fps: int = 2,
+        max_duration: int = 300,
+        seek_threshold: int = _PYAV_SEEK_FRAME_THRESHOLD,
+        **kwargs,
+    ) -> tuple[npt.NDArray, dict[str, Any]]:
+        return cls._load_bytes_impl(data, num_frames, fps, max_duration, seek_threshold)

From 9d9ce8c4a77cb2397ad2b1e8905ee37237eb9ebe Mon Sep 17 00:00:00 2001
From: Jaseel Muhammad <jaseel.muhammad@mbzuai.ac.ae>
Date: Fri, 17 Apr 2026 09:24:48 +0000
Subject: [PATCH 2/7] Drop adaptive scan path, keep seek-only path in PyAV
 backend

Signed-off-by: Jaseel Muhammad <jaseel.muhammad@mbzuai.ac.ae>
---
 tests/multimodal/test_video.py | 45 +--------------------
 vllm/envs.py                   |  4 +-
 vllm/multimodal/video.py       | 71 ++++------------------------------
 3 files changed, 12 insertions(+), 108 deletions(-)

diff --git a/tests/multimodal/test_video.py b/tests/multimodal/test_video.py
index 10a87c7bcc09..4443cad9a088 100644
--- a/tests/multimodal/test_video.py
+++ b/tests/multimodal/test_video.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import importlib.util
 from pathlib import Path
 
 import numpy as np
@@ -16,8 +15,6 @@
 
 from .utils import create_video_from_image
 
-_has_pyav = importlib.util.find_spec("av") is not None
-
 pytestmark = pytest.mark.cpu_test
 
 ASSETS_DIR = Path(__file__).parent / "assets"
@@ -318,7 +315,6 @@ def dummy_video_path(tmp_path):
 # ============================================================================
 
 
-@pytest.mark.skipif(not _has_pyav, reason="PyAV not installed")
 def test_pyav_backend_loads_frames(dummy_video_path, monkeypatch: pytest.MonkeyPatch):
     """Test that the pyav backend can load frames from a valid video."""
     with monkeypatch.context() as m:
@@ -340,28 +336,6 @@ def test_pyav_backend_loads_frames(dummy_video_path, monkeypatch: pytest.MonkeyP
         assert "duration" in metadata
 
 
-@pytest.mark.skipif(not _has_pyav, reason="PyAV not installed")
-def test_pyav_backend_seek_path(dummy_video_path, monkeypatch: pytest.MonkeyPatch):
-    """Test that the PyAV seek path works by lowering the threshold."""
-    import vllm.multimodal.video as video_mod
-
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "pyav")
-        m.setattr(video_mod, "_PYAV_SEEK_FRAME_THRESHOLD", 0)
-
-        with open(dummy_video_path, "rb") as f:
-            video_data = f.read()
-
-        loader = VIDEO_LOADER_REGISTRY.load("pyav")
-        frames, metadata = loader.load_bytes(video_data, num_frames=8)
-
-        assert frames.ndim == 4
-        assert frames.shape[3] == 3  # RGB
-        assert frames.shape[0] == 8
-        assert frames.shape[0] == len(metadata["frames_indices"])
-
-
-@pytest.mark.skipif(not _has_pyav, reason="PyAV not installed")
 def test_pyav_dynamic_backend_loads_frames(
     dummy_video_path, monkeypatch: pytest.MonkeyPatch
 ):
@@ -422,26 +396,13 @@ def test_pyav_dynamic_backend_loads_frames(
             id="molmo2-fps",
         ),
         # pyav: same sampling logic as opencv
-        pytest.param(
-            "pyav",
-            {"num_frames": 32},
-            32,
-            id="pyav-num_frames",
-            marks=pytest.mark.skipif(not _has_pyav, reason="PyAV not installed"),
-        ),
-        pytest.param(
-            "pyav",
-            {"fps": 2},
-            120,
-            id="pyav-fps",
-            marks=pytest.mark.skipif(not _has_pyav, reason="PyAV not installed"),
-        ),
+        pytest.param("pyav", {"num_frames": 32}, 32, id="pyav-num_frames"),
+        pytest.param("pyav", {"fps": 2}, 120, id="pyav-fps"),
         pytest.param(
             "pyav",
             {"num_frames": 500, "fps": 2},
             120,
             id="pyav-num_frames_wins_fps",
-            marks=pytest.mark.skipif(not _has_pyav, reason="PyAV not installed"),
         ),
         # pyav_dynamic: same sampling logic as opencv_dynamic
         pytest.param(
@@ -449,14 +410,12 @@ def test_pyav_dynamic_backend_loads_frames(
             {"fps": 1, "max_duration": 60},
             60,
             id="pyav_dynamic-within_max_duration",
-            marks=pytest.mark.skipif(not _has_pyav, reason="PyAV not installed"),
         ),
         pytest.param(
             "pyav_dynamic",
             {"fps": 2, "max_duration": 30},
             60,
             id="pyav_dynamic-exceeds_max_duration",
-            marks=pytest.mark.skipif(not _has_pyav, reason="PyAV not installed"),
         ),
     ],
 )
diff --git a/vllm/envs.py b/vllm/envs.py
index e8f47f7b13b5..a87d555cc15d 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -832,8 +832,8 @@ def _get_or_set_default() -> str:
     # Backend for Video IO
     # - "opencv": Default backend that uses OpenCV stream buffered backend.
     # - "pyav": PyAV backend using in-process FFmpeg bindings.
-    #   Uses adaptive scan/seek: sequential scan for short videos,
-    #   per-frame seeking for long videos to reduce GIL hold time.
+    #   Decodes sampled frames via per-frame seek, releasing the GIL
+    #   between frames for concurrent serving.
     # - "pyav_dynamic": Dynamic-sampling variant of the PyAV backend
     #   (mirrors "opencv_dynamic" sampling logic).
     # - "identity": Returns raw video bytes for model processor to handle.
diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index 264c6b5ac97c..ed0652c6e356 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -966,20 +966,13 @@ def load_bytes(
         return frames, metadata
 
 
-# Threshold for switching from sequential scan to per-frame seeking
-# in PyAV backends.  At 30 fps, 5000 frames ≈ 3 minutes.  Scan holds
-# the GIL continuously; seek releases it between frames, which is
-# critical for concurrent serving with long videos.
-_PYAV_SEEK_FRAME_THRESHOLD = 5000
-
-
 class PyAVVideoBackendMixin:
     """Shared utilities for PyAV-based video backends.
 
     Decodes video using the PyAV library (Python bindings for FFmpeg).
-    Short videos are decoded via sequential scan; long videos use
-    per-frame seeking to release the GIL between frames and allow
-    concurrent request processing.
+    Frames are extracted via per-frame `container.seek()`, which
+    releases the GIL between frames and scales with the number of
+    sampled frames rather than the video length.
     """
 
     _video_backend_name: str
@@ -1002,38 +995,13 @@ def _get_metadata(
         return VideoSourceMetadata(total_frames, fps, duration)
 
     @staticmethod
-    def _decode_frames_scan(
-        container: "av.container.InputContainer",
-        frame_indices: list[int],
-    ) -> tuple[npt.NDArray, list[int]]:
-        """Decode frames by sequential scan (best for short videos)."""
-        target_set = set(frame_indices)
-        max_idx = max(frame_indices)
-        frames_list: list[npt.NDArray] = []
-        valid_indices: list[int] = []
-
-        container.streams.video[0].thread_type = "AUTO"
-        for idx, frame in enumerate(container.decode(video=0)):
-            if idx > max_idx:
-                break
-            if idx in target_set:
-                frames_list.append(frame.to_ndarray(format="rgb24"))
-                valid_indices.append(idx)
-
-        return np.stack(frames_list), valid_indices
-
-    @staticmethod
-    def _decode_frames_seek(
+    def _decode_frames(
         container: "av.container.InputContainer",
         frame_indices: list[int],
         fps: float,
         duration: float,
     ) -> tuple[npt.NDArray, list[int]]:
-        """Decode frames by seeking to timestamps (best for long videos).
-
-        Releases the GIL between frames (via seek + decode boundaries),
-        allowing other requests to make progress under concurrency.
-        """
+        """Decode target frames via per-frame seek + keyframe decode."""
         stream = container.streams.video[0]
         # SLICE parallelizes within a single frame without the
         # one-frame-per-thread latency penalty of FRAME threading.
@@ -1056,26 +1024,6 @@ def _decode_frames_seek(
 
         return np.stack(frames_list), valid_indices
 
-    @classmethod
-    def _decode_frames(
-        cls,
-        container: "av.container.InputContainer",
-        frame_indices: list[int],
-        source: VideoSourceMetadata,
-        seek_threshold: int = _PYAV_SEEK_FRAME_THRESHOLD,
-    ) -> tuple[npt.NDArray, list[int]]:
-        """Decode frames, choosing scan or seek based on video length.
-
-        Scan holds the GIL continuously — fine for short videos.
-        Seek releases the GIL between frames, critical for concurrency
-        with long videos.
-        """
-        if source.total_frames_num >= seek_threshold and source.original_fps > 0:
-            return cls._decode_frames_seek(
-                container, frame_indices, source.original_fps, source.duration
-            )
-        return cls._decode_frames_scan(container, frame_indices)
-
     @classmethod
     def _prepare_source(
         cls,
@@ -1090,7 +1038,6 @@ def _load_bytes_impl(
         num_frames: int,
         fps: int,
         max_duration: int,
-        seek_threshold: int = _PYAV_SEEK_FRAME_THRESHOLD,
     ) -> tuple[npt.NDArray, dict[str, Any]]:
         """Shared implementation for all PyAV-based load_bytes methods."""
         with av.open(BytesIO(data)) as container:
@@ -1102,7 +1049,7 @@ def _load_bytes_impl(
                 target=VideoTargetMetadata(num_frames, fps, max_duration),
             )
             frames, valid_frame_indices = cls._decode_frames(
-                container, frame_idx, source, seek_threshold
+                container, frame_idx, source.original_fps, source.duration
             )
 
         if len(valid_frame_indices) < len(frame_idx):
@@ -1134,10 +1081,9 @@ def load_bytes(
         num_frames: int = -1,
         fps: int = -1,
         max_duration: int = 300,
-        seek_threshold: int = _PYAV_SEEK_FRAME_THRESHOLD,
         **kwargs,
     ) -> tuple[npt.NDArray, dict[str, Any]]:
-        return cls._load_bytes_impl(data, num_frames, fps, max_duration, seek_threshold)
+        return cls._load_bytes_impl(data, num_frames, fps, max_duration)
 
 
 @VIDEO_LOADER_REGISTRY.register("pyav_dynamic")
@@ -1176,7 +1122,6 @@ def load_bytes(
         num_frames: int = -1,
         fps: int = 2,
         max_duration: int = 300,
-        seek_threshold: int = _PYAV_SEEK_FRAME_THRESHOLD,
         **kwargs,
     ) -> tuple[npt.NDArray, dict[str, Any]]:
-        return cls._load_bytes_impl(data, num_frames, fps, max_duration, seek_threshold)
+        return cls._load_bytes_impl(data, num_frames, fps, max_duration)

From 29ad2947ee5d3a5e787bc215a3c6aa27ec9307c8 Mon Sep 17 00:00:00 2001
From: Jaseel Muhammad <jaseel.muhammad@mbzuai.ac.ae>
Date: Sat, 18 Apr 2026 16:12:45 +0000
Subject: [PATCH 3/7] [Multimodal] separate video sampling algorithm from
 decode codec

Signed-off-by: Jaseel Muhammad <jaseel.muhammad@mbzuai.ac.ae>
---
 .../multimodal/processing/test_glm4_1v.py     |  10 +-
 tests/multimodal/test_video.py                |  89 ++--
 vllm/envs.py                                  |  12 +-
 vllm/multimodal/video.py                      | 421 +++++++-----------
 4 files changed, 230 insertions(+), 302 deletions(-)

diff --git a/tests/models/multimodal/processing/test_glm4_1v.py b/tests/models/multimodal/processing/test_glm4_1v.py
index f70d00524275..6f8a4d48b884 100644
--- a/tests/models/multimodal/processing/test_glm4_1v.py
+++ b/tests/models/multimodal/processing/test_glm4_1v.py
@@ -6,7 +6,7 @@
 from vllm.assets.video import VideoAsset
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import batched_tensors_equal
-from vllm.multimodal.video import OpenCVDynamicVideoBackend, OpenCVVideoBackend
+from vllm.multimodal.video import DynamicVideoBackend, VideoBackend
 
 from ...utils import build_model_context
 
@@ -93,9 +93,11 @@ def test_video_loader_consistency(
     with open(video_path, "rb") as f:
         video_bytes = f.read()
 
-    static_video, static_metadata = OpenCVVideoBackend.load_bytes(video_bytes)
-    dynamic_video, dynamic_metadata = OpenCVDynamicVideoBackend.load_bytes(
-        video_bytes, fps=fps
+    static_video, static_metadata = VideoBackend.load_bytes(
+        video_bytes, backend="opencv"
+    )
+    dynamic_video, dynamic_metadata = DynamicVideoBackend.load_bytes(
+        video_bytes, fps=fps, backend="opencv"
     )
 
     # pre-sampled loader shouldn't read all frames
diff --git a/tests/multimodal/test_video.py b/tests/multimodal/test_video.py
index 4443cad9a088..e82883ece338 100644
--- a/tests/multimodal/test_video.py
+++ b/tests/multimodal/test_video.py
@@ -71,7 +71,9 @@ def test_video_backend_handles_broken_frames(monkeypatch: pytest.MonkeyPatch):
             video_data = f.read()
 
         loader = VIDEO_LOADER_REGISTRY.load("opencv")
-        frames, metadata = loader.load_bytes(video_data, num_frames=-1)
+        frames, metadata = loader.load_bytes(
+            video_data, num_frames=-1, backend="opencv"
+        )
 
         # Verify metadata consistency:
         # frames_indices must match actual loaded frames
@@ -158,12 +160,12 @@ def release(self):
 
         # Test WITHOUT recovery - should have fewer frames due to failures
         frames_no_recovery, meta_no = loader.load_bytes(
-            video_data, num_frames=8, frame_recovery=False
+            video_data, num_frames=8, frame_recovery=False, backend="opencv"
         )
 
         # Test WITH recovery - should recover using next valid frames
         frames_with_recovery, meta_yes = loader.load_bytes(
-            video_data, num_frames=8, frame_recovery=True
+            video_data, num_frames=8, frame_recovery=True, backend="opencv"
         )
 
         # With recovery should have MORE frames than without
@@ -214,12 +216,12 @@ def test_video_recovery_with_corrupted_file(monkeypatch: pytest.MonkeyPatch):
 
         # Test without recovery - frame 17 will be skipped
         frames_no_recovery, meta_no_recovery = loader.load_bytes(
-            video_data, num_frames=8, frame_recovery=False
+            video_data, num_frames=8, frame_recovery=False, backend="opencv"
         )
 
         # Test with recovery - frame 18 should fill in for frame 17
         frames_with_recovery, meta_with_recovery = loader.load_bytes(
-            video_data, num_frames=8, frame_recovery=True
+            video_data, num_frames=8, frame_recovery=True, backend="opencv"
         )
 
         # Verify metadata consistency for both modes
@@ -271,12 +273,16 @@ def test_video_recovery_dynamic_backend(monkeypatch: pytest.MonkeyPatch):
 
         # Test without recovery
         frames_no_recovery, meta_no = loader.load_bytes(
-            video_data, fps=2, max_duration=10, frame_recovery=False
+            video_data,
+            fps=2,
+            max_duration=10,
+            frame_recovery=False,
+            backend="opencv",
         )
 
         # Test with frame_recovery enabled
         frames_with_recovery, meta_with = loader.load_bytes(
-            video_data, fps=2, max_duration=10, frame_recovery=True
+            video_data, fps=2, max_duration=10, frame_recovery=True, backend="opencv"
         )
 
         # Verify basic properties
@@ -316,15 +322,15 @@ def dummy_video_path(tmp_path):
 
 
 def test_pyav_backend_loads_frames(dummy_video_path, monkeypatch: pytest.MonkeyPatch):
-    """Test that the pyav backend can load frames from a valid video."""
+    """Test that the pyav codec backend can load frames from a valid video."""
     with monkeypatch.context() as m:
-        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "pyav")
+        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv")
 
         with open(dummy_video_path, "rb") as f:
             video_data = f.read()
 
-        loader = VIDEO_LOADER_REGISTRY.load("pyav")
-        frames, metadata = loader.load_bytes(video_data, num_frames=8)
+        loader = VIDEO_LOADER_REGISTRY.load("opencv")
+        frames, metadata = loader.load_bytes(video_data, num_frames=8, backend="pyav")
 
         assert frames.ndim == 4
         assert frames.shape[3] == 3  # RGB
@@ -339,15 +345,17 @@ def test_pyav_backend_loads_frames(dummy_video_path, monkeypatch: pytest.MonkeyP
 def test_pyav_dynamic_backend_loads_frames(
     dummy_video_path, monkeypatch: pytest.MonkeyPatch
 ):
-    """Test that the pyav_dynamic backend can load frames."""
+    """Test that the pyav codec with dynamic sampling can load frames."""
     with monkeypatch.context() as m:
-        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "pyav_dynamic")
+        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv_dynamic")
 
         with open(dummy_video_path, "rb") as f:
             video_data = f.read()
 
-        loader = VIDEO_LOADER_REGISTRY.load("pyav_dynamic")
-        frames, metadata = loader.load_bytes(video_data, fps=2, max_duration=10)
+        loader = VIDEO_LOADER_REGISTRY.load("opencv_dynamic")
+        frames, metadata = loader.load_bytes(
+            video_data, fps=2, max_duration=10, backend="pyav"
+        )
 
         assert frames.ndim == 4
         assert frames.shape[3] == 3  # RGB
@@ -357,26 +365,32 @@ def test_pyav_dynamic_backend_loads_frames(
 
 
 @pytest.mark.parametrize(
-    "backend, kwargs, expected_num_frames",
+    "loader_key, kwargs, expected_num_frames",
     [
-        # opencv: num_frames directly controls count
-        pytest.param("opencv", {"num_frames": 32}, 32, id="opencv-num_frames"),
-        pytest.param("opencv", {"fps": 2}, 120, id="opencv-fps"),
+        # uniform sampling + opencv codec
         pytest.param(
             "opencv",
-            {"num_frames": 500, "fps": 2},
+            {"num_frames": 32, "backend": "opencv"},
+            32,
+            id="opencv-num_frames",
+        ),
+        pytest.param("opencv", {"fps": 2, "backend": "opencv"}, 120, id="opencv-fps"),
+        pytest.param(
+            "opencv",
+            {"num_frames": 500, "fps": 2, "backend": "opencv"},
             120,
             id="opencv-num_frames_wins_fps",
         ),
+        # dynamic sampling + opencv codec
         pytest.param(
             "opencv_dynamic",
-            {"fps": 1, "max_duration": 60},
+            {"fps": 1, "max_duration": 60, "backend": "opencv"},
             60,
             id="opencv_dynamic-within_max_duration",
         ),
         pytest.param(
             "opencv_dynamic",
-            {"fps": 2, "max_duration": 30},
+            {"fps": 2, "max_duration": 30, "backend": "opencv"},
             60,
             id="opencv_dynamic-exceeds_max_duration",
         ),
@@ -395,25 +409,30 @@ def test_pyav_dynamic_backend_loads_frames(
             119,
             id="molmo2-fps",
         ),
-        # pyav: same sampling logic as opencv
-        pytest.param("pyav", {"num_frames": 32}, 32, id="pyav-num_frames"),
-        pytest.param("pyav", {"fps": 2}, 120, id="pyav-fps"),
+        # uniform sampling + pyav codec (same frame counts as opencv)
+        pytest.param(
+            "opencv",
+            {"num_frames": 32, "backend": "pyav"},
+            32,
+            id="pyav-num_frames",
+        ),
+        pytest.param("opencv", {"fps": 2, "backend": "pyav"}, 120, id="pyav-fps"),
         pytest.param(
-            "pyav",
-            {"num_frames": 500, "fps": 2},
+            "opencv",
+            {"num_frames": 500, "fps": 2, "backend": "pyav"},
             120,
             id="pyav-num_frames_wins_fps",
         ),
-        # pyav_dynamic: same sampling logic as opencv_dynamic
+        # dynamic sampling + pyav codec
         pytest.param(
-            "pyav_dynamic",
-            {"fps": 1, "max_duration": 60},
+            "opencv_dynamic",
+            {"fps": 1, "max_duration": 60, "backend": "pyav"},
             60,
             id="pyav_dynamic-within_max_duration",
         ),
         pytest.param(
-            "pyav_dynamic",
-            {"fps": 2, "max_duration": 30},
+            "opencv_dynamic",
+            {"fps": 2, "max_duration": 30, "backend": "pyav"},
             60,
             id="pyav_dynamic-exceeds_max_duration",
         ),
@@ -422,13 +441,13 @@ def test_pyav_dynamic_backend_loads_frames(
 def test_video_loader_frames_sampling(
     dummy_video_path,
     monkeypatch: pytest.MonkeyPatch,
-    backend: str,
+    loader_key: str,
     kwargs: dict,
     expected_num_frames: int,
 ):
     """Test video loader frames sampling functionality."""
-    monkeypatch.setenv("VLLM_VIDEO_LOADER_BACKEND", backend)
-    loader = VIDEO_LOADER_REGISTRY.load(backend)
+    monkeypatch.setenv("VLLM_VIDEO_LOADER_BACKEND", loader_key)
+    loader = VIDEO_LOADER_REGISTRY.load(loader_key)
 
     with open(dummy_video_path, "rb") as f:
         long_video_bytes = f.read()
diff --git a/vllm/envs.py b/vllm/envs.py
index a87d555cc15d..072a9269be47 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -829,14 +829,10 @@ def _get_or_set_default() -> str:
     "VLLM_MAX_AUDIO_CLIP_FILESIZE_MB": lambda: int(
         os.getenv("VLLM_MAX_AUDIO_CLIP_FILESIZE_MB", "25")
     ),
-    # Backend for Video IO
-    # - "opencv": Default backend that uses OpenCV stream buffered backend.
-    # - "pyav": PyAV backend using in-process FFmpeg bindings.
-    #   Decodes sampled frames via per-frame seek, releasing the GIL
-    #   between frames for concurrent serving.
-    # - "pyav_dynamic": Dynamic-sampling variant of the PyAV backend
-    #   (mirrors "opencv_dynamic" sampling logic).
-    # - "identity": Returns raw video bytes for model processor to handle.
+    # Backend for Video IO — selects the frame-sampling algorithm.
+    # - "opencv": uniform sampling.
+    # - "opencv_dynamic": duration-aware dynamic sampling.
+    # - "identity": returns raw video bytes for model processor to handle.
     #
     # Custom backend implementations can be registered
     # via `@VIDEO_LOADER_REGISTRY.register("my_custom_video_loader")` and
diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index ed0652c6e356..ca50b4d981a3 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -3,7 +3,7 @@
 import math
 from abc import abstractmethod
 from io import BytesIO
-from typing import Any, NamedTuple, cast
+from typing import Any, ClassVar, Literal, NamedTuple, cast
 
 import numpy as np
 import numpy.typing as npt
@@ -360,8 +360,75 @@ def read_frames(
         return frames, valid_frame_indices
 
 
+class PyAVVideoBackendMixin:
+    """PyAV (in-process FFmpeg bindings) codec utilities.
+
+    Reads stream metadata and decodes target frames via per-frame
+    ``container.seek()``. The seek releases the GIL between frames and
+    scales with the number of sampled frames rather than the video
+    length, enabling concurrent decoding under serving load.
+    """
+
+    @staticmethod
+    def get_metadata(
+        container: "av.container.InputContainer",
+    ) -> VideoSourceMetadata:
+        if not container.streams.video:
+            raise ValueError("No video streams found in container")
+        stream = container.streams.video[0]
+        total_frames = stream.frames or 0
+        fps = float(stream.average_rate) if stream.average_rate else 0.0
+        duration = float(stream.duration * stream.time_base) if stream.duration else 0.0
+        if total_frames == 0 and duration > 0 and fps > 0:
+            total_frames = int(duration * fps)
+        return VideoSourceMetadata(total_frames, fps, duration)
+
+    @staticmethod
+    def decode_frames(
+        container: "av.container.InputContainer",
+        frame_indices: list[int],
+        fps: float,
+        duration: float,
+    ) -> tuple[npt.NDArray, list[int]]:
+        """Decode target frames via per-frame seek + keyframe decode."""
+        stream = container.streams.video[0]
+        # SLICE parallelizes within a single frame without the
+        # one-frame-per-thread latency penalty of FRAME threading.
+        stream.thread_type = "SLICE"
+        time_base = stream.time_base
+
+        frames_list: list[npt.NDArray] = []
+        valid_indices: list[int] = []
+        frame_interval = 1.0 / fps if fps > 0 else 0.1
+        max_ts = max(0.0, duration - frame_interval) if duration > 0 else float("inf")
+
+        for idx in frame_indices:
+            ts = min(idx / fps, max_ts) if fps > 0 else 0.0
+            pts = int(ts / time_base)
+            container.seek(pts, stream=stream)
+            frame = next(container.decode(video=0), None)
+            if frame is not None:
+                frames_list.append(frame.to_ndarray(format="rgb24"))
+                valid_indices.append(idx)
+
+        if not frames_list:
+            return np.empty((0,), dtype=np.uint8), valid_indices
+        return np.stack(frames_list), valid_indices
+
+
 @VIDEO_LOADER_REGISTRY.register("opencv")
-class OpenCVVideoBackend(VideoLoader, OpenCVVideoBackendMixin):
+class VideoBackend(VideoLoader, OpenCVVideoBackendMixin, PyAVVideoBackendMixin):
+    """Uniform-sampling video backend.
+
+    Samples ``num_frames`` uniformly across the video (or one frame every
+    ``1/fps`` seconds, whichever produces fewer frames). The decoding codec
+    is selected via the ``backend`` kwarg (``"opencv"`` or ``"pyav"``),
+    which can be passed through ``--media-io-kwargs``. ``"pyav"`` enables
+    concurrent decoding; the default is the one declared by ``load_bytes``.
+    """
+
+    _sampling_suffix: ClassVar[str] = ""
+
     @classmethod
     def compute_frames_index_to_sample(
         cls,
@@ -371,7 +438,6 @@ def compute_frames_index_to_sample(
     ) -> list[int]:
         total_frames_num = source.total_frames_num
         duration = source.duration
-
         num_frames = target.num_frames
         fps = target.fps
         # resample video to target num_frames and fps
@@ -381,16 +447,18 @@ def compute_frames_index_to_sample(
             num_frames_to_sample = min(num_frames, total_frames_num)
         if fps > 0:
             num_frames_to_sample = min(num_frames_to_sample, math.floor(duration * fps))
-        num_frames_to_sample = max(1, num_frames_to_sample)  # at least one sample
+        num_frames_to_sample = max(1, num_frames_to_sample)
 
         if num_frames_to_sample == total_frames_num:
-            frame_idx = list(range(0, num_frames_to_sample))
-        else:
-            uniform_sampled_frames = np.linspace(
-                0, total_frames_num - 1, num_frames_to_sample, dtype=int
-            )
-            frame_idx = uniform_sampled_frames.tolist()
-        return frame_idx
+            return list(range(num_frames_to_sample))
+        return np.linspace(
+            0, total_frames_num - 1, num_frames_to_sample, dtype=int
+        ).tolist()
+
+    @classmethod
+    def _prepare_source(cls, source: VideoSourceMetadata) -> VideoSourceMetadata:
+        """Sampling-algorithm-specific metadata adjustment hook."""
+        return source
 
     @classmethod
     def load_bytes(
@@ -400,55 +468,98 @@ def load_bytes(
         fps: int = -1,
         max_duration: int = 300,
         frame_recovery: bool = False,
+        *,
+        backend: Literal["opencv", "pyav"] = "pyav",
         **kwargs,
     ) -> tuple[npt.NDArray, dict[str, Any]]:
-        """
-        Load video frames from bytes.
+        """Load sampled frames from raw video bytes.
 
         Args:
-            data: Raw video bytes
-            num_frames: Target number of frames to sample (-1 for all)
-            fps: Target FPS for sampling (-1 for original)
-            max_duration: Maximum duration (unused in base backend)
-            frame_recovery: Enable forward-scan recovery for failed frames
+            data: Raw video bytes.
+            num_frames: Target number of frames to sample (``-1`` for all).
+            fps: Target FPS for sampling (``-1`` for original).
+            max_duration: Maximum duration in seconds — only used by the
+                dynamic subclass; ignored here.
+            frame_recovery: Enable forward-scan recovery for failed frames.
+                Only honored by the OpenCV codec.
+            backend: Decoding codec — ``"opencv"`` or ``"pyav"``.
 
         Returns:
-            Tuple of (frames_array, metadata_dict)
+            Tuple of ``(frames_array, metadata_dict)``.
         """
-        cap = cls.open_video_capture(data)
-
-        source = OpenCVVideoBackendMixin.get_video_metadata(cap)
         target = VideoTargetMetadata(
-            num_frames=num_frames,
-            fps=fps,
-            max_duration=max_duration,
+            num_frames=num_frames, fps=fps, max_duration=max_duration
         )
 
-        # resample video to target num_frames and fps
-        # - the minimum of the two will be used
-        frame_idx = cls.compute_frames_index_to_sample(
-            source=source,
-            target=target,
-        )
+        if backend == "opencv":
+            cap = cls.open_video_capture(data)
+            source = cls._prepare_source(cls.get_video_metadata(cap))
+            frame_idx = cls.compute_frames_index_to_sample(
+                source=source, target=target, **kwargs
+            )
+            frames, valid = cls.read_frames(
+                cap,
+                frame_idx,
+                total_frames_num=source.total_frames_num,
+                frame_recovery=frame_recovery,
+            )
+        elif backend == "pyav":
+            with av.open(BytesIO(data)) as container:
+                source = cls._prepare_source(cls.get_metadata(container))
+                frame_idx = cls.compute_frames_index_to_sample(
+                    source=source, target=target, **kwargs
+                )
+                frames, valid = cls.decode_frames(
+                    container, frame_idx, source.original_fps, source.duration
+                )
+        else:
+            raise ValueError(
+                f"Unknown video codec backend {backend!r}; "
+                "valid options: 'opencv', 'pyav'."
+            )
 
-        frames, valid_frame_indices = cls.read_frames(
-            cap,
-            frame_idx,
-            total_frames_num=source.total_frames_num,
-            frame_recovery=frame_recovery,
-        )
+        if len(valid) < len(frame_idx):
+            logger.warning(
+                "%s video loading: expected %d frames but got %d.",
+                backend,
+                len(frame_idx),
+                len(valid),
+            )
 
-        metadata = cls.create_hf_metadata(
+        return frames, cls.create_hf_metadata(
             source=source,
-            video_backend="opencv",
-            valid_frame_indices=valid_frame_indices,
+            video_backend=f"{backend}{cls._sampling_suffix}",
+            valid_frame_indices=valid,
         )
 
-        return frames, metadata
-
 
 @VIDEO_LOADER_REGISTRY.register("opencv_dynamic")
-class OpenCVDynamicVideoBackend(VideoLoader, OpenCVVideoBackendMixin):
+class DynamicVideoBackend(VideoBackend):
+    """Duration-aware dynamic-sampling video backend.
+
+    Samples at ``fps`` up to ``max_duration`` seconds, falling back to
+    uniform sampling across the full duration when the video is longer
+    than ``max_duration``. Codec is selectable the same way as
+    :class:`VideoBackend`.
+    """
+
+    _sampling_suffix: ClassVar[str] = "_dynamic"
+
+    @classmethod
+    def _prepare_source(cls, source: VideoSourceMetadata) -> VideoSourceMetadata:
+        # Estimate duration from frame count and fps when the container
+        # does not report it (common for WebM/streaming inputs).
+        if source.duration:
+            return source
+        if source.original_fps > 0:
+            max_frame_idx = source.total_frames_num - 1
+            duration = round(max_frame_idx / source.original_fps) + 1
+        else:
+            duration = 0
+        return VideoSourceMetadata(
+            source.total_frames_num, source.original_fps, duration
+        )
+
     @classmethod
     def compute_frames_index_to_sample(
         cls,
@@ -461,8 +572,8 @@ def compute_frames_index_to_sample(
         original_fps = source.original_fps
         max_duration = target.max_duration
         fps = target.fps
-
         max_frame_idx = source.total_frames_num - 1
+
         # Refer to:
         # https://github.com/huggingface/transformers/blob/v4.55.4/src/transformers/models/glm4v/video_processing_glm4v.py#L103-L140
         frame_indices_list: list[int]
@@ -496,62 +607,20 @@ def load_bytes(
         fps: int = 2,
         max_duration: int = 300,
         frame_recovery: bool = False,
+        *,
+        backend: Literal["opencv", "pyav"] = "pyav",
         **kwargs,
     ) -> tuple[npt.NDArray, dict[str, Any]]:
-        """
-        Load video frames with dynamic sampling based on duration.
-
-        Args:
-            data: Raw video bytes
-            num_frames: Not used in dynamic backend
-            fps: Target FPS for sampling (default: 2)
-            max_duration: Maximum video duration to process (default: 300s)
-            frame_recovery: Enable forward-scan recovery for failed frames
-
-        Returns:
-            Tuple of (frames_array, metadata_dict)
-        """
-        cap = cls.open_video_capture(data)
-
-        orig_source = OpenCVVideoBackendMixin.get_video_metadata(cap)
-        max_frame_idx = orig_source.total_frames_num - 1
-        duration = (
-            orig_source.duration or round(max_frame_idx / orig_source.original_fps) + 1
-        )
-
-        # recompute source metadata with adjusted duration to ensure correct
-        # sampling indices computation
-        source = VideoSourceMetadata(
-            total_frames_num=orig_source.total_frames_num,
-            original_fps=orig_source.original_fps,
-            duration=duration,
-        )
-        target = VideoTargetMetadata(
+        return super().load_bytes(
+            data,
             num_frames=num_frames,
             fps=fps,
             max_duration=max_duration,
-        )
-
-        frame_indices_list = cls.compute_frames_index_to_sample(
-            source=source,
-            target=target,
-        )
-
-        frames, valid_frame_indices = cls.read_frames(
-            cap,
-            frame_indices_list,
-            total_frames_num=source.total_frames_num,
             frame_recovery=frame_recovery,
+            backend=backend,
+            **kwargs,
         )
 
-        metadata = cls.create_hf_metadata(
-            source=source,
-            video_backend="opencv_dynamic",
-            valid_frame_indices=valid_frame_indices,
-        )
-
-        return frames, metadata
-
 
 @VIDEO_LOADER_REGISTRY.register("molmo2")
 class Molmo2VideoBackend(VideoLoader, OpenCVVideoBackendMixin):
@@ -840,7 +909,7 @@ def load_bytes(
 
 
 @VIDEO_LOADER_REGISTRY.register("nemotron_vl")
-class NemotronVLVideoBackend(OpenCVVideoBackend):
+class NemotronVLVideoBackend(VideoBackend):
     @classmethod
     def load_bytes(
         cls,
@@ -849,14 +918,17 @@ def load_bytes(
         fps: int = -1,
         max_duration: int = 300,
         frame_recovery: bool = False,
+        *,
+        backend: Literal["opencv", "pyav"] = "opencv",
         **kwargs,
     ) -> tuple[npt.NDArray, dict[str, Any]]:
-        frames, metadata = OpenCVVideoBackend.load_bytes(
+        frames, metadata = super().load_bytes(
             data,
             num_frames=num_frames,
             fps=fps,
             max_duration=max_duration,
             frame_recovery=frame_recovery,
+            backend=backend,
             **kwargs,
         )
 
@@ -964,164 +1036,3 @@ def load_bytes(
             valid_frame_indices=valid_frame_indices,
         )
         return frames, metadata
-
-
-class PyAVVideoBackendMixin:
-    """Shared utilities for PyAV-based video backends.
-
-    Decodes video using the PyAV library (Python bindings for FFmpeg).
-    Frames are extracted via per-frame `container.seek()`, which
-    releases the GIL between frames and scales with the number of
-    sampled frames rather than the video length.
-    """
-
-    _video_backend_name: str
-
-    @staticmethod
-    def _get_metadata(
-        container: "av.container.InputContainer",
-    ) -> VideoSourceMetadata:
-        """Extract metadata from an open PyAV container."""
-        if not container.streams.video:
-            raise ValueError("No video streams found in container")
-        stream = container.streams.video[0]
-        total_frames = stream.frames or 0
-        fps = float(stream.average_rate) if stream.average_rate else 0.0
-        duration = float(stream.duration * stream.time_base) if stream.duration else 0.0
-
-        if total_frames == 0 and duration > 0 and fps > 0:
-            total_frames = int(duration * fps)
-
-        return VideoSourceMetadata(total_frames, fps, duration)
-
-    @staticmethod
-    def _decode_frames(
-        container: "av.container.InputContainer",
-        frame_indices: list[int],
-        fps: float,
-        duration: float,
-    ) -> tuple[npt.NDArray, list[int]]:
-        """Decode target frames via per-frame seek + keyframe decode."""
-        stream = container.streams.video[0]
-        # SLICE parallelizes within a single frame without the
-        # one-frame-per-thread latency penalty of FRAME threading.
-        stream.thread_type = "SLICE"
-        time_base = stream.time_base
-        frames_list: list[npt.NDArray] = []
-        valid_indices: list[int] = []
-
-        frame_interval = 1.0 / fps if fps > 0 else 0.1
-        max_ts = max(0.0, duration - frame_interval) if duration > 0 else float("inf")
-
-        for idx in frame_indices:
-            ts = min(idx / fps, max_ts) if fps > 0 else 0.0
-            pts = int(ts / time_base)
-            container.seek(pts, stream=stream)
-            frame = next(container.decode(video=0), None)
-            if frame is not None:
-                frames_list.append(frame.to_ndarray(format="rgb24"))
-                valid_indices.append(idx)
-
-        return np.stack(frames_list), valid_indices
-
-    @classmethod
-    def _prepare_source(
-        cls,
-        source: VideoSourceMetadata,
-    ) -> VideoSourceMetadata:
-        return source
-
-    @classmethod
-    def _load_bytes_impl(
-        cls,
-        data: bytes,
-        num_frames: int,
-        fps: int,
-        max_duration: int,
-    ) -> tuple[npt.NDArray, dict[str, Any]]:
-        """Shared implementation for all PyAV-based load_bytes methods."""
-        with av.open(BytesIO(data)) as container:
-            raw_source = cls._get_metadata(container)
-            source = cls._prepare_source(raw_source)
-
-            frame_idx = cls.compute_frames_index_to_sample(  # type: ignore[attr-defined]
-                source=source,
-                target=VideoTargetMetadata(num_frames, fps, max_duration),
-            )
-            frames, valid_frame_indices = cls._decode_frames(
-                container, frame_idx, source.original_fps, source.duration
-            )
-
-        if len(valid_frame_indices) < len(frame_idx):
-            logger.warning(
-                "pyav video loading: expected %d frames but got %d.",
-                len(frame_idx),
-                len(valid_frame_indices),
-            )
-        metadata = cls.create_hf_metadata(  # type: ignore[attr-defined]
-            source=source,
-            video_backend=cls._video_backend_name,
-            valid_frame_indices=valid_frame_indices,
-        )
-        return frames, metadata
-
-
-@VIDEO_LOADER_REGISTRY.register("pyav")
-class PyAVVideoBackend(VideoLoader, PyAVVideoBackendMixin):
-    """Video backend using PyAV (in-process FFmpeg bindings)."""
-
-    _video_backend_name = "pyav"
-
-    compute_frames_index_to_sample = OpenCVVideoBackend.compute_frames_index_to_sample
-
-    @classmethod
-    def load_bytes(
-        cls,
-        data: bytes,
-        num_frames: int = -1,
-        fps: int = -1,
-        max_duration: int = 300,
-        **kwargs,
-    ) -> tuple[npt.NDArray, dict[str, Any]]:
-        return cls._load_bytes_impl(data, num_frames, fps, max_duration)
-
-
-@VIDEO_LOADER_REGISTRY.register("pyav_dynamic")
-class PyAVDynamicVideoBackend(VideoLoader, PyAVVideoBackendMixin):
-    """Dynamic-sampling PyAV backend (mirrors OpenCVDynamicVideoBackend)."""
-
-    _video_backend_name = "pyav_dynamic"
-
-    @classmethod
-    def _prepare_source(
-        cls,
-        source: VideoSourceMetadata,
-    ) -> VideoSourceMetadata:
-        """Estimate duration from frame count and fps when not available."""
-        if source.duration:
-            return source
-        if source.original_fps > 0:
-            max_frame_idx = source.total_frames_num - 1
-            estimated_duration = round(max_frame_idx / source.original_fps) + 1
-        else:
-            estimated_duration = 0
-        return VideoSourceMetadata(
-            source.total_frames_num,
-            source.original_fps,
-            estimated_duration,
-        )
-
-    compute_frames_index_to_sample = (
-        OpenCVDynamicVideoBackend.compute_frames_index_to_sample
-    )
-
-    @classmethod
-    def load_bytes(
-        cls,
-        data: bytes,
-        num_frames: int = -1,
-        fps: int = 2,
-        max_duration: int = 300,
-        **kwargs,
-    ) -> tuple[npt.NDArray, dict[str, Any]]:
-        return cls._load_bytes_impl(data, num_frames, fps, max_duration)

From 1677d1769e98fba82c5d0d646004d6332dbd04f9 Mon Sep 17 00:00:00 2001
From: Jaseel Muhammad <jaseel.muhammad@mbzuai.ac.ae>
Date: Sat, 18 Apr 2026 20:44:54 +0400
Subject: [PATCH 4/7] Update vllm/multimodal/video.py

Co-authored-by: Isotr0py <2037008807@qq.com>
Signed-off-by: Jaseel Muhammad <jaseel.muhammad@mbzuai.ac.ae>
---
 vllm/multimodal/video.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index ca50b4d981a3..820b06631713 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -504,6 +504,7 @@ def load_bytes(
                 frame_recovery=frame_recovery,
             )
         elif backend == "pyav":
+            assert not frame_recovery, "frame_recovery is only available for `opencv` backend"
             with av.open(BytesIO(data)) as container:
                 source = cls._prepare_source(cls.get_metadata(container))
                 frame_idx = cls.compute_frames_index_to_sample(

From 9eabc60727149576d56cfcb82dc53d1ee148f0f9 Mon Sep 17 00:00:00 2001
From: Jaseel Muhammad <jaseel.muhammad@mbzuai.ac.ae>
Date: Sat, 18 Apr 2026 16:49:01 +0000
Subject: [PATCH 5/7] [Test][GLM-4.1V] Parameterize video loader consistency
 test over opencv/pyav codecs

Signed-off-by: Jaseel Muhammad <jaseel.muhammad@mbzuai.ac.ae>
---
 tests/models/multimodal/processing/test_glm4_1v.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/models/multimodal/processing/test_glm4_1v.py b/tests/models/multimodal/processing/test_glm4_1v.py
index 6f8a4d48b884..5798c5663472 100644
--- a/tests/models/multimodal/processing/test_glm4_1v.py
+++ b/tests/models/multimodal/processing/test_glm4_1v.py
@@ -70,9 +70,11 @@ def test_processor_override(
 
 @pytest.mark.parametrize("model_id", ["zai-org/GLM-4.1V-9B-Thinking"])
 @pytest.mark.parametrize("fps", [2])
+@pytest.mark.parametrize("backend", ["opencv", "pyav"])
 def test_video_loader_consistency(
     model_id: str,
     fps: int,
+    backend: str,
 ):
     """
     Ensure dynamic video loader (pre-sampled by loader) and normal video
@@ -94,10 +96,10 @@ def test_video_loader_consistency(
         video_bytes = f.read()
 
     static_video, static_metadata = VideoBackend.load_bytes(
-        video_bytes, backend="opencv"
+        video_bytes, backend=backend
     )
     dynamic_video, dynamic_metadata = DynamicVideoBackend.load_bytes(
-        video_bytes, fps=fps, backend="opencv"
+        video_bytes, fps=fps, backend=backend
     )
 
     # pre-sampled loader shouldn't read all frames

From 0823582abc440bb8d08f083785a47999ab3df384 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Wed, 22 Apr 2026 00:32:18 +0800
Subject: [PATCH 6/7] set opencv as default backend

Co-authored-by: Isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
---
 vllm/multimodal/video.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index 820b06631713..dfe83c4b6b6b 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -469,7 +469,7 @@ def load_bytes(
         max_duration: int = 300,
         frame_recovery: bool = False,
         *,
-        backend: Literal["opencv", "pyav"] = "pyav",
+        backend: Literal["opencv", "pyav"] = "opencv",
         **kwargs,
     ) -> tuple[npt.NDArray, dict[str, Any]]:
         """Load sampled frames from raw video bytes.
@@ -609,7 +609,7 @@ def load_bytes(
         max_duration: int = 300,
         frame_recovery: bool = False,
         *,
-        backend: Literal["opencv", "pyav"] = "pyav",
+        backend: Literal["opencv", "pyav"] = "opencv",
         **kwargs,
     ) -> tuple[npt.NDArray, dict[str, Any]]:
         return super().load_bytes(

From 5ee1a7d0d4066e89e8b26128c4b9a48b0e66f1f3 Mon Sep 17 00:00:00 2001
From: Jaseel Muhammad <jaseel.muhammad@mbzuai.ac.ae>
Date: Tue, 21 Apr 2026 18:18:07 +0000
Subject: [PATCH 7/7] run pre-commit

Signed-off-by: Jaseel Muhammad <jaseel.muhammad@mbzuai.ac.ae>
---
 vllm/multimodal/video.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index dfe83c4b6b6b..5b118af8fc53 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -504,7 +504,9 @@ def load_bytes(
                 frame_recovery=frame_recovery,
             )
         elif backend == "pyav":
-            assert not frame_recovery, "frame_recovery is only available for `opencv` backend"
+            assert not frame_recovery, (
+                "frame_recovery is only available for `opencv` backend"
+            )
             with av.open(BytesIO(data)) as container:
                 source = cls._prepare_source(cls.get_metadata(container))
                 frame_idx = cls.compute_frames_index_to_sample(