From 7203626970ee4197b37a59d15933eb6545e7f68a Mon Sep 17 00:00:00 2001 From: Jaseel Muhammad Date: Thu, 16 Apr 2026 08:01:20 +0000 Subject: [PATCH 1/7] [Core] Add PyAV video backend for concurrent video decoding Signed-off-by: Jaseel Muhammad --- tests/multimodal/test_video.py | 109 ++++++++++++++++ vllm/envs.py | 5 + vllm/multimodal/video.py | 221 +++++++++++++++++++++++++++++++++ 3 files changed, 335 insertions(+) diff --git a/tests/multimodal/test_video.py b/tests/multimodal/test_video.py index 3ece384348bc..10a87c7bcc09 100644 --- a/tests/multimodal/test_video.py +++ b/tests/multimodal/test_video.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import importlib.util from pathlib import Path import numpy as np @@ -15,6 +16,8 @@ from .utils import create_video_from_image +_has_pyav = importlib.util.find_spec("av") is not None + pytestmark = pytest.mark.cpu_test ASSETS_DIR = Path(__file__).parent / "assets" @@ -310,6 +313,75 @@ def dummy_video_path(tmp_path): return video_path +# ============================================================================ +# PyAV Backend Tests +# ============================================================================ + + +@pytest.mark.skipif(not _has_pyav, reason="PyAV not installed") +def test_pyav_backend_loads_frames(dummy_video_path, monkeypatch: pytest.MonkeyPatch): + """Test that the pyav backend can load frames from a valid video.""" + with monkeypatch.context() as m: + m.setenv("VLLM_VIDEO_LOADER_BACKEND", "pyav") + + with open(dummy_video_path, "rb") as f: + video_data = f.read() + + loader = VIDEO_LOADER_REGISTRY.load("pyav") + frames, metadata = loader.load_bytes(video_data, num_frames=8) + + assert frames.ndim == 4 + assert frames.shape[3] == 3 # RGB + assert frames.shape[0] == 8 + assert frames.shape[0] == len(metadata["frames_indices"]) + assert metadata["video_backend"] == "pyav" + assert "total_num_frames" in metadata + assert "fps" in metadata + assert "duration" in metadata + + +@pytest.mark.skipif(not _has_pyav, reason="PyAV not installed") +def test_pyav_backend_seek_path(dummy_video_path, monkeypatch: pytest.MonkeyPatch): + """Test that the PyAV seek path works by lowering the threshold.""" + import vllm.multimodal.video as video_mod + + with monkeypatch.context() as m: + m.setenv("VLLM_VIDEO_LOADER_BACKEND", "pyav") + m.setattr(video_mod, "_PYAV_SEEK_FRAME_THRESHOLD", 0) + + with open(dummy_video_path, "rb") as f: + video_data = f.read() + + loader = VIDEO_LOADER_REGISTRY.load("pyav") + frames, metadata = loader.load_bytes(video_data, num_frames=8) + + assert frames.ndim == 4 + assert frames.shape[3] == 3 # RGB + assert frames.shape[0] == 8 + assert frames.shape[0] == len(metadata["frames_indices"]) + + +@pytest.mark.skipif(not _has_pyav, reason="PyAV not installed") +def test_pyav_dynamic_backend_loads_frames( + dummy_video_path, monkeypatch: pytest.MonkeyPatch +): + """Test that the pyav_dynamic backend can load frames.""" + with monkeypatch.context() as m: + m.setenv("VLLM_VIDEO_LOADER_BACKEND", "pyav_dynamic") + + with open(dummy_video_path, "rb") as f: + video_data = f.read() + + loader = VIDEO_LOADER_REGISTRY.load("pyav_dynamic") + frames, metadata = loader.load_bytes(video_data, fps=2, max_duration=10) + + assert frames.ndim == 4 + assert frames.shape[3] == 3 # RGB + assert frames.shape[0] > 0 + assert frames.shape[0] == len(metadata["frames_indices"]) + assert metadata["video_backend"] == "pyav_dynamic" + + @pytest.mark.parametrize( "backend, kwargs, expected_num_frames", [ @@ -349,6 +421,43 @@ def dummy_video_path(tmp_path): 119, id="molmo2-fps", ), + # pyav: same sampling logic as opencv + pytest.param( + "pyav", + {"num_frames": 32}, + 32, + id="pyav-num_frames", + marks=pytest.mark.skipif(not _has_pyav, reason="PyAV not installed"), + ), + pytest.param( + "pyav", + {"fps": 2}, + 120, + id="pyav-fps", + marks=pytest.mark.skipif(not _has_pyav, reason="PyAV not installed"), + ), + pytest.param( + "pyav", + {"num_frames": 500, "fps": 2}, + 120, + id="pyav-num_frames_wins_fps", + marks=pytest.mark.skipif(not _has_pyav, reason="PyAV not installed"), + ), + # pyav_dynamic: same sampling logic as opencv_dynamic + pytest.param( + "pyav_dynamic", + {"fps": 1, "max_duration": 60}, + 60, + id="pyav_dynamic-within_max_duration", + marks=pytest.mark.skipif(not _has_pyav, reason="PyAV not installed"), + ), + pytest.param( + "pyav_dynamic", + {"fps": 2, "max_duration": 30}, + 60, + id="pyav_dynamic-exceeds_max_duration", + marks=pytest.mark.skipif(not _has_pyav, reason="PyAV not installed"), + ), ], ) def test_video_loader_frames_sampling( diff --git a/vllm/envs.py b/vllm/envs.py index 8ed1d33434cb..e8f47f7b13b5 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -831,6 +831,11 @@ def _get_or_set_default() -> str: ), # Backend for Video IO # - "opencv": Default backend that uses OpenCV stream buffered backend. + # - "pyav": PyAV backend using in-process FFmpeg bindings. + # Uses adaptive scan/seek: sequential scan for short videos, + # per-frame seeking for long videos to reduce GIL hold time. + # - "pyav_dynamic": Dynamic-sampling variant of the PyAV backend + # (mirrors "opencv_dynamic" sampling logic). # - "identity": Returns raw video bytes for model processor to handle. # # Custom backend implementations can be registered diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 90102151423f..264c6b5ac97c 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -19,6 +19,11 @@ cv2 = PlaceholderModule("cv2") vr = PlaceholderModule("cv2").placeholder_attr("videoio_registry") +try: + import av +except ImportError: + av = PlaceholderModule("av") # type: ignore[assignment] + logger = init_logger(__name__) @@ -959,3 +964,219 @@ def load_bytes( valid_frame_indices=valid_frame_indices, ) return frames, metadata + + +# Threshold for switching from sequential scan to per-frame seeking +# in PyAV backends. At 30 fps, 5000 frames ≈ 3 minutes. Scan holds +# the GIL continuously; seek releases it between frames, which is +# critical for concurrent serving with long videos. +_PYAV_SEEK_FRAME_THRESHOLD = 5000 + + +class PyAVVideoBackendMixin: + """Shared utilities for PyAV-based video backends. + + Decodes video using the PyAV library (Python bindings for FFmpeg). + Short videos are decoded via sequential scan; long videos use + per-frame seeking to release the GIL between frames and allow + concurrent request processing. + """ + + _video_backend_name: str + + @staticmethod + def _get_metadata( + container: "av.container.InputContainer", + ) -> VideoSourceMetadata: + """Extract metadata from an open PyAV container.""" + if not container.streams.video: + raise ValueError("No video streams found in container") + stream = container.streams.video[0] + total_frames = stream.frames or 0 + fps = float(stream.average_rate) if stream.average_rate else 0.0 + duration = float(stream.duration * stream.time_base) if stream.duration else 0.0 + + if total_frames == 0 and duration > 0 and fps > 0: + total_frames = int(duration * fps) + + return VideoSourceMetadata(total_frames, fps, duration) + + @staticmethod + def _decode_frames_scan( + container: "av.container.InputContainer", + frame_indices: list[int], + ) -> tuple[npt.NDArray, list[int]]: + """Decode frames by sequential scan (best for short videos).""" + target_set = set(frame_indices) + max_idx = max(frame_indices) + frames_list: list[npt.NDArray] = [] + valid_indices: list[int] = [] + + container.streams.video[0].thread_type = "AUTO" + for idx, frame in enumerate(container.decode(video=0)): + if idx > max_idx: + break + if idx in target_set: + frames_list.append(frame.to_ndarray(format="rgb24")) + valid_indices.append(idx) + + return np.stack(frames_list), valid_indices + + @staticmethod + def _decode_frames_seek( + container: "av.container.InputContainer", + frame_indices: list[int], + fps: float, + duration: float, + ) -> tuple[npt.NDArray, list[int]]: + """Decode frames by seeking to timestamps (best for long videos). + + Releases the GIL between frames (via seek + decode boundaries), + allowing other requests to make progress under concurrency. + """ + stream = container.streams.video[0] + # SLICE parallelizes within a single frame without the + # one-frame-per-thread latency penalty of FRAME threading. + stream.thread_type = "SLICE" + time_base = stream.time_base + frames_list: list[npt.NDArray] = [] + valid_indices: list[int] = [] + + frame_interval = 1.0 / fps if fps > 0 else 0.1 + max_ts = max(0.0, duration - frame_interval) if duration > 0 else float("inf") + + for idx in frame_indices: + ts = min(idx / fps, max_ts) if fps > 0 else 0.0 + pts = int(ts / time_base) + container.seek(pts, stream=stream) + frame = next(container.decode(video=0), None) + if frame is not None: + frames_list.append(frame.to_ndarray(format="rgb24")) + valid_indices.append(idx) + + return np.stack(frames_list), valid_indices + + @classmethod + def _decode_frames( + cls, + container: "av.container.InputContainer", + frame_indices: list[int], + source: VideoSourceMetadata, + seek_threshold: int = _PYAV_SEEK_FRAME_THRESHOLD, + ) -> tuple[npt.NDArray, list[int]]: + """Decode frames, choosing scan or seek based on video length. + + Scan holds the GIL continuously — fine for short videos. + Seek releases the GIL between frames, critical for concurrency + with long videos. + """ + if source.total_frames_num >= seek_threshold and source.original_fps > 0: + return cls._decode_frames_seek( + container, frame_indices, source.original_fps, source.duration + ) + return cls._decode_frames_scan(container, frame_indices) + + @classmethod + def _prepare_source( + cls, + source: VideoSourceMetadata, + ) -> VideoSourceMetadata: + return source + + @classmethod + def _load_bytes_impl( + cls, + data: bytes, + num_frames: int, + fps: int, + max_duration: int, + seek_threshold: int = _PYAV_SEEK_FRAME_THRESHOLD, + ) -> tuple[npt.NDArray, dict[str, Any]]: + """Shared implementation for all PyAV-based load_bytes methods.""" + with av.open(BytesIO(data)) as container: + raw_source = cls._get_metadata(container) + source = cls._prepare_source(raw_source) + + frame_idx = cls.compute_frames_index_to_sample( # type: ignore[attr-defined] + source=source, + target=VideoTargetMetadata(num_frames, fps, max_duration), + ) + frames, valid_frame_indices = cls._decode_frames( + container, frame_idx, source, seek_threshold + ) + + if len(valid_frame_indices) < len(frame_idx): + logger.warning( + "pyav video loading: expected %d frames but got %d.", + len(frame_idx), + len(valid_frame_indices), + ) + metadata = cls.create_hf_metadata( # type: ignore[attr-defined] + source=source, + video_backend=cls._video_backend_name, + valid_frame_indices=valid_frame_indices, + ) + return frames, metadata + + +@VIDEO_LOADER_REGISTRY.register("pyav") +class PyAVVideoBackend(VideoLoader, PyAVVideoBackendMixin): + """Video backend using PyAV (in-process FFmpeg bindings).""" + + _video_backend_name = "pyav" + + compute_frames_index_to_sample = OpenCVVideoBackend.compute_frames_index_to_sample + + @classmethod + def load_bytes( + cls, + data: bytes, + num_frames: int = -1, + fps: int = -1, + max_duration: int = 300, + seek_threshold: int = _PYAV_SEEK_FRAME_THRESHOLD, + **kwargs, + ) -> tuple[npt.NDArray, dict[str, Any]]: + return cls._load_bytes_impl(data, num_frames, fps, max_duration, seek_threshold) + + +@VIDEO_LOADER_REGISTRY.register("pyav_dynamic") +class PyAVDynamicVideoBackend(VideoLoader, PyAVVideoBackendMixin): + """Dynamic-sampling PyAV backend (mirrors OpenCVDynamicVideoBackend).""" + + _video_backend_name = "pyav_dynamic" + + @classmethod + def _prepare_source( + cls, + source: VideoSourceMetadata, + ) -> VideoSourceMetadata: + """Estimate duration from frame count and fps when not available.""" + if source.duration: + return source + if source.original_fps > 0: + max_frame_idx = source.total_frames_num - 1 + estimated_duration = round(max_frame_idx / source.original_fps) + 1 + else: + estimated_duration = 0 + return VideoSourceMetadata( + source.total_frames_num, + source.original_fps, + estimated_duration, + ) + + compute_frames_index_to_sample = ( + OpenCVDynamicVideoBackend.compute_frames_index_to_sample + ) + + @classmethod + def load_bytes( + cls, + data: bytes, + num_frames: int = -1, + fps: int = 2, + max_duration: int = 300, + seek_threshold: int = _PYAV_SEEK_FRAME_THRESHOLD, + **kwargs, + ) -> tuple[npt.NDArray, dict[str, Any]]: + return cls._load_bytes_impl(data, num_frames, fps, max_duration, seek_threshold) From 9d9ce8c4a77cb2397ad2b1e8905ee37237eb9ebe Mon Sep 17 00:00:00 2001 From: Jaseel Muhammad Date: Fri, 17 Apr 2026 09:24:48 +0000 Subject: [PATCH 2/7] Drop adaptive scan path, keep seek-only path in PyAV backend Signed-off-by: Jaseel Muhammad --- tests/multimodal/test_video.py | 45 +-------------------- vllm/envs.py | 4 +- vllm/multimodal/video.py | 71 ++++------------------------------ 3 files changed, 12 insertions(+), 108 deletions(-) diff --git a/tests/multimodal/test_video.py b/tests/multimodal/test_video.py index 10a87c7bcc09..4443cad9a088 100644 --- a/tests/multimodal/test_video.py +++ b/tests/multimodal/test_video.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import importlib.util from pathlib import Path import numpy as np @@ -16,8 +15,6 @@ from .utils import create_video_from_image -_has_pyav = importlib.util.find_spec("av") is not None - pytestmark = pytest.mark.cpu_test ASSETS_DIR = Path(__file__).parent / "assets" @@ -318,7 +315,6 @@ def dummy_video_path(tmp_path): # ============================================================================ -@pytest.mark.skipif(not _has_pyav, reason="PyAV not installed") def test_pyav_backend_loads_frames(dummy_video_path, monkeypatch: pytest.MonkeyPatch): """Test that the pyav backend can load frames from a valid video.""" with monkeypatch.context() as m: @@ -340,28 +336,6 @@ def test_pyav_backend_loads_frames(dummy_video_path, monkeypatch: pytest.MonkeyP assert "duration" in metadata -@pytest.mark.skipif(not _has_pyav, reason="PyAV not installed") -def test_pyav_backend_seek_path(dummy_video_path, monkeypatch: pytest.MonkeyPatch): - """Test that the PyAV seek path works by lowering the threshold.""" - import vllm.multimodal.video as video_mod - - with monkeypatch.context() as m: - m.setenv("VLLM_VIDEO_LOADER_BACKEND", "pyav") - m.setattr(video_mod, "_PYAV_SEEK_FRAME_THRESHOLD", 0) - - with open(dummy_video_path, "rb") as f: - video_data = f.read() - - loader = VIDEO_LOADER_REGISTRY.load("pyav") - frames, metadata = loader.load_bytes(video_data, num_frames=8) - - assert frames.ndim == 4 - assert frames.shape[3] == 3 # RGB - assert frames.shape[0] == 8 - assert frames.shape[0] == len(metadata["frames_indices"]) - - -@pytest.mark.skipif(not _has_pyav, reason="PyAV not installed") def test_pyav_dynamic_backend_loads_frames( dummy_video_path, monkeypatch: pytest.MonkeyPatch ): @@ -422,26 +396,13 @@ def test_pyav_dynamic_backend_loads_frames( id="molmo2-fps", ), # pyav: same sampling logic as opencv - pytest.param( - "pyav", - {"num_frames": 32}, - 32, - id="pyav-num_frames", - marks=pytest.mark.skipif(not _has_pyav, reason="PyAV not installed"), - ), - pytest.param( - "pyav", - {"fps": 2}, - 120, - id="pyav-fps", - marks=pytest.mark.skipif(not _has_pyav, reason="PyAV not installed"), - ), + pytest.param("pyav", {"num_frames": 32}, 32, id="pyav-num_frames"), + pytest.param("pyav", {"fps": 2}, 120, id="pyav-fps"), pytest.param( "pyav", {"num_frames": 500, "fps": 2}, 120, id="pyav-num_frames_wins_fps", - marks=pytest.mark.skipif(not _has_pyav, reason="PyAV not installed"), ), # pyav_dynamic: same sampling logic as opencv_dynamic pytest.param( @@ -449,14 +410,12 @@ def test_pyav_dynamic_backend_loads_frames( {"fps": 1, "max_duration": 60}, 60, id="pyav_dynamic-within_max_duration", - marks=pytest.mark.skipif(not _has_pyav, reason="PyAV not installed"), ), pytest.param( "pyav_dynamic", {"fps": 2, "max_duration": 30}, 60, id="pyav_dynamic-exceeds_max_duration", - marks=pytest.mark.skipif(not _has_pyav, reason="PyAV not installed"), ), ], ) diff --git a/vllm/envs.py b/vllm/envs.py index e8f47f7b13b5..a87d555cc15d 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -832,8 +832,8 @@ def _get_or_set_default() -> str: # Backend for Video IO # - "opencv": Default backend that uses OpenCV stream buffered backend. # - "pyav": PyAV backend using in-process FFmpeg bindings. - # Uses adaptive scan/seek: sequential scan for short videos, - # per-frame seeking for long videos to reduce GIL hold time. + # Decodes sampled frames via per-frame seek, releasing the GIL + # between frames for concurrent serving. # - "pyav_dynamic": Dynamic-sampling variant of the PyAV backend # (mirrors "opencv_dynamic" sampling logic). # - "identity": Returns raw video bytes for model processor to handle. diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 264c6b5ac97c..ed0652c6e356 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -966,20 +966,13 @@ def load_bytes( return frames, metadata -# Threshold for switching from sequential scan to per-frame seeking -# in PyAV backends. At 30 fps, 5000 frames ≈ 3 minutes. Scan holds -# the GIL continuously; seek releases it between frames, which is -# critical for concurrent serving with long videos. -_PYAV_SEEK_FRAME_THRESHOLD = 5000 - - class PyAVVideoBackendMixin: """Shared utilities for PyAV-based video backends. Decodes video using the PyAV library (Python bindings for FFmpeg). - Short videos are decoded via sequential scan; long videos use - per-frame seeking to release the GIL between frames and allow - concurrent request processing. + Frames are extracted via per-frame `container.seek()`, which + releases the GIL between frames and scales with the number of + sampled frames rather than the video length. """ _video_backend_name: str @@ -1002,38 +995,13 @@ def _get_metadata( return VideoSourceMetadata(total_frames, fps, duration) @staticmethod - def _decode_frames_scan( - container: "av.container.InputContainer", - frame_indices: list[int], - ) -> tuple[npt.NDArray, list[int]]: - """Decode frames by sequential scan (best for short videos).""" - target_set = set(frame_indices) - max_idx = max(frame_indices) - frames_list: list[npt.NDArray] = [] - valid_indices: list[int] = [] - - container.streams.video[0].thread_type = "AUTO" - for idx, frame in enumerate(container.decode(video=0)): - if idx > max_idx: - break - if idx in target_set: - frames_list.append(frame.to_ndarray(format="rgb24")) - valid_indices.append(idx) - - return np.stack(frames_list), valid_indices - - @staticmethod - def _decode_frames_seek( + def _decode_frames( container: "av.container.InputContainer", frame_indices: list[int], fps: float, duration: float, ) -> tuple[npt.NDArray, list[int]]: - """Decode frames by seeking to timestamps (best for long videos). - - Releases the GIL between frames (via seek + decode boundaries), - allowing other requests to make progress under concurrency. - """ + """Decode target frames via per-frame seek + keyframe decode.""" stream = container.streams.video[0] # SLICE parallelizes within a single frame without the # one-frame-per-thread latency penalty of FRAME threading. @@ -1056,26 +1024,6 @@ def _decode_frames_seek( return np.stack(frames_list), valid_indices - @classmethod - def _decode_frames( - cls, - container: "av.container.InputContainer", - frame_indices: list[int], - source: VideoSourceMetadata, - seek_threshold: int = _PYAV_SEEK_FRAME_THRESHOLD, - ) -> tuple[npt.NDArray, list[int]]: - """Decode frames, choosing scan or seek based on video length. - - Scan holds the GIL continuously — fine for short videos. - Seek releases the GIL between frames, critical for concurrency - with long videos. - """ - if source.total_frames_num >= seek_threshold and source.original_fps > 0: - return cls._decode_frames_seek( - container, frame_indices, source.original_fps, source.duration - ) - return cls._decode_frames_scan(container, frame_indices) - @classmethod def _prepare_source( cls, @@ -1090,7 +1038,6 @@ def _load_bytes_impl( num_frames: int, fps: int, max_duration: int, - seek_threshold: int = _PYAV_SEEK_FRAME_THRESHOLD, ) -> tuple[npt.NDArray, dict[str, Any]]: """Shared implementation for all PyAV-based load_bytes methods.""" with av.open(BytesIO(data)) as container: @@ -1102,7 +1049,7 @@ def _load_bytes_impl( target=VideoTargetMetadata(num_frames, fps, max_duration), ) frames, valid_frame_indices = cls._decode_frames( - container, frame_idx, source, seek_threshold + container, frame_idx, source.original_fps, source.duration ) if len(valid_frame_indices) < len(frame_idx): @@ -1134,10 +1081,9 @@ def load_bytes( num_frames: int = -1, fps: int = -1, max_duration: int = 300, - seek_threshold: int = _PYAV_SEEK_FRAME_THRESHOLD, **kwargs, ) -> tuple[npt.NDArray, dict[str, Any]]: - return cls._load_bytes_impl(data, num_frames, fps, max_duration, seek_threshold) + return cls._load_bytes_impl(data, num_frames, fps, max_duration) @VIDEO_LOADER_REGISTRY.register("pyav_dynamic") @@ -1176,7 +1122,6 @@ def load_bytes( num_frames: int = -1, fps: int = 2, max_duration: int = 300, - seek_threshold: int = _PYAV_SEEK_FRAME_THRESHOLD, **kwargs, ) -> tuple[npt.NDArray, dict[str, Any]]: - return cls._load_bytes_impl(data, num_frames, fps, max_duration, seek_threshold) + return cls._load_bytes_impl(data, num_frames, fps, max_duration) From 29ad2947ee5d3a5e787bc215a3c6aa27ec9307c8 Mon Sep 17 00:00:00 2001 From: Jaseel Muhammad Date: Sat, 18 Apr 2026 16:12:45 +0000 Subject: [PATCH 3/7] [Multimodal] separate video sampling algorithm from decode codec Signed-off-by: Jaseel Muhammad --- .../multimodal/processing/test_glm4_1v.py | 10 +- tests/multimodal/test_video.py | 89 ++-- vllm/envs.py | 12 +- vllm/multimodal/video.py | 421 +++++++----------- 4 files changed, 230 insertions(+), 302 deletions(-) diff --git a/tests/models/multimodal/processing/test_glm4_1v.py b/tests/models/multimodal/processing/test_glm4_1v.py index f70d00524275..6f8a4d48b884 100644 --- a/tests/models/multimodal/processing/test_glm4_1v.py +++ b/tests/models/multimodal/processing/test_glm4_1v.py @@ -6,7 +6,7 @@ from vllm.assets.video import VideoAsset from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import batched_tensors_equal -from vllm.multimodal.video import OpenCVDynamicVideoBackend, OpenCVVideoBackend +from vllm.multimodal.video import DynamicVideoBackend, VideoBackend from ...utils import build_model_context @@ -93,9 +93,11 @@ def test_video_loader_consistency( with open(video_path, "rb") as f: video_bytes = f.read() - static_video, static_metadata = OpenCVVideoBackend.load_bytes(video_bytes) - dynamic_video, dynamic_metadata = OpenCVDynamicVideoBackend.load_bytes( - video_bytes, fps=fps + static_video, static_metadata = VideoBackend.load_bytes( + video_bytes, backend="opencv" + ) + dynamic_video, dynamic_metadata = DynamicVideoBackend.load_bytes( + video_bytes, fps=fps, backend="opencv" ) # pre-sampled loader shouldn't read all frames diff --git a/tests/multimodal/test_video.py b/tests/multimodal/test_video.py index 4443cad9a088..e82883ece338 100644 --- a/tests/multimodal/test_video.py +++ b/tests/multimodal/test_video.py @@ -71,7 +71,9 @@ def test_video_backend_handles_broken_frames(monkeypatch: pytest.MonkeyPatch): video_data = f.read() loader = VIDEO_LOADER_REGISTRY.load("opencv") - frames, metadata = loader.load_bytes(video_data, num_frames=-1) + frames, metadata = loader.load_bytes( + video_data, num_frames=-1, backend="opencv" + ) # Verify metadata consistency: # frames_indices must match actual loaded frames @@ -158,12 +160,12 @@ def release(self): # Test WITHOUT recovery - should have fewer frames due to failures frames_no_recovery, meta_no = loader.load_bytes( - video_data, num_frames=8, frame_recovery=False + video_data, num_frames=8, frame_recovery=False, backend="opencv" ) # Test WITH recovery - should recover using next valid frames frames_with_recovery, meta_yes = loader.load_bytes( - video_data, num_frames=8, frame_recovery=True + video_data, num_frames=8, frame_recovery=True, backend="opencv" ) # With recovery should have MORE frames than without @@ -214,12 +216,12 @@ def test_video_recovery_with_corrupted_file(monkeypatch: pytest.MonkeyPatch): # Test without recovery - frame 17 will be skipped frames_no_recovery, meta_no_recovery = loader.load_bytes( - video_data, num_frames=8, frame_recovery=False + video_data, num_frames=8, frame_recovery=False, backend="opencv" ) # Test with recovery - frame 18 should fill in for frame 17 frames_with_recovery, meta_with_recovery = loader.load_bytes( - video_data, num_frames=8, frame_recovery=True + video_data, num_frames=8, frame_recovery=True, backend="opencv" ) # Verify metadata consistency for both modes @@ -271,12 +273,16 @@ def test_video_recovery_dynamic_backend(monkeypatch: pytest.MonkeyPatch): # Test without recovery frames_no_recovery, meta_no = loader.load_bytes( - video_data, fps=2, max_duration=10, frame_recovery=False + video_data, + fps=2, + max_duration=10, + frame_recovery=False, + backend="opencv", ) # Test with frame_recovery enabled frames_with_recovery, meta_with = loader.load_bytes( - video_data, fps=2, max_duration=10, frame_recovery=True + video_data, fps=2, max_duration=10, frame_recovery=True, backend="opencv" ) # Verify basic properties @@ -316,15 +322,15 @@ def dummy_video_path(tmp_path): def test_pyav_backend_loads_frames(dummy_video_path, monkeypatch: pytest.MonkeyPatch): - """Test that the pyav backend can load frames from a valid video.""" + """Test that the pyav codec backend can load frames from a valid video.""" with monkeypatch.context() as m: - m.setenv("VLLM_VIDEO_LOADER_BACKEND", "pyav") + m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv") with open(dummy_video_path, "rb") as f: video_data = f.read() - loader = VIDEO_LOADER_REGISTRY.load("pyav") - frames, metadata = loader.load_bytes(video_data, num_frames=8) + loader = VIDEO_LOADER_REGISTRY.load("opencv") + frames, metadata = loader.load_bytes(video_data, num_frames=8, backend="pyav") assert frames.ndim == 4 assert frames.shape[3] == 3 # RGB @@ -339,15 +345,17 @@ def test_pyav_backend_loads_frames(dummy_video_path, monkeypatch: pytest.MonkeyP def test_pyav_dynamic_backend_loads_frames( dummy_video_path, monkeypatch: pytest.MonkeyPatch ): - """Test that the pyav_dynamic backend can load frames.""" + """Test that the pyav codec with dynamic sampling can load frames.""" with monkeypatch.context() as m: - m.setenv("VLLM_VIDEO_LOADER_BACKEND", "pyav_dynamic") + m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv_dynamic") with open(dummy_video_path, "rb") as f: video_data = f.read() - loader = VIDEO_LOADER_REGISTRY.load("pyav_dynamic") - frames, metadata = loader.load_bytes(video_data, fps=2, max_duration=10) + loader = VIDEO_LOADER_REGISTRY.load("opencv_dynamic") + frames, metadata = loader.load_bytes( + video_data, fps=2, max_duration=10, backend="pyav" + ) assert frames.ndim == 4 assert frames.shape[3] == 3 # RGB @@ -357,26 +365,32 @@ def test_pyav_dynamic_backend_loads_frames( @pytest.mark.parametrize( - "backend, kwargs, expected_num_frames", + "loader_key, kwargs, expected_num_frames", [ - # opencv: num_frames directly controls count - pytest.param("opencv", {"num_frames": 32}, 32, id="opencv-num_frames"), - pytest.param("opencv", {"fps": 2}, 120, id="opencv-fps"), + # uniform sampling + opencv codec pytest.param( "opencv", - {"num_frames": 500, "fps": 2}, + {"num_frames": 32, "backend": "opencv"}, + 32, + id="opencv-num_frames", + ), + pytest.param("opencv", {"fps": 2, "backend": "opencv"}, 120, id="opencv-fps"), + pytest.param( + "opencv", + {"num_frames": 500, "fps": 2, "backend": "opencv"}, 120, id="opencv-num_frames_wins_fps", ), + # dynamic sampling + opencv codec pytest.param( "opencv_dynamic", - {"fps": 1, "max_duration": 60}, + {"fps": 1, "max_duration": 60, "backend": "opencv"}, 60, id="opencv_dynamic-within_max_duration", ), pytest.param( "opencv_dynamic", - {"fps": 2, "max_duration": 30}, + {"fps": 2, "max_duration": 30, "backend": "opencv"}, 60, id="opencv_dynamic-exceeds_max_duration", ), @@ -395,25 +409,30 @@ def test_pyav_dynamic_backend_loads_frames( 119, id="molmo2-fps", ), - # pyav: same sampling logic as opencv - pytest.param("pyav", {"num_frames": 32}, 32, id="pyav-num_frames"), - pytest.param("pyav", {"fps": 2}, 120, id="pyav-fps"), + # uniform sampling + pyav codec (same frame counts as opencv) + pytest.param( + "opencv", + {"num_frames": 32, "backend": "pyav"}, + 32, + id="pyav-num_frames", + ), + pytest.param("opencv", {"fps": 2, "backend": "pyav"}, 120, id="pyav-fps"), pytest.param( - "pyav", - {"num_frames": 500, "fps": 2}, + "opencv", + {"num_frames": 500, "fps": 2, "backend": "pyav"}, 120, id="pyav-num_frames_wins_fps", ), - # pyav_dynamic: same sampling logic as opencv_dynamic + # dynamic sampling + pyav codec pytest.param( - "pyav_dynamic", - {"fps": 1, "max_duration": 60}, + "opencv_dynamic", + {"fps": 1, "max_duration": 60, "backend": "pyav"}, 60, id="pyav_dynamic-within_max_duration", ), pytest.param( - "pyav_dynamic", - {"fps": 2, "max_duration": 30}, + "opencv_dynamic", + {"fps": 2, "max_duration": 30, "backend": "pyav"}, 60, id="pyav_dynamic-exceeds_max_duration", ), @@ -422,13 +441,13 @@ def test_pyav_dynamic_backend_loads_frames( def test_video_loader_frames_sampling( dummy_video_path, monkeypatch: pytest.MonkeyPatch, - backend: str, + loader_key: str, kwargs: dict, expected_num_frames: int, ): """Test video loader frames sampling functionality.""" - monkeypatch.setenv("VLLM_VIDEO_LOADER_BACKEND", backend) - loader = VIDEO_LOADER_REGISTRY.load(backend) + monkeypatch.setenv("VLLM_VIDEO_LOADER_BACKEND", loader_key) + loader = VIDEO_LOADER_REGISTRY.load(loader_key) with open(dummy_video_path, "rb") as f: long_video_bytes = f.read() diff --git a/vllm/envs.py b/vllm/envs.py index a87d555cc15d..072a9269be47 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -829,14 +829,10 @@ def _get_or_set_default() -> str: "VLLM_MAX_AUDIO_CLIP_FILESIZE_MB": lambda: int( os.getenv("VLLM_MAX_AUDIO_CLIP_FILESIZE_MB", "25") ), - # Backend for Video IO - # - "opencv": Default backend that uses OpenCV stream buffered backend. - # - "pyav": PyAV backend using in-process FFmpeg bindings. - # Decodes sampled frames via per-frame seek, releasing the GIL - # between frames for concurrent serving. - # - "pyav_dynamic": Dynamic-sampling variant of the PyAV backend - # (mirrors "opencv_dynamic" sampling logic). - # - "identity": Returns raw video bytes for model processor to handle. + # Backend for Video IO — selects the frame-sampling algorithm. + # - "opencv": uniform sampling. + # - "opencv_dynamic": duration-aware dynamic sampling. + # - "identity": returns raw video bytes for model processor to handle. # # Custom backend implementations can be registered # via `@VIDEO_LOADER_REGISTRY.register("my_custom_video_loader")` and diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index ed0652c6e356..ca50b4d981a3 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -3,7 +3,7 @@ import math from abc import abstractmethod from io import BytesIO -from typing import Any, NamedTuple, cast +from typing import Any, ClassVar, Literal, NamedTuple, cast import numpy as np import numpy.typing as npt @@ -360,8 +360,75 @@ def read_frames( return frames, valid_frame_indices +class PyAVVideoBackendMixin: + """PyAV (in-process FFmpeg bindings) codec utilities. + + Reads stream metadata and decodes target frames via per-frame + ``container.seek()``. The seek releases the GIL between frames and + scales with the number of sampled frames rather than the video + length, enabling concurrent decoding under serving load. + """ + + @staticmethod + def get_metadata( + container: "av.container.InputContainer", + ) -> VideoSourceMetadata: + if not container.streams.video: + raise ValueError("No video streams found in container") + stream = container.streams.video[0] + total_frames = stream.frames or 0 + fps = float(stream.average_rate) if stream.average_rate else 0.0 + duration = float(stream.duration * stream.time_base) if stream.duration else 0.0 + if total_frames == 0 and duration > 0 and fps > 0: + total_frames = int(duration * fps) + return VideoSourceMetadata(total_frames, fps, duration) + + @staticmethod + def decode_frames( + container: "av.container.InputContainer", + frame_indices: list[int], + fps: float, + duration: float, + ) -> tuple[npt.NDArray, list[int]]: + """Decode target frames via per-frame seek + keyframe decode.""" + stream = container.streams.video[0] + # SLICE parallelizes within a single frame without the + # one-frame-per-thread latency penalty of FRAME threading. + stream.thread_type = "SLICE" + time_base = stream.time_base + + frames_list: list[npt.NDArray] = [] + valid_indices: list[int] = [] + frame_interval = 1.0 / fps if fps > 0 else 0.1 + max_ts = max(0.0, duration - frame_interval) if duration > 0 else float("inf") + + for idx in frame_indices: + ts = min(idx / fps, max_ts) if fps > 0 else 0.0 + pts = int(ts / time_base) + container.seek(pts, stream=stream) + frame = next(container.decode(video=0), None) + if frame is not None: + frames_list.append(frame.to_ndarray(format="rgb24")) + valid_indices.append(idx) + + if not frames_list: + return np.empty((0,), dtype=np.uint8), valid_indices + return np.stack(frames_list), valid_indices + + @VIDEO_LOADER_REGISTRY.register("opencv") -class OpenCVVideoBackend(VideoLoader, OpenCVVideoBackendMixin): +class VideoBackend(VideoLoader, OpenCVVideoBackendMixin, PyAVVideoBackendMixin): + """Uniform-sampling video backend. + + Samples ``num_frames`` uniformly across the video (or one frame every + ``1/fps`` seconds, whichever produces fewer frames). The decoding codec + is selected via the ``backend`` kwarg (``"opencv"`` or ``"pyav"``), + which can be passed through ``--media-io-kwargs``. Defaults to + ``"pyav"`` for concurrent decoding. + """ + + _sampling_suffix: ClassVar[str] = "" + @classmethod def compute_frames_index_to_sample( cls, @@ -371,7 +438,6 @@ def compute_frames_index_to_sample( ) -> list[int]: total_frames_num = source.total_frames_num duration = source.duration - num_frames = target.num_frames fps = target.fps # resample video to target num_frames and fps @@ -381,16 +447,18 @@ def compute_frames_index_to_sample( num_frames_to_sample = min(num_frames, total_frames_num) if fps > 0: num_frames_to_sample = min(num_frames_to_sample, math.floor(duration * fps)) - num_frames_to_sample = max(1, num_frames_to_sample) # at least one sample + num_frames_to_sample = max(1, num_frames_to_sample) if num_frames_to_sample == total_frames_num: - frame_idx = list(range(0, num_frames_to_sample)) - else: - uniform_sampled_frames = np.linspace( - 0, total_frames_num - 1, num_frames_to_sample, dtype=int - ) - frame_idx = uniform_sampled_frames.tolist() - return frame_idx + return list(range(num_frames_to_sample)) + return np.linspace( + 0, total_frames_num - 1, num_frames_to_sample, dtype=int + ).tolist() + + @classmethod + def _prepare_source(cls, source: VideoSourceMetadata) -> VideoSourceMetadata: + """Sampling-algorithm-specific metadata adjustment hook.""" + return source @classmethod def load_bytes( @@ -400,55 +468,98 @@ def load_bytes( fps: int = -1, max_duration: int = 300, frame_recovery: bool = False, + *, + backend: Literal["opencv", "pyav"] = "pyav", **kwargs, ) -> tuple[npt.NDArray, dict[str, Any]]: - """ - Load video frames from bytes. + """Load sampled frames from raw video bytes. Args: - data: Raw video bytes - num_frames: Target number of frames to sample (-1 for all) - fps: Target FPS for sampling (-1 for original) - max_duration: Maximum duration (unused in base backend) - frame_recovery: Enable forward-scan recovery for failed frames + data: Raw video bytes. + num_frames: Target number of frames to sample (``-1`` for all). + fps: Target FPS for sampling (``-1`` for original). + max_duration: Maximum duration in seconds — only used by the + dynamic subclass; ignored here. + frame_recovery: Enable forward-scan recovery for failed frames. + Only honored by the OpenCV codec. + backend: Decoding codec — ``"opencv"`` or ``"pyav"`` . Returns: - Tuple of (frames_array, metadata_dict) + Tuple of ``(frames_array, metadata_dict)``. """ - cap = cls.open_video_capture(data) - - source = OpenCVVideoBackendMixin.get_video_metadata(cap) target = VideoTargetMetadata( - num_frames=num_frames, - fps=fps, - max_duration=max_duration, + num_frames=num_frames, fps=fps, max_duration=max_duration ) - # resample video to target num_frames and fps - # - the minimum of the two will be used - frame_idx = cls.compute_frames_index_to_sample( - source=source, - target=target, - ) + if backend == "opencv": + cap = cls.open_video_capture(data) + source = cls._prepare_source(cls.get_video_metadata(cap)) + frame_idx = cls.compute_frames_index_to_sample( + source=source, target=target, **kwargs + ) + frames, valid = cls.read_frames( + cap, + frame_idx, + total_frames_num=source.total_frames_num, + frame_recovery=frame_recovery, + ) + elif backend == "pyav": + with av.open(BytesIO(data)) as container: + source = cls._prepare_source(cls.get_metadata(container)) + frame_idx = cls.compute_frames_index_to_sample( + source=source, target=target, **kwargs + ) + frames, valid = cls.decode_frames( + container, frame_idx, source.original_fps, source.duration + ) + else: + raise ValueError( + f"Unknown video codec backend {backend!r}; " + "valid options: 'opencv', 'pyav'." + ) - frames, valid_frame_indices = cls.read_frames( - cap, - frame_idx, - total_frames_num=source.total_frames_num, - frame_recovery=frame_recovery, - ) + if len(valid) < len(frame_idx): + logger.warning( + "%s video loading: expected %d frames but got %d.", + backend, + len(frame_idx), + len(valid), + ) - metadata = cls.create_hf_metadata( + return frames, cls.create_hf_metadata( source=source, - video_backend="opencv", - valid_frame_indices=valid_frame_indices, + video_backend=f"{backend}{cls._sampling_suffix}", + valid_frame_indices=valid, ) - return frames, metadata - @VIDEO_LOADER_REGISTRY.register("opencv_dynamic") -class OpenCVDynamicVideoBackend(VideoLoader, OpenCVVideoBackendMixin): +class DynamicVideoBackend(VideoBackend): + """Duration-aware dynamic-sampling video backend. + + Samples at ``fps`` up to ``max_duration`` seconds, falling back to + uniform sampling across the full duration when the video is longer + than ``max_duration``. Codec is selectable the same way as + :class:`VideoBackend`. + """ + + _sampling_suffix: ClassVar[str] = "_dynamic" + + @classmethod + def _prepare_source(cls, source: VideoSourceMetadata) -> VideoSourceMetadata: + # Estimate duration from frame count and fps when the container + # does not report it (common for WebM/streaming inputs). + if source.duration: + return source + if source.original_fps > 0: + max_frame_idx = source.total_frames_num - 1 + duration = round(max_frame_idx / source.original_fps) + 1 + else: + duration = 0 + return VideoSourceMetadata( + source.total_frames_num, source.original_fps, duration + ) + @classmethod def compute_frames_index_to_sample( cls, @@ -461,8 +572,8 @@ def compute_frames_index_to_sample( original_fps = source.original_fps max_duration = target.max_duration fps = target.fps - max_frame_idx = source.total_frames_num - 1 + # Refer to: # https://github.com/huggingface/transformers/blob/v4.55.4/src/transformers/models/glm4v/video_processing_glm4v.py#L103-L140 frame_indices_list: list[int] @@ -496,62 +607,20 @@ def load_bytes( fps: int = 2, max_duration: int = 300, frame_recovery: bool = False, + *, + backend: Literal["opencv", "pyav"] = "pyav", **kwargs, ) -> tuple[npt.NDArray, dict[str, Any]]: - """ - Load video frames with dynamic sampling based on duration. - - Args: - data: Raw video bytes - num_frames: Not used in dynamic backend - fps: Target FPS for sampling (default: 2) - max_duration: Maximum video duration to process (default: 300s) - frame_recovery: Enable forward-scan recovery for failed frames - - Returns: - Tuple of (frames_array, metadata_dict) - """ - cap = cls.open_video_capture(data) - - orig_source = OpenCVVideoBackendMixin.get_video_metadata(cap) - max_frame_idx = orig_source.total_frames_num - 1 - duration = ( - orig_source.duration or round(max_frame_idx / orig_source.original_fps) + 1 - ) - - # recompute source metadata with adjusted duration to ensure correct - # sampling indices computation - source = VideoSourceMetadata( - total_frames_num=orig_source.total_frames_num, - original_fps=orig_source.original_fps, - duration=duration, - ) - target = VideoTargetMetadata( + return super().load_bytes( + data, num_frames=num_frames, fps=fps, max_duration=max_duration, - ) - - frame_indices_list = cls.compute_frames_index_to_sample( - source=source, - target=target, - ) - - frames, valid_frame_indices = cls.read_frames( - cap, - frame_indices_list, - total_frames_num=source.total_frames_num, frame_recovery=frame_recovery, + backend=backend, + **kwargs, ) - metadata = cls.create_hf_metadata( - source=source, - video_backend="opencv_dynamic", - valid_frame_indices=valid_frame_indices, - ) - - return frames, metadata - @VIDEO_LOADER_REGISTRY.register("molmo2") class Molmo2VideoBackend(VideoLoader, OpenCVVideoBackendMixin): @@ -840,7 +909,7 @@ def load_bytes( @VIDEO_LOADER_REGISTRY.register("nemotron_vl") -class NemotronVLVideoBackend(OpenCVVideoBackend): +class NemotronVLVideoBackend(VideoBackend): @classmethod def load_bytes( cls, @@ -849,14 +918,17 @@ def load_bytes( fps: int = -1, max_duration: int = 300, frame_recovery: bool = False, + *, + backend: Literal["opencv", "pyav"] = "opencv", **kwargs, ) -> tuple[npt.NDArray, dict[str, Any]]: - frames, metadata = OpenCVVideoBackend.load_bytes( + frames, metadata = super().load_bytes( data, num_frames=num_frames, fps=fps, max_duration=max_duration, frame_recovery=frame_recovery, + backend=backend, **kwargs, ) @@ -964,164 +1036,3 @@ def load_bytes( valid_frame_indices=valid_frame_indices, ) return frames, metadata - - -class PyAVVideoBackendMixin: - """Shared utilities for PyAV-based video backends. - - Decodes video using the PyAV library (Python bindings for FFmpeg). - Frames are extracted via per-frame `container.seek()`, which - releases the GIL between frames and scales with the number of - sampled frames rather than the video length. - """ - - _video_backend_name: str - - @staticmethod - def _get_metadata( - container: "av.container.InputContainer", - ) -> VideoSourceMetadata: - """Extract metadata from an open PyAV container.""" - if not container.streams.video: - raise ValueError("No video streams found in container") - stream = container.streams.video[0] - total_frames = stream.frames or 0 - fps = float(stream.average_rate) if stream.average_rate else 0.0 - duration = float(stream.duration * stream.time_base) if stream.duration else 0.0 - - if total_frames == 0 and duration > 0 and fps > 0: - total_frames = int(duration * fps) - - return VideoSourceMetadata(total_frames, fps, duration) - - @staticmethod - def _decode_frames( - container: "av.container.InputContainer", - frame_indices: list[int], - fps: float, - duration: float, - ) -> tuple[npt.NDArray, list[int]]: - """Decode target frames via per-frame seek + keyframe decode.""" - stream = container.streams.video[0] - # SLICE parallelizes within a single frame without the - # one-frame-per-thread latency penalty of FRAME threading. - stream.thread_type = "SLICE" - time_base = stream.time_base - frames_list: list[npt.NDArray] = [] - valid_indices: list[int] = [] - - frame_interval = 1.0 / fps if fps > 0 else 0.1 - max_ts = max(0.0, duration - frame_interval) if duration > 0 else float("inf") - - for idx in frame_indices: - ts = min(idx / fps, max_ts) if fps > 0 else 0.0 - pts = int(ts / time_base) - container.seek(pts, stream=stream) - frame = next(container.decode(video=0), None) - if frame is not None: - frames_list.append(frame.to_ndarray(format="rgb24")) - valid_indices.append(idx) - - return np.stack(frames_list), valid_indices - - @classmethod - def _prepare_source( - cls, - source: VideoSourceMetadata, - ) -> VideoSourceMetadata: - return source - - @classmethod - def _load_bytes_impl( - cls, - data: bytes, - num_frames: int, - fps: int, - max_duration: int, - ) -> tuple[npt.NDArray, dict[str, Any]]: - """Shared implementation for all PyAV-based load_bytes methods.""" - with av.open(BytesIO(data)) as container: - raw_source = cls._get_metadata(container) - source = cls._prepare_source(raw_source) - - frame_idx = cls.compute_frames_index_to_sample( # type: ignore[attr-defined] - source=source, - target=VideoTargetMetadata(num_frames, fps, max_duration), - ) - frames, valid_frame_indices = cls._decode_frames( - container, frame_idx, source.original_fps, source.duration - ) - - if len(valid_frame_indices) < len(frame_idx): - logger.warning( - "pyav video loading: expected %d frames but got %d.", - len(frame_idx), - len(valid_frame_indices), - ) - metadata = cls.create_hf_metadata( # type: ignore[attr-defined] - source=source, - video_backend=cls._video_backend_name, - valid_frame_indices=valid_frame_indices, - ) - return frames, metadata - - -@VIDEO_LOADER_REGISTRY.register("pyav") -class PyAVVideoBackend(VideoLoader, PyAVVideoBackendMixin): - """Video backend using PyAV (in-process FFmpeg bindings).""" - - _video_backend_name = "pyav" - - compute_frames_index_to_sample = OpenCVVideoBackend.compute_frames_index_to_sample - - @classmethod - def load_bytes( - cls, - data: bytes, - num_frames: int = -1, - fps: int = -1, - max_duration: int = 300, - **kwargs, - ) -> tuple[npt.NDArray, dict[str, Any]]: - return cls._load_bytes_impl(data, num_frames, fps, max_duration) - - -@VIDEO_LOADER_REGISTRY.register("pyav_dynamic") -class PyAVDynamicVideoBackend(VideoLoader, PyAVVideoBackendMixin): - """Dynamic-sampling PyAV backend (mirrors OpenCVDynamicVideoBackend).""" - - _video_backend_name = "pyav_dynamic" - - @classmethod - def _prepare_source( - cls, - source: VideoSourceMetadata, - ) -> VideoSourceMetadata: - """Estimate duration from frame count and fps when not available.""" - if source.duration: - return source - if source.original_fps > 0: - max_frame_idx = source.total_frames_num - 1 - estimated_duration = round(max_frame_idx / source.original_fps) + 1 - else: - estimated_duration = 0 - return VideoSourceMetadata( - source.total_frames_num, - source.original_fps, - estimated_duration, - ) - - compute_frames_index_to_sample = ( - OpenCVDynamicVideoBackend.compute_frames_index_to_sample - ) - - @classmethod - def load_bytes( - cls, - data: bytes, - num_frames: int = -1, - fps: int = 2, - max_duration: int = 300, - **kwargs, - ) -> tuple[npt.NDArray, dict[str, Any]]: - return cls._load_bytes_impl(data, num_frames, fps, max_duration) From 1677d1769e98fba82c5d0d646004d6332dbd04f9 Mon Sep 17 00:00:00 2001 From: Jaseel Muhammad Date: Sat, 18 Apr 2026 20:44:54 +0400 Subject: [PATCH 4/7] Update vllm/multimodal/video.py Co-authored-by: Isotr0py <2037008807@qq.com> Signed-off-by: Jaseel Muhammad --- vllm/multimodal/video.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index ca50b4d981a3..820b06631713 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -504,6 +504,7 @@ def load_bytes( frame_recovery=frame_recovery, ) elif backend == "pyav": + assert not frame_recovery, "frame_recovery is only available for `opencv` backend" with av.open(BytesIO(data)) as container: source = cls._prepare_source(cls.get_metadata(container)) frame_idx = cls.compute_frames_index_to_sample( From 9eabc60727149576d56cfcb82dc53d1ee148f0f9 Mon Sep 17 00:00:00 2001 From: Jaseel Muhammad Date: Sat, 18 Apr 2026 16:49:01 +0000 Subject: [PATCH 5/7] [Test][GLM-4.1V] Parameterize video loader consistency test over opencv/pyav codecs Signed-off-by: Jaseel Muhammad --- tests/models/multimodal/processing/test_glm4_1v.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/models/multimodal/processing/test_glm4_1v.py b/tests/models/multimodal/processing/test_glm4_1v.py index 6f8a4d48b884..5798c5663472 100644 --- a/tests/models/multimodal/processing/test_glm4_1v.py +++ b/tests/models/multimodal/processing/test_glm4_1v.py @@ -70,9 +70,11 @@ def test_processor_override( @pytest.mark.parametrize("model_id", ["zai-org/GLM-4.1V-9B-Thinking"]) @pytest.mark.parametrize("fps", [2]) +@pytest.mark.parametrize("backend", ["opencv", "pyav"]) def test_video_loader_consistency( model_id: str, fps: int, + backend: str, ): """ Ensure dynamic video loader (pre-sampled by loader) and normal video @@ -94,10 +96,10 @@ def test_video_loader_consistency( video_bytes = f.read() static_video, static_metadata = VideoBackend.load_bytes( - video_bytes, backend="opencv" + video_bytes, backend=backend ) dynamic_video, dynamic_metadata = DynamicVideoBackend.load_bytes( - video_bytes, fps=fps, backend="opencv" + video_bytes, fps=fps, backend=backend ) # pre-sampled loader shouldn't read all frames From 0823582abc440bb8d08f083785a47999ab3df384 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Wed, 22 Apr 2026 00:32:18 +0800 Subject: [PATCH 6/7] set opencv as default backend Co-authored-by: Isotr0py <2037008807@qq.com> Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/multimodal/video.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 820b06631713..dfe83c4b6b6b 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -469,7 +469,7 @@ def load_bytes( max_duration: int = 300, frame_recovery: bool = False, *, - backend: Literal["opencv", "pyav"] = "pyav", + backend: Literal["opencv", "pyav"] = "opencv", **kwargs, ) -> tuple[npt.NDArray, dict[str, Any]]: """Load sampled frames from raw video bytes. @@ -609,7 +609,7 @@ def load_bytes( max_duration: int = 300, frame_recovery: bool = False, *, - backend: Literal["opencv", "pyav"] = "pyav", + backend: Literal["opencv", "pyav"] = "opencv", **kwargs, ) -> tuple[npt.NDArray, dict[str, Any]]: return super().load_bytes( From 5ee1a7d0d4066e89e8b26128c4b9a48b0e66f1f3 Mon Sep 17 00:00:00 2001 From: Jaseel Muhammad Date: Tue, 21 Apr 2026 18:18:07 +0000 Subject: [PATCH 7/7] run pre-commit Signed-off-by: Jaseel Muhammad --- vllm/multimodal/video.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index dfe83c4b6b6b..5b118af8fc53 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -504,7 +504,9 @@ def load_bytes( frame_recovery=frame_recovery, ) elif backend == "pyav": - assert not frame_recovery, "frame_recovery is only available for `opencv` backend" + assert not frame_recovery, ( + "frame_recovery is only available for `opencv` backend" + ) with av.open(BytesIO(data)) as container: source = cls._prepare_source(cls.get_metadata(container)) frame_idx = cls.compute_frames_index_to_sample(