vllm-project · vllm-bot · Apr 22, 2026 · Apr 16, 2026 · Apr 17, 2026 · Apr 18, 2026
@@ -6,7 +6,7 @@
 from vllm.assets.video import VideoAsset
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import batched_tensors_equal
-from vllm.multimodal.video import OpenCVDynamicVideoBackend, OpenCVVideoBackend
+from vllm.multimodal.video import DynamicVideoBackend, VideoBackend
 
 from ...utils import build_model_context
 
@@ -70,9 +70,11 @@ def test_processor_override(
 
 @pytest.mark.parametrize("model_id", ["zai-org/GLM-4.1V-9B-Thinking"])
 @pytest.mark.parametrize("fps", [2])
+@pytest.mark.parametrize("backend", ["opencv", "pyav"])
 def test_video_loader_consistency(
     model_id: str,
     fps: int,
+    backend: str,
 ):
     """
     Ensure dynamic video loader (pre-sampled by loader) and normal video
@@ -93,9 +95,11 @@ def test_video_loader_consistency(
     with open(video_path, "rb") as f:
         video_bytes = f.read()
 
-    static_video, static_metadata = OpenCVVideoBackend.load_bytes(video_bytes)
-    dynamic_video, dynamic_metadata = OpenCVDynamicVideoBackend.load_bytes(
-        video_bytes, fps=fps
+    static_video, static_metadata = VideoBackend.load_bytes(
+        video_bytes, backend=backend
+    )
+    dynamic_video, dynamic_metadata = DynamicVideoBackend.load_bytes(
+        video_bytes, fps=fps, backend=backend
     )
 
     # pre-sampled loader shouldn't read all frames

@@ -71,7 +71,9 @@ def test_video_backend_handles_broken_frames(monkeypatch: pytest.MonkeyPatch):
             video_data = f.read()
 
         loader = VIDEO_LOADER_REGISTRY.load("opencv")
-        frames, metadata = loader.load_bytes(video_data, num_frames=-1)
+        frames, metadata = loader.load_bytes(
+            video_data, num_frames=-1, backend="opencv"
+        )
 
         # Verify metadata consistency:
         # frames_indices must match actual loaded frames
@@ -158,12 +160,12 @@ def release(self):
 
         # Test WITHOUT recovery - should have fewer frames due to failures
         frames_no_recovery, meta_no = loader.load_bytes(
-            video_data, num_frames=8, frame_recovery=False
+            video_data, num_frames=8, frame_recovery=False, backend="opencv"
         )
 
         # Test WITH recovery - should recover using next valid frames
         frames_with_recovery, meta_yes = loader.load_bytes(
-            video_data, num_frames=8, frame_recovery=True
+            video_data, num_frames=8, frame_recovery=True, backend="opencv"
         )
 
         # With recovery should have MORE frames than without
@@ -214,12 +216,12 @@ def test_video_recovery_with_corrupted_file(monkeypatch: pytest.MonkeyPatch):
 
         # Test without recovery - frame 17 will be skipped
         frames_no_recovery, meta_no_recovery = loader.load_bytes(
-            video_data, num_frames=8, frame_recovery=False
+            video_data, num_frames=8, frame_recovery=False, backend="opencv"
         )
 
         # Test with recovery - frame 18 should fill in for frame 17
         frames_with_recovery, meta_with_recovery = loader.load_bytes(
-            video_data, num_frames=8, frame_recovery=True
+            video_data, num_frames=8, frame_recovery=True, backend="opencv"
         )
 
         # Verify metadata consistency for both modes
@@ -271,12 +273,16 @@ def test_video_recovery_dynamic_backend(monkeypatch: pytest.MonkeyPatch):
 
         # Test without recovery
         frames_no_recovery, meta_no = loader.load_bytes(
-            video_data, fps=2, max_duration=10, frame_recovery=False
+            video_data,
+            fps=2,
+            max_duration=10,
+            frame_recovery=False,
+            backend="opencv",
         )
 
         # Test with frame_recovery enabled
         frames_with_recovery, meta_with = loader.load_bytes(
-            video_data, fps=2, max_duration=10, frame_recovery=True
+            video_data, fps=2, max_duration=10, frame_recovery=True, backend="opencv"
         )
 
         # Verify basic properties
@@ -310,27 +316,81 @@ def dummy_video_path(tmp_path):
     return video_path
 
 
+# ============================================================================
+# PyAV Backend Tests
+# ============================================================================
+
+
+def test_pyav_backend_loads_frames(dummy_video_path, monkeypatch: pytest.MonkeyPatch):
+    """Load 8 frames via the "opencv" loader with backend="pyav" and check metadata."""
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv")  # loader key; codec set below
+
+        with open(dummy_video_path, "rb") as f:
+            video_data = f.read()
+
+        loader = VIDEO_LOADER_REGISTRY.load("opencv")
+        frames, metadata = loader.load_bytes(video_data, num_frames=8, backend="pyav")
+
+        assert frames.ndim == 4
+        assert frames.shape[3] == 3  # RGB
+        assert frames.shape[0] == 8  # matches the num_frames requested above
+        assert frames.shape[0] == len(metadata["frames_indices"])
+        assert metadata["video_backend"] == "pyav"  # reported codec follows backend=
+        assert "total_num_frames" in metadata
+        assert "fps" in metadata
+        assert "duration" in metadata
+
+
+def test_pyav_dynamic_backend_loads_frames(
+    dummy_video_path, monkeypatch: pytest.MonkeyPatch
+):
+    """Check the "opencv_dynamic" loader returns frames when backend="pyav"."""
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv_dynamic")  # loader key only
+
+        with open(dummy_video_path, "rb") as f:
+            video_data = f.read()
+
+        loader = VIDEO_LOADER_REGISTRY.load("opencv_dynamic")
+        frames, metadata = loader.load_bytes(
+            video_data, fps=2, max_duration=10, backend="pyav"
+        )
+
+        assert frames.ndim == 4
+        assert frames.shape[3] == 3  # RGB
+        assert frames.shape[0] > 0  # exact count is not pinned here
+        assert frames.shape[0] == len(metadata["frames_indices"])
+        assert metadata["video_backend"] == "pyav_dynamic"  # dynamic variant of "pyav"
+
+
 @pytest.mark.parametrize(
-    "backend, kwargs, expected_num_frames",
+    "loader_key, kwargs, expected_num_frames",
     [
-        # opencv: num_frames directly controls count
-        pytest.param("opencv", {"num_frames": 32}, 32, id="opencv-num_frames"),
-        pytest.param("opencv", {"fps": 2}, 120, id="opencv-fps"),
+        # uniform sampling + opencv codec
+        pytest.param(
+            "opencv",
+            {"num_frames": 32, "backend": "opencv"},
+            32,
+            id="opencv-num_frames",
+        ),
+        pytest.param("opencv", {"fps": 2, "backend": "opencv"}, 120, id="opencv-fps"),
         pytest.param(
             "opencv",
-            {"num_frames": 500, "fps": 2},
+            {"num_frames": 500, "fps": 2, "backend": "opencv"},
             120,
             id="opencv-num_frames_wins_fps",
         ),
+        # dynamic sampling + opencv codec
         pytest.param(
             "opencv_dynamic",
-            {"fps": 1, "max_duration": 60},
+            {"fps": 1, "max_duration": 60, "backend": "opencv"},
             60,
             id="opencv_dynamic-within_max_duration",
         ),
         pytest.param(
             "opencv_dynamic",
-            {"fps": 2, "max_duration": 30},
+            {"fps": 2, "max_duration": 30, "backend": "opencv"},
             60,
             id="opencv_dynamic-exceeds_max_duration",
         ),
@@ -349,18 +409,45 @@ def dummy_video_path(tmp_path):
             119,
             id="molmo2-fps",
         ),
+        # uniform sampling + pyav codec (same frame counts as opencv)
+        pytest.param(
+            "opencv",
+            {"num_frames": 32, "backend": "pyav"},
+            32,
+            id="pyav-num_frames",
+        ),
+        pytest.param("opencv", {"fps": 2, "backend": "pyav"}, 120, id="pyav-fps"),
+        pytest.param(
+            "opencv",
+            {"num_frames": 500, "fps": 2, "backend": "pyav"},
+            120,
+            id="pyav-num_frames_wins_fps",
+        ),
+        # dynamic sampling + pyav codec
+        pytest.param(
+            "opencv_dynamic",
+            {"fps": 1, "max_duration": 60, "backend": "pyav"},
+            60,
+            id="pyav_dynamic-within_max_duration",
+        ),
+        pytest.param(
+            "opencv_dynamic",
+            {"fps": 2, "max_duration": 30, "backend": "pyav"},
+            60,
+            id="pyav_dynamic-exceeds_max_duration",
+        ),
     ],
 )
 def test_video_loader_frames_sampling(
     dummy_video_path,
     monkeypatch: pytest.MonkeyPatch,
-    backend: str,
+    loader_key: str,
     kwargs: dict,
     expected_num_frames: int,
 ):
     """Test video loader frames sampling functionality."""
-    monkeypatch.setenv("VLLM_VIDEO_LOADER_BACKEND", backend)
-    loader = VIDEO_LOADER_REGISTRY.load(backend)
+    monkeypatch.setenv("VLLM_VIDEO_LOADER_BACKEND", loader_key)
+    loader = VIDEO_LOADER_REGISTRY.load(loader_key)
 
     with open(dummy_video_path, "rb") as f:
         long_video_bytes = f.read()

diff --git a/vllm/envs.py b/vllm/envs.py
@@ -829,9 +829,10 @@ def _get_or_set_default() -> str:
     "VLLM_MAX_AUDIO_CLIP_FILESIZE_MB": lambda: int(
         os.getenv("VLLM_MAX_AUDIO_CLIP_FILESIZE_MB", "25")
     ),
-    # Backend for Video IO
-    # - "opencv": Default backend that uses OpenCV stream buffered backend.
-    # - "identity": Returns raw video bytes for model processor to handle.
+    # Backend for Video IO — selects the frame-sampling algorithm.
+    # - "opencv": uniform sampling.
+    # - "opencv_dynamic": duration-aware dynamic sampling.
+    # - "identity": returns raw video bytes for model processor to handle.
     #
     # Custom backend implementations can be registered
     # via `@VIDEO_LOADER_REGISTRY.register("my_custom_video_loader")` and