Skip to content
Merged
12 changes: 8 additions & 4 deletions tests/models/multimodal/processing/test_glm4_1v.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from vllm.assets.video import VideoAsset
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import batched_tensors_equal
from vllm.multimodal.video import OpenCVDynamicVideoBackend, OpenCVVideoBackend
from vllm.multimodal.video import DynamicVideoBackend, VideoBackend

from ...utils import build_model_context

Expand Down Expand Up @@ -70,9 +70,11 @@ def test_processor_override(

@pytest.mark.parametrize("model_id", ["zai-org/GLM-4.1V-9B-Thinking"])
@pytest.mark.parametrize("fps", [2])
@pytest.mark.parametrize("backend", ["opencv", "pyav"])
def test_video_loader_consistency(
model_id: str,
fps: int,
backend: str,
):
"""
Ensure dynamic video loader (pre-sampled by loader) and normal video
Expand All @@ -93,9 +95,11 @@ def test_video_loader_consistency(
with open(video_path, "rb") as f:
video_bytes = f.read()

static_video, static_metadata = OpenCVVideoBackend.load_bytes(video_bytes)
dynamic_video, dynamic_metadata = OpenCVDynamicVideoBackend.load_bytes(
video_bytes, fps=fps
static_video, static_metadata = VideoBackend.load_bytes(
video_bytes, backend=backend
)
dynamic_video, dynamic_metadata = DynamicVideoBackend.load_bytes(
video_bytes, fps=fps, backend=backend
)

# pre-sampled loader shouldn't read all frames
Expand Down
121 changes: 104 additions & 17 deletions tests/multimodal/test_video.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,9 @@ def test_video_backend_handles_broken_frames(monkeypatch: pytest.MonkeyPatch):
video_data = f.read()

loader = VIDEO_LOADER_REGISTRY.load("opencv")
frames, metadata = loader.load_bytes(video_data, num_frames=-1)
frames, metadata = loader.load_bytes(
video_data, num_frames=-1, backend="opencv"
)

# Verify metadata consistency:
# frames_indices must match actual loaded frames
Expand Down Expand Up @@ -158,12 +160,12 @@ def release(self):

# Test WITHOUT recovery - should have fewer frames due to failures
frames_no_recovery, meta_no = loader.load_bytes(
video_data, num_frames=8, frame_recovery=False
video_data, num_frames=8, frame_recovery=False, backend="opencv"
)

# Test WITH recovery - should recover using next valid frames
frames_with_recovery, meta_yes = loader.load_bytes(
video_data, num_frames=8, frame_recovery=True
video_data, num_frames=8, frame_recovery=True, backend="opencv"
)

# With recovery should have MORE frames than without
Expand Down Expand Up @@ -214,12 +216,12 @@ def test_video_recovery_with_corrupted_file(monkeypatch: pytest.MonkeyPatch):

# Test without recovery - frame 17 will be skipped
frames_no_recovery, meta_no_recovery = loader.load_bytes(
video_data, num_frames=8, frame_recovery=False
video_data, num_frames=8, frame_recovery=False, backend="opencv"
)

# Test with recovery - frame 18 should fill in for frame 17
frames_with_recovery, meta_with_recovery = loader.load_bytes(
video_data, num_frames=8, frame_recovery=True
video_data, num_frames=8, frame_recovery=True, backend="opencv"
)

# Verify metadata consistency for both modes
Expand Down Expand Up @@ -271,12 +273,16 @@ def test_video_recovery_dynamic_backend(monkeypatch: pytest.MonkeyPatch):

# Test without recovery
frames_no_recovery, meta_no = loader.load_bytes(
video_data, fps=2, max_duration=10, frame_recovery=False
video_data,
fps=2,
max_duration=10,
frame_recovery=False,
backend="opencv",
)

# Test with frame_recovery enabled
frames_with_recovery, meta_with = loader.load_bytes(
video_data, fps=2, max_duration=10, frame_recovery=True
video_data, fps=2, max_duration=10, frame_recovery=True, backend="opencv"
)

# Verify basic properties
Expand Down Expand Up @@ -310,27 +316,81 @@ def dummy_video_path(tmp_path):
return video_path


# ============================================================================
# PyAV Backend Tests
# ============================================================================


def test_pyav_backend_loads_frames(dummy_video_path, monkeypatch: pytest.MonkeyPatch):
    """Verify that a valid video can be decoded through the pyav codec backend.

    The loader is fetched under the "opencv" registry key and the codec is
    chosen per call via ``backend="pyav"``.
    """
    with monkeypatch.context() as patched_env:
        patched_env.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv")

        with open(dummy_video_path, "rb") as video_file:
            raw_bytes = video_file.read()

        uniform_loader = VIDEO_LOADER_REGISTRY.load("opencv")
        clip, info = uniform_loader.load_bytes(
            raw_bytes,
            num_frames=8,
            backend="pyav",
        )

        # The decoded clip must be a 4-D array with three channels last.
        assert clip.ndim == 4
        assert clip.shape[3] == 3  # RGB

        # Exactly the requested number of frames, mirrored in the metadata.
        frame_count = clip.shape[0]
        assert frame_count == 8
        assert frame_count == len(info["frames_indices"])

        # The metadata must name the codec and carry the basic video stats.
        assert info["video_backend"] == "pyav"
        for required_key in ("total_num_frames", "fps", "duration"):
            assert required_key in info


def test_pyav_dynamic_backend_loads_frames(
    dummy_video_path, monkeypatch: pytest.MonkeyPatch
):
    """Verify that dynamic sampling can decode frames through the pyav codec.

    The loader is fetched under the "opencv_dynamic" registry key and the
    codec is chosen per call via ``backend="pyav"``.
    """
    with monkeypatch.context() as patched_env:
        patched_env.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv_dynamic")

        with open(dummy_video_path, "rb") as video_file:
            raw_bytes = video_file.read()

        dynamic_loader = VIDEO_LOADER_REGISTRY.load("opencv_dynamic")
        sampling_kwargs = {"fps": 2, "max_duration": 10, "backend": "pyav"}
        clip, info = dynamic_loader.load_bytes(raw_bytes, **sampling_kwargs)

        # The decoded clip must be a 4-D array with three channels last.
        assert clip.ndim == 4
        assert clip.shape[3] == 3  # RGB

        # At least one frame is sampled, and the metadata agrees on the count.
        frame_count = clip.shape[0]
        assert frame_count > 0
        assert frame_count == len(info["frames_indices"])

        assert info["video_backend"] == "pyav_dynamic"


@pytest.mark.parametrize(
"backend, kwargs, expected_num_frames",
"loader_key, kwargs, expected_num_frames",
[
# opencv: num_frames directly controls count
pytest.param("opencv", {"num_frames": 32}, 32, id="opencv-num_frames"),
pytest.param("opencv", {"fps": 2}, 120, id="opencv-fps"),
# uniform sampling + opencv codec
pytest.param(
"opencv",
{"num_frames": 32, "backend": "opencv"},
32,
id="opencv-num_frames",
),
pytest.param("opencv", {"fps": 2, "backend": "opencv"}, 120, id="opencv-fps"),
pytest.param(
"opencv",
{"num_frames": 500, "fps": 2},
{"num_frames": 500, "fps": 2, "backend": "opencv"},
120,
id="opencv-num_frames_wins_fps",
),
# dynamic sampling + opencv codec
pytest.param(
"opencv_dynamic",
{"fps": 1, "max_duration": 60},
{"fps": 1, "max_duration": 60, "backend": "opencv"},
60,
id="opencv_dynamic-within_max_duration",
),
pytest.param(
"opencv_dynamic",
{"fps": 2, "max_duration": 30},
{"fps": 2, "max_duration": 30, "backend": "opencv"},
60,
id="opencv_dynamic-exceeds_max_duration",
),
Expand All @@ -349,18 +409,45 @@ def dummy_video_path(tmp_path):
119,
id="molmo2-fps",
),
# uniform sampling + pyav codec (same frame counts as opencv)
pytest.param(
"opencv",
{"num_frames": 32, "backend": "pyav"},
32,
id="pyav-num_frames",
),
pytest.param("opencv", {"fps": 2, "backend": "pyav"}, 120, id="pyav-fps"),
pytest.param(
"opencv",
{"num_frames": 500, "fps": 2, "backend": "pyav"},
120,
id="pyav-num_frames_wins_fps",
),
# dynamic sampling + pyav codec
pytest.param(
"opencv_dynamic",
{"fps": 1, "max_duration": 60, "backend": "pyav"},
60,
id="pyav_dynamic-within_max_duration",
),
pytest.param(
"opencv_dynamic",
{"fps": 2, "max_duration": 30, "backend": "pyav"},
60,
id="pyav_dynamic-exceeds_max_duration",
),
],
)
def test_video_loader_frames_sampling(
dummy_video_path,
monkeypatch: pytest.MonkeyPatch,
backend: str,
loader_key: str,
kwargs: dict,
expected_num_frames: int,
):
"""Test video loader frames sampling functionality."""
monkeypatch.setenv("VLLM_VIDEO_LOADER_BACKEND", backend)
loader = VIDEO_LOADER_REGISTRY.load(backend)
monkeypatch.setenv("VLLM_VIDEO_LOADER_BACKEND", loader_key)
loader = VIDEO_LOADER_REGISTRY.load(loader_key)

with open(dummy_video_path, "rb") as f:
long_video_bytes = f.read()
Expand Down
7 changes: 4 additions & 3 deletions vllm/envs.py
Original file line number Diff line number Diff line change
Expand Up @@ -829,9 +829,10 @@ def _get_or_set_default() -> str:
"VLLM_MAX_AUDIO_CLIP_FILESIZE_MB": lambda: int(
os.getenv("VLLM_MAX_AUDIO_CLIP_FILESIZE_MB", "25")
),
# Backend for Video IO
# - "opencv": Default backend that uses OpenCV stream buffered backend.
# - "identity": Returns raw video bytes for model processor to handle.
# Backend for Video IO — selects the frame-sampling algorithm.
# - "opencv": uniform sampling.
# - "opencv_dynamic": duration-aware dynamic sampling.
# - "identity": returns raw video bytes for model processor to handle.
#
# Custom backend implementations can be registered
# via `@VIDEO_LOADER_REGISTRY.register("my_custom_video_loader")` and
Expand Down
Loading
Loading