From 3f092ef78fcc95d56630f3e06db90cff9273b0db Mon Sep 17 00:00:00 2001 From: Ranran Date: Wed, 13 May 2026 21:19:48 -0400 Subject: [PATCH 1/4] fix pyav video Signed-off-by: Ranran --- vllm/multimodal/video.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 5b118af8fc53..37b46a26e0e9 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -390,7 +390,7 @@ def decode_frames( fps: float, duration: float, ) -> tuple[npt.NDArray, list[int]]: - """Decode target frames via per-frame seek + keyframe decode.""" + """Decode target frames via per-frame seek + forward decode to PTS.""" stream = container.streams.video[0] # SLICE parallelizes within a single frame without the # one-frame-per-thread latency penalty of FRAME threading. @@ -402,14 +402,30 @@ def decode_frames( frame_interval = 1.0 / fps if fps > 0 else 0.1 max_ts = max(0.0, duration - frame_interval) if duration > 0 else float("inf") + decoder = None + last_pts = None for idx in frame_indices: ts = min(idx / fps, max_ts) if fps > 0 else 0.0 pts = int(ts / time_base) - container.seek(pts, stream=stream) - frame = next(container.decode(video=0), None) - if frame is not None: - frames_list.append(frame.to_ndarray(format="rgb24")) + # seek() snaps backward to a keyframe; reuse the running decoder + # while targets advance monotonically to avoid re-decoding the + # GOP prefix once per requested frame. + if decoder is None or last_pts is None or pts <= last_pts: + container.seek(pts, stream=stream) + decoder = container.decode(video=0) + chosen = None + for frame in decoder: + if frame.pts is None: + continue + chosen = frame + last_pts = frame.pts + if frame.pts >= pts: + break + if chosen is not None: + frames_list.append(chosen.to_ndarray(format="rgb24")) valid_indices.append(idx) + else: + decoder = None if not frames_list: return np.empty((0,), dtype=np.uint8), valid_indices From 3abdc746c3b604a76ee51bcf76b06f5e661ec3e8 Mon Sep 17 00:00:00 2001 From: Ranran Date: Thu, 14 May 2026 00:21:27 -0400 Subject: [PATCH 2/4] simplify the code Signed-off-by: Ranran --- vllm/multimodal/video.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 37b46a26e0e9..697156a5b4dc 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -415,11 +415,9 @@ def decode_frames( decoder = container.decode(video=0) chosen = None for frame in decoder: - if frame.pts is None: - continue - chosen = frame - last_pts = frame.pts - if frame.pts >= pts: + if frame.pts is not None and frame.pts >= pts: + chosen = frame + last_pts = frame.pts break if chosen is not None: frames_list.append(chosen.to_ndarray(format="rgb24")) From 2257321e437e69ae2b7dbbbd542e964890fc29f1 Mon Sep 17 00:00:00 2001 From: Ranran Date: Thu, 14 May 2026 00:39:48 -0400 Subject: [PATCH 3/4] add regression test Signed-off-by: Ranran --- tests/multimodal/test_video.py | 78 ++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/tests/multimodal/test_video.py b/tests/multimodal/test_video.py index e82883ece338..e4cf3736fde8 100644 --- a/tests/multimodal/test_video.py +++ b/tests/multimodal/test_video.py @@ -1,8 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import io from pathlib import Path +import av import numpy as np import numpy.typing as npt import pytest @@ -364,6 +366,82 @@ def test_pyav_dynamic_backend_loads_frames( assert metadata["video_backend"] == "pyav_dynamic" +def _synthesize_long_gop_video( + num_frames: int = 50, + fps: int = 30, + width: int = 64, + height: int = 64, +) -> bytes: + """Encode an H.264 clip with one keyframe and green-channel = frame index. + + The marker lets a test recover which frame the decoder actually returned, + independent of any metadata label. + """ + buf = io.BytesIO() + with av.open(buf, mode="w", format="mp4") as container: + stream = container.add_stream("h264", rate=fps) + stream.width = width + stream.height = height + stream.pix_fmt = "yuv420p" + stream.codec_context.gop_size = num_frames + stream.codec_context.max_b_frames = 0 + stream.codec_context.options = { + "x264-params": (f"scenecut=0:keyint={num_frames}:min-keyint={num_frames}") + } + for i in range(num_frames): + img = np.zeros((height, width, 3), dtype=np.uint8) + img[:, :, 1] = i + frame = av.VideoFrame.from_ndarray(img, format="rgb24") + for packet in stream.encode(frame): + container.mux(packet) + for packet in stream.encode(): + container.mux(packet) + return buf.getvalue() + + +def test_pyav_backend_returns_target_frames_not_keyframes(): + """Regression test: PyAV must decode forward past the seek keyframe. + + container.seek() snaps backward to the nearest keyframe. With a long GOP + (here: one keyframe at frame 0), a decoder that does not advance forward + to the target PTS collapses every sampled slot onto the keyframe. This + test encodes a per-frame marker on the green channel and verifies the + returned frames are distinct, ordered, and match the requested indices. + """ + num_frames = 50 + num_sampled = 4 + height, width = 64, 64 + + video_bytes = _synthesize_long_gop_video( + num_frames=num_frames, width=width, height=height + ) + + loader = VIDEO_LOADER_REGISTRY.load("opencv") + frames, metadata = loader.load_bytes( + video_bytes, num_frames=num_sampled, backend="pyav" + ) + assert frames.shape == (num_sampled, height, width, 3) + + requested = list(metadata["frames_indices"]) + assert len(requested) == num_sampled + + actual = [int(f[height // 2, width // 2, 1]) for f in frames] + + assert len(set(actual)) == num_sampled, ( + f"PyAV returned only {len(set(actual))} distinct frames for " + f"{num_sampled} requested indices: markers={actual}, " + f"requested={requested}. Keyframe-snap regression." + ) + + assert actual == sorted(actual), f"Returned frames out of order: markers={actual}" + + for marker, want_idx in zip(actual, requested): + assert abs(marker - want_idx) <= 10, ( + f"Frame mismatch: requested index {want_idx}, " + f"got marker {marker} (tolerance ±10)" + ) + + @pytest.mark.parametrize( "loader_key, kwargs, expected_num_frames", [ From 697c7eecbd0d6ac86a8604d33b9507f8fed4e2e6 Mon Sep 17 00:00:00 2001 From: Ranran Date: Thu, 14 May 2026 00:59:46 -0400 Subject: [PATCH 4/4] mv helper to utils and make import lazy Signed-off-by: Ranran --- tests/multimodal/test_video.py | 39 ++-------------------------------- tests/multimodal/utils.py | 37 ++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 37 deletions(-) diff --git a/tests/multimodal/test_video.py b/tests/multimodal/test_video.py index e4cf3736fde8..7c024052a439 100644 --- a/tests/multimodal/test_video.py +++ b/tests/multimodal/test_video.py @@ -1,10 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import io from pathlib import Path -import av import numpy as np import numpy.typing as npt import pytest @@ -15,7 +13,7 @@ VideoLoader, ) -from .utils import create_video_from_image +from .utils import create_long_gop_video, create_video_from_image pytestmark = pytest.mark.cpu_test @@ -366,39 +364,6 @@ def test_pyav_dynamic_backend_loads_frames( assert metadata["video_backend"] == "pyav_dynamic" -def _synthesize_long_gop_video( - num_frames: int = 50, - fps: int = 30, - width: int = 64, - height: int = 64, -) -> bytes: - """Encode an H.264 clip with one keyframe and green-channel = frame index. - - The marker lets a test recover which frame the decoder actually returned, - independent of any metadata label. - """ - buf = io.BytesIO() - with av.open(buf, mode="w", format="mp4") as container: - stream = container.add_stream("h264", rate=fps) - stream.width = width - stream.height = height - stream.pix_fmt = "yuv420p" - stream.codec_context.gop_size = num_frames - stream.codec_context.max_b_frames = 0 - stream.codec_context.options = { - "x264-params": (f"scenecut=0:keyint={num_frames}:min-keyint={num_frames}") - } - for i in range(num_frames): - img = np.zeros((height, width, 3), dtype=np.uint8) - img[:, :, 1] = i - frame = av.VideoFrame.from_ndarray(img, format="rgb24") - for packet in stream.encode(frame): - container.mux(packet) - for packet in stream.encode(): - container.mux(packet) - return buf.getvalue() - - def test_pyav_backend_returns_target_frames_not_keyframes(): """Regression test: PyAV must decode forward past the seek keyframe. @@ -412,7 +377,7 @@ def test_pyav_backend_returns_target_frames_not_keyframes(): num_sampled = 4 height, width = 64, 64 - video_bytes = _synthesize_long_gop_video( + video_bytes = create_long_gop_video( num_frames=num_frames, width=width, height=height ) diff --git a/tests/multimodal/utils.py b/tests/multimodal/utils.py index 485bde939f69..32f3ec0e4233 100644 --- a/tests/multimodal/utils.py +++ b/tests/multimodal/utils.py @@ -66,6 +66,43 @@ def create_video_from_image( return video_path +def create_long_gop_video( + num_frames: int = 50, + fps: int = 30, + width: int = 64, + height: int = 64, +) -> bytes: + """Encode an H.264 clip with one keyframe and green-channel = frame index. + + The marker lets a test recover which frame the decoder actually returned, + independent of any metadata label. + """ + import io + + import av + + buf = io.BytesIO() + with av.open(buf, mode="w", format="mp4") as container: + stream = container.add_stream("h264", rate=fps) + stream.width = width + stream.height = height + stream.pix_fmt = "yuv420p" + stream.codec_context.gop_size = num_frames + stream.codec_context.max_b_frames = 0 + stream.codec_context.options = { + "x264-params": (f"scenecut=0:keyint={num_frames}:min-keyint={num_frames}") + } + for i in range(num_frames): + img = np.zeros((height, width, 3), dtype=np.uint8) + img[:, :, 1] = i + frame = av.VideoFrame.from_ndarray(img, format="rgb24") + for packet in stream.encode(frame): + container.mux(packet) + for packet in stream.encode(): + container.mux(packet) + return buf.getvalue() + + def cosine_similarity(A: npt.NDArray, B: npt.NDArray, axis: int = -1) -> npt.NDArray: """Compute cosine similarity between two vectors.""" return np.sum(A * B, axis=axis) / (