From dc068bc99a6c4baccfe49d1adebd17bb5578737a Mon Sep 17 00:00:00 2001 From: hujiaxin <524446785@qq.com> Date: Mon, 9 Feb 2026 20:18:02 +0800 Subject: [PATCH 1/9] openpangu-vl support video input Signed-off-by: hujiaxin <524446785@qq.com> --- vllm/multimodal/video.py | 83 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index f123799ca901..03d966090323 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math +import warnings from abc import abstractmethod from io import BytesIO from typing import TYPE_CHECKING, Any, cast @@ -747,3 +748,85 @@ def load_bytes( **kwargs, ) return out + + +@VIDEO_LOADER_REGISTRY.register("opencv_dynamic_openpangu") +class OpenCVDynamicOpenPanguVideoBackend(OpenCVVideoBackend): + @classmethod + def load_bytes( + cls, + data: bytes, + num_frames: int = 32, + fps: int = 1, + **kwargs, + ) -> tuple[npt.NDArray, dict[str, Any]]: + import cv2 + + backend = cls().get_cv2_video_api() + cap = cv2.VideoCapture(BytesIO(data), backend, []) + if not cap.isOpened(): + raise ValueError("Could not open video stream") + + total_frames_num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + original_fps = float(cap.get(cv2.CAP_PROP_FPS)) + # The timestamp of the rightmost frame, cannot be used to calculate frame 0. + total_duration = (total_frames_num - 1) / original_fps + + # `fps` is the FPS parameter passed in for sampling, + # -1 indicates that sampling can be performed directly without FPS limitation. + if fps > 0: + # Num_frames is the maximum number of frames to sample. + # If fewer frames are sampled at this sample_fps, the update duration will be longer. + if num_frames >= int(total_duration * fps) + 1: + num_frames = int(total_duration * fps) + 1 + # Under the new maximum frame rate, the video duration of the rightmost frame, + # cannot be calculated for frame 0. + total_duration = min(total_duration, (num_frames - 1) / fps) + elif fps != -1: + raise ValueError(f"requires dataset fps is -1 or greater than 0 but got {fps}") + + sample_frame_timestamps = np.linspace(0, total_duration, num_frames, dtype=float) + frames_indices = [min(total_frames_num - 1, round(t * original_fps)) for t in sample_frame_timestamps] + + width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + frames = np.empty((len(frames_indices), height, width, 3), dtype=np.uint8) + + i = 0 + for frame_idx in frames_indices: + cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx) + ret, frame = cap.read() + if ret: + frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + i += 1 + else: + # when get a bad frame,continuous finding a next good frame + next_idx = frame_idx + 1 + while next_idx < total_frames_num: + ret, next_frame = cap.read() + if ret: + frames[i] = cv2.cvtColor(next_frame, cv2.COLOR_BGR2RGB) + i += 1 + break + next_idx += 1 + + if i != len(frames_indices): + warnings.warn( + f"Expected reading {len(frames_indices)} frames," + f"but only loaded {i} frames from video.", + UserWarning, + stacklevel=2 + ) + + # Use transformers transformers.video_utils.VideoMetadata format + metadata = { + "total_num_frames": total_frames_num, + "fps": original_fps, + "duration": total_duration, + "video_backend": "opencv_dynamic_openpangu", + "frames_indices": frames_indices, + "do_sample_frames": False, + "sample_frame_timestamps": sample_frame_timestamps, + } + return frames, metadata + From 5cc32bd0a42095beab9d173d3678d52b316aa4db Mon Sep 17 00:00:00 2001 From: Emilie1001 <79921183+Emilie1001@users.noreply.github.com> Date: Tue, 10 Feb 2026 09:53:38 +0800 Subject: [PATCH 2/9] check code check code Signed-off-by: Emilie1001 <79921183+Emilie1001@users.noreply.github.com> --- vllm/multimodal/video.py | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 03d966090323..6712f85cb363 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -760,6 +760,21 @@ def load_bytes( fps: int = 1, **kwargs, ) -> tuple[npt.NDArray, dict[str, Any]]: + """ + Load video frames with dynamic sampling based on duration. + Assume that total_num_frames = 10 and fps = 1. + The timestamp of frame 0 is 0.0. + The timestamp of frame 1 is 1.0.… + The timestamp of frame 9 (the last frame) should be 9.0, that is, (total_frames_num – 1) / original_fps. + + Args: + data: Raw video bytes + num_frames: Not used in dynamic backend + fps: Target FPS for sampling (default: 2) + + Returns: + Tuple of (frames_array, metadata_dict) + """ import cv2 backend = cls().get_cv2_video_api() @@ -770,7 +785,10 @@ def load_bytes( total_frames_num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) original_fps = float(cap.get(cv2.CAP_PROP_FPS)) # The timestamp of the rightmost frame, cannot be used to calculate frame 0. - total_duration = (total_frames_num - 1) / original_fps + if total_frames_num >= 1 and original_fps > 0: + total_duration = (total_frames_num - 1) / original_fps + else: + total_duration = 0 # `fps` is the FPS parameter passed in for sampling, # -1 indicates that sampling can be performed directly without FPS limitation. @@ -783,10 +801,17 @@ def load_bytes( # cannot be calculated for frame 0. total_duration = min(total_duration, (num_frames - 1) / fps) elif fps != -1: - raise ValueError(f"requires dataset fps is -1 or greater than 0 but got {fps}") + raise ValueError( + f"requires dataset fps is -1 or greater than 0 but got {fps}" + ) - sample_frame_timestamps = np.linspace(0, total_duration, num_frames, dtype=float) - frames_indices = [min(total_frames_num - 1, round(t * original_fps)) for t in sample_frame_timestamps] + sample_frame_timestamps = np.linspace( + 0, total_duration, num_frames, dtype=float + ) + frames_indices = [ + min(total_frames_num - 1, round(t * original_fps)) + for t in sample_frame_timestamps + ] width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) @@ -815,7 +840,7 @@ def load_bytes( f"Expected reading {len(frames_indices)} frames," f"but only loaded {i} frames from video.", UserWarning, - stacklevel=2 + stacklevel=2, ) # Use transformers transformers.video_utils.VideoMetadata format @@ -829,4 +854,3 @@ def load_bytes( "sample_frame_timestamps": sample_frame_timestamps, } return frames, metadata - From 994b97bd81485dae280b2a89aeae6393eb10e27a Mon Sep 17 00:00:00 2001 From: Emilie1001 <79921183+Emilie1001@users.noreply.github.com> Date: Tue, 10 Feb 2026 10:53:45 +0800 Subject: [PATCH 3/9] Update video.py Signed-off-by: Emilie1001 <79921183+Emilie1001@users.noreply.github.com> --- vllm/multimodal/video.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 6712f85cb363..54bfc9e67159 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -758,6 +758,8 @@ def load_bytes( data: bytes, num_frames: int = 32, fps: int = 1, + max_duration: int = 300, + frame_recovery: bool = False, **kwargs, ) -> tuple[npt.NDArray, dict[str, Any]]: """ From 8b37e17735f642a88c1fb0dc142b11af6123eb03 Mon Sep 17 00:00:00 2001 From: Emilie1001 <79921183+Emilie1001@users.noreply.github.com> Date: Tue, 10 Feb 2026 11:15:00 +0800 Subject: [PATCH 4/9] Update video.py Signed-off-by: Emilie1001 <79921183+Emilie1001@users.noreply.github.com> --- vllm/multimodal/video.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 54bfc9e67159..2cf0cdf34a99 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -767,7 +767,7 @@ def load_bytes( Assume that total_num_frames = 10 and fps = 1. The timestamp of frame 0 is 0.0. The timestamp of frame 1 is 1.0.… - The timestamp of frame 9 (the last frame) should be 9.0, that is, (total_frames_num – 1) / original_fps. + The timestamp of frame 9 (the last frame) should be 9.0, that is, (total_frames_num – 1) / original_fps. # noqa: E501 Args: data: Raw video bytes @@ -796,10 +796,10 @@ def load_bytes( # -1 indicates that sampling can be performed directly without FPS limitation. if fps > 0: # Num_frames is the maximum number of frames to sample. - # If fewer frames are sampled at this sample_fps, the update duration will be longer. + # If fewer frames are sampled at this sample_fps, the update duration will be longer. # noqa: E501 if num_frames >= int(total_duration * fps) + 1: num_frames = int(total_duration * fps) + 1 - # Under the new maximum frame rate, the video duration of the rightmost frame, + # Under the new maximum frame rate, the video duration of the rightmost frame, # noqa: E501 # cannot be calculated for frame 0. total_duration = min(total_duration, (num_frames - 1) / fps) elif fps != -1: From 0dfdda0ce813861bd2ae62f3884d6ac7e3d9f588 Mon Sep 17 00:00:00 2001 From: Emilie1001 <79921183+Emilie1001@users.noreply.github.com> Date: Tue, 10 Feb 2026 11:21:08 +0800 Subject: [PATCH 5/9] Update video.py Signed-off-by: Emilie1001 <79921183+Emilie1001@users.noreply.github.com> --- vllm/multimodal/video.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 2cf0cdf34a99..c592e8c16bce 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -767,7 +767,8 @@ def load_bytes( Assume that total_num_frames = 10 and fps = 1. The timestamp of frame 0 is 0.0. The timestamp of frame 1 is 1.0.… - The timestamp of frame 9 (the last frame) should be 9.0, that is, (total_frames_num – 1) / original_fps. # noqa: E501 + The timestamp of frame 9 (the last frame) should be 9.0, that is, + (total_frames_num – 1) / original_fps. Args: data: Raw video bytes From c5fa900a0c636ad76ab2fb73cdb75bb388c4e1a9 Mon Sep 17 00:00:00 2001 From: Emilie1001 <79921183+Emilie1001@users.noreply.github.com> Date: Thu, 12 Feb 2026 09:10:59 +0800 Subject: [PATCH 6/9] Update video.py Signed-off-by: Emilie1001 <79921183+Emilie1001@users.noreply.github.com> --- vllm/multimodal/video.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index c592e8c16bce..d61925e4f153 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -854,6 +854,5 @@ def load_bytes( "video_backend": "opencv_dynamic_openpangu", "frames_indices": frames_indices, "do_sample_frames": False, - "sample_frame_timestamps": sample_frame_timestamps, } return frames, metadata From 3623ece333bf720b6a7794858106d83c1b7232c1 Mon Sep 17 00:00:00 2001 From: Emilie1001 <79921183+Emilie1001@users.noreply.github.com> Date: Wed, 25 Feb 2026 14:46:38 +0800 Subject: [PATCH 7/9] Update video.py Signed-off-by: Emilie1001 <79921183+Emilie1001@users.noreply.github.com> --- vllm/multimodal/video.py | 41 ++++++++++------------------------------ 1 file changed, 10 insertions(+), 31 deletions(-) diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index d61925e4f153..640851e228b4 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math -import warnings from abc import abstractmethod from io import BytesIO from typing import TYPE_CHECKING, Any, cast @@ -773,7 +772,7 @@ def load_bytes( Args: data: Raw video bytes num_frames: Not used in dynamic backend - fps: Target FPS for sampling (default: 2) + fps: Target FPS for sampling (default: 1) Returns: Tuple of (frames_array, metadata_dict) @@ -816,43 +815,23 @@ def load_bytes( for t in sample_frame_timestamps ] - width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) - height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) - frames = np.empty((len(frames_indices), height, width, 3), dtype=np.uint8) + frames, valid_frame_indices, recovered_map = cls._read_frames_with_recovery( + cap, frames_indices, total_frames_num + ) + valid_num_frames = len(valid_frame_indices) - i = 0 - for frame_idx in frames_indices: - cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx) - ret, frame = cap.read() - if ret: - frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) - i += 1 - else: - # when get a bad frame,continuous finding a next good frame - next_idx = frame_idx + 1 - while next_idx < total_frames_num: - ret, next_frame = cap.read() - if ret: - frames[i] = cv2.cvtColor(next_frame, cv2.COLOR_BGR2RGB) - i += 1 - break - next_idx += 1 - - if i != len(frames_indices): - warnings.warn( - f"Expected reading {len(frames_indices)} frames," - f"but only loaded {i} frames from video.", - UserWarning, - stacklevel=2, + if recovered_map: + logger.info( + "Frame recovery: %d frames recovered using forward scan.", + len(recovered_map), ) - # Use transformers transformers.video_utils.VideoMetadata format metadata = { "total_num_frames": total_frames_num, "fps": original_fps, "duration": total_duration, "video_backend": "opencv_dynamic_openpangu", - "frames_indices": frames_indices, + "frames_indices": valid_frame_indices, "do_sample_frames": False, } return frames, metadata From f277b23c01eb4956ca97a7822d4d73b86e15f436 Mon Sep 17 00:00:00 2001 From: Emilie1001 <79921183+Emilie1001@users.noreply.github.com> Date: Wed, 25 Feb 2026 14:51:51 +0800 Subject: [PATCH 8/9] Update video.py Signed-off-by: Emilie1001 <79921183+Emilie1001@users.noreply.github.com> --- vllm/multimodal/video.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 640851e228b4..6a6cb07b017d 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -818,7 +818,6 @@ def load_bytes( frames, valid_frame_indices, recovered_map = cls._read_frames_with_recovery( cap, frames_indices, total_frames_num ) - valid_num_frames = len(valid_frame_indices) if recovered_map: logger.info( From 6f3124713a6d155cf552bef6687a43a5e348f901 Mon Sep 17 00:00:00 2001 From: Emilie1001 <79921183+Emilie1001@users.noreply.github.com> Date: Wed, 25 Feb 2026 16:36:39 +0800 Subject: [PATCH 9/9] Update video.py Signed-off-by: Emilie1001 <79921183+Emilie1001@users.noreply.github.com> --- vllm/multimodal/video.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 6a6cb07b017d..fb4e19fa6745 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -749,7 +749,7 @@ def load_bytes( return out -@VIDEO_LOADER_REGISTRY.register("opencv_dynamic_openpangu") +@VIDEO_LOADER_REGISTRY.register("openpangu") class OpenCVDynamicOpenPanguVideoBackend(OpenCVVideoBackend): @classmethod def load_bytes(