From dc068bc99a6c4baccfe49d1adebd17bb5578737a Mon Sep 17 00:00:00 2001
From: hujiaxin <524446785@qq.com>
Date: Mon, 9 Feb 2026 20:18:02 +0800
Subject: [PATCH 1/9] openpangu-vl support video input

Signed-off-by: hujiaxin <524446785@qq.com>
---
 vllm/multimodal/video.py | 83 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 83 insertions(+)

diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index f123799ca901..03d966090323 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import math
+import warnings
 from abc import abstractmethod
 from io import BytesIO
 from typing import TYPE_CHECKING, Any, cast
@@ -747,3 +748,85 @@ def load_bytes(
             **kwargs,
         )
         return out
+
+
+@VIDEO_LOADER_REGISTRY.register("opencv_dynamic_openpangu")
+class OpenCVDynamicOpenPanguVideoBackend(OpenCVVideoBackend):
+    @classmethod
+    def load_bytes(
+        cls,
+        data: bytes,
+        num_frames: int = 32,
+        fps: int = 1,
+        **kwargs,
+    ) -> tuple[npt.NDArray, dict[str, Any]]:
+        import cv2
+
+        backend = cls().get_cv2_video_api()
+        cap = cv2.VideoCapture(BytesIO(data), backend, [])
+        if not cap.isOpened():
+            raise ValueError("Could not open video stream")
+
+        total_frames_num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        original_fps = float(cap.get(cv2.CAP_PROP_FPS))
+        # The timestamp of the rightmost frame, cannot be used to calculate frame 0.
+        total_duration = (total_frames_num - 1) / original_fps
+
+        # `fps` is the FPS parameter passed in for sampling,
+        # -1 indicates that sampling can be performed directly without FPS limitation.
+        if fps > 0:
+            # Num_frames is the maximum number of frames to sample.
+            # If fewer frames are sampled at this sample_fps, the update duration will be longer.
+            if num_frames >= int(total_duration * fps) + 1:
+                num_frames = int(total_duration * fps) + 1
+                # Under the new maximum frame rate, the video duration of the rightmost frame,
+                # cannot be calculated for frame 0.
+                total_duration = min(total_duration, (num_frames - 1) / fps)
+        elif fps != -1:
+            raise ValueError(f"requires dataset fps is -1 or greater than 0 but got {fps}")
+
+        sample_frame_timestamps = np.linspace(0, total_duration, num_frames, dtype=float)
+        frames_indices = [min(total_frames_num - 1, round(t * original_fps)) for t in sample_frame_timestamps]
+
+        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        frames = np.empty((len(frames_indices), height, width, 3), dtype=np.uint8)
+
+        i = 0
+        for frame_idx in frames_indices:
+            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
+            ret, frame = cap.read()
+            if ret:
+                frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                i += 1
+            else:
+                # when get a bad frame,continuous finding a next good frame
+                next_idx = frame_idx + 1
+                while next_idx < total_frames_num:
+                    ret, next_frame = cap.read()
+                    if ret:
+                        frames[i] = cv2.cvtColor(next_frame, cv2.COLOR_BGR2RGB)
+                        i += 1
+                        break
+                    next_idx += 1
+
+        if i != len(frames_indices):
+            warnings.warn(
+                f"Expected reading {len(frames_indices)} frames,"
+                f"but only loaded {i} frames from video.",
+                UserWarning,
+                stacklevel=2
+            )
+
+        # Use transformers transformers.video_utils.VideoMetadata format
+        metadata = {
+            "total_num_frames": total_frames_num,
+            "fps": original_fps,
+            "duration": total_duration,
+            "video_backend": "opencv_dynamic_openpangu",
+            "frames_indices": frames_indices,
+            "do_sample_frames": False,
+            "sample_frame_timestamps": sample_frame_timestamps,
+        }
+        return frames, metadata
+        

From 5cc32bd0a42095beab9d173d3678d52b316aa4db Mon Sep 17 00:00:00 2001
From: Emilie1001 <79921183+Emilie1001@users.noreply.github.com>
Date: Tue, 10 Feb 2026 09:53:38 +0800
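Subject: [PATCH 2/9] Add load_bytes docstring, guard zero fps/frame count, reformat

Document OpenCVDynamicOpenPanguVideoBackend.load_bytes, fall back to a
total duration of 0 when the frame count or the source FPS is not
positive, and wrap over-long lines.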

Signed-off-by: Emilie1001 <79921183+Emilie1001@users.noreply.github.com>
---
 vllm/multimodal/video.py | 36 ++++++++++++++++++++++++++++++------
 1 file changed, 30 insertions(+), 6 deletions(-)

diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index 03d966090323..6712f85cb363 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -760,6 +760,21 @@ def load_bytes(
         fps: int = 1,
         **kwargs,
     ) -> tuple[npt.NDArray, dict[str, Any]]:
+        """
+        Load video frames with dynamic sampling based on duration.
+        Assume that total_num_frames = 10 and fps = 1.
+        The timestamp of frame 0 is 0.0.
+        The timestamp of frame 1 is 1.0.…
+        The timestamp of frame 9 (the last frame) should be 9.0, that is, (total_frames_num – 1) / original_fps.
+
+        Args:
+            data: Raw video bytes
+            num_frames: Not used in dynamic backend
+            fps: Target FPS for sampling (default: 2)
+
+        Returns:
+            Tuple of (frames_array, metadata_dict)
+        """
         import cv2
 
         backend = cls().get_cv2_video_api()
@@ -770,7 +785,10 @@ def load_bytes(
         total_frames_num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
         original_fps = float(cap.get(cv2.CAP_PROP_FPS))
         # The timestamp of the rightmost frame, cannot be used to calculate frame 0.
-        total_duration = (total_frames_num - 1) / original_fps
+        if total_frames_num >= 1 and original_fps > 0:
+            total_duration = (total_frames_num - 1) / original_fps
+        else:
+            total_duration = 0
 
         # `fps` is the FPS parameter passed in for sampling,
         # -1 indicates that sampling can be performed directly without FPS limitation.
@@ -783,10 +801,17 @@ def load_bytes(
                 # cannot be calculated for frame 0.
                 total_duration = min(total_duration, (num_frames - 1) / fps)
         elif fps != -1:
-            raise ValueError(f"requires dataset fps is -1 or greater than 0 but got {fps}")
+            raise ValueError(
+                f"requires dataset fps is -1 or greater than 0 but got {fps}"
+            )
 
-        sample_frame_timestamps = np.linspace(0, total_duration, num_frames, dtype=float)
-        frames_indices = [min(total_frames_num - 1, round(t * original_fps)) for t in sample_frame_timestamps]
+        sample_frame_timestamps = np.linspace(
+            0, total_duration, num_frames, dtype=float
+        )
+        frames_indices = [
+            min(total_frames_num - 1, round(t * original_fps))
+            for t in sample_frame_timestamps
+        ]
 
         width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
@@ -815,7 +840,7 @@ def load_bytes(
                 f"Expected reading {len(frames_indices)} frames,"
                 f"but only loaded {i} frames from video.",
                 UserWarning,
-                stacklevel=2
+                stacklevel=2,
             )
 
         # Use transformers transformers.video_utils.VideoMetadata format
@@ -829,4 +854,3 @@ def load_bytes(
             "sample_frame_timestamps": sample_frame_timestamps,
         }
         return frames, metadata
-        

From 994b97bd81485dae280b2a89aeae6393eb10e27a Mon Sep 17 00:00:00 2001
From: Emilie1001 <79921183+Emilie1001@users.noreply.github.com>
Date: Tue, 10 Feb 2026 10:53:45 +0800
Subject: [PATCH 3/9] Add max_duration and frame_recovery parameters to load_bytes

Signed-off-by: Emilie1001 <79921183+Emilie1001@users.noreply.github.com>
---
 vllm/multimodal/video.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index 6712f85cb363..54bfc9e67159 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -758,6 +758,8 @@ def load_bytes(
         data: bytes,
         num_frames: int = 32,
         fps: int = 1,
+        max_duration: int = 300,
+        frame_recovery: bool = False,
         **kwargs,
     ) -> tuple[npt.NDArray, dict[str, Any]]:
         """

From 8b37e17735f642a88c1fb0dc142b11af6123eb03 Mon Sep 17 00:00:00 2001
From: Emilie1001 <79921183+Emilie1001@users.noreply.github.com>
Date: Tue, 10 Feb 2026 11:15:00 +0800
Subject: [PATCH 4/9] Add noqa: E501 to over-long lines in video.py

Signed-off-by: Emilie1001 <79921183+Emilie1001@users.noreply.github.com>
---
 vllm/multimodal/video.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index 54bfc9e67159..2cf0cdf34a99 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -767,7 +767,7 @@ def load_bytes(
         Assume that total_num_frames = 10 and fps = 1.
         The timestamp of frame 0 is 0.0.
         The timestamp of frame 1 is 1.0.…
-        The timestamp of frame 9 (the last frame) should be 9.0, that is, (total_frames_num – 1) / original_fps.
+        The timestamp of frame 9 (the last frame) should be 9.0, that is, (total_frames_num – 1) / original_fps. # noqa: E501
 
         Args:
             data: Raw video bytes
@@ -796,10 +796,10 @@ def load_bytes(
         # -1 indicates that sampling can be performed directly without FPS limitation.
         if fps > 0:
             # Num_frames is the maximum number of frames to sample.
-            # If fewer frames are sampled at this sample_fps, the update duration will be longer.
+            # If sampling at `fps` yields fewer frames, cap num_frames to that count.
             if num_frames >= int(total_duration * fps) + 1:
                 num_frames = int(total_duration * fps) + 1
-                # Under the new maximum frame rate, the video duration of the rightmost frame,
+                # Under the new maximum frame rate, the video duration of the rightmost frame, # noqa: E501
                 # cannot be calculated for frame 0.
                 total_duration = min(total_duration, (num_frames - 1) / fps)
         elif fps != -1:

From 0dfdda0ce813861bd2ae62f3884d6ac7e3d9f588 Mon Sep 17 00:00:00 2001
From: Emilie1001 <79921183+Emilie1001@users.noreply.github.com>
Date: Tue, 10 Feb 2026 11:21:08 +0800
Subject: [PATCH 5/9] Wrap long docstring line instead of using noqa: E501

Signed-off-by: Emilie1001 <79921183+Emilie1001@users.noreply.github.com>
---
 vllm/multimodal/video.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index 2cf0cdf34a99..c592e8c16bce 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -767,7 +767,8 @@ def load_bytes(
         Assume that total_num_frames = 10 and fps = 1.
         The timestamp of frame 0 is 0.0.
         The timestamp of frame 1 is 1.0.…
-        The timestamp of frame 9 (the last frame) should be 9.0, that is, (total_frames_num – 1) / original_fps. # noqa: E501
+        The timestamp of frame 9 (the last frame) should be 9.0, that is,
+        (total_frames_num - 1) / original_fps.
 
         Args:
             data: Raw video bytes

From c5fa900a0c636ad76ab2fb73cdb75bb388c4e1a9 Mon Sep 17 00:00:00 2001
From: Emilie1001 <79921183+Emilie1001@users.noreply.github.com>
Date: Thu, 12 Feb 2026 09:10:59 +0800
Subject: [PATCH 6/9] Drop sample_frame_timestamps from openpangu video metadata

Signed-off-by: Emilie1001 <79921183+Emilie1001@users.noreply.github.com>
---
 vllm/multimodal/video.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index c592e8c16bce..d61925e4f153 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -854,6 +854,5 @@ def load_bytes(
             "video_backend": "opencv_dynamic_openpangu",
             "frames_indices": frames_indices,
             "do_sample_frames": False,
-            "sample_frame_timestamps": sample_frame_timestamps,
         }
         return frames, metadata

From 3623ece333bf720b6a7794858106d83c1b7232c1 Mon Sep 17 00:00:00 2001
From: Emilie1001 <79921183+Emilie1001@users.noreply.github.com>
Date: Wed, 25 Feb 2026 14:46:38 +0800
Subject: [PATCH 7/9] Read frames via _read_frames_with_recovery and log recoveries

Signed-off-by: Emilie1001 <79921183+Emilie1001@users.noreply.github.com>
---
 vllm/multimodal/video.py | 41 ++++++++++------------------------------
 1 file changed, 10 insertions(+), 31 deletions(-)

diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index d61925e4f153..640851e228b4 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import math
-import warnings
 from abc import abstractmethod
 from io import BytesIO
 from typing import TYPE_CHECKING, Any, cast
@@ -773,7 +772,7 @@ def load_bytes(
         Args:
             data: Raw video bytes
             num_frames: Not used in dynamic backend
-            fps: Target FPS for sampling (default: 2)
+            fps: Target sampling FPS; -1 disables the FPS limit (default: 1)
 
         Returns:
             Tuple of (frames_array, metadata_dict)
@@ -816,43 +815,23 @@ def load_bytes(
             for t in sample_frame_timestamps
         ]
 
-        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-        frames = np.empty((len(frames_indices), height, width, 3), dtype=np.uint8)
+        frames, valid_frame_indices, recovered_map = cls._read_frames_with_recovery(
+            cap, frames_indices, total_frames_num
+        )
+        valid_num_frames = len(valid_frame_indices)
 
-        i = 0
-        for frame_idx in frames_indices:
-            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
-            ret, frame = cap.read()
-            if ret:
-                frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                i += 1
-            else:
-                # when get a bad frame,continuous finding a next good frame
-                next_idx = frame_idx + 1
-                while next_idx < total_frames_num:
-                    ret, next_frame = cap.read()
-                    if ret:
-                        frames[i] = cv2.cvtColor(next_frame, cv2.COLOR_BGR2RGB)
-                        i += 1
-                        break
-                    next_idx += 1
-
-        if i != len(frames_indices):
-            warnings.warn(
-                f"Expected reading {len(frames_indices)} frames,"
-                f"but only loaded {i} frames from video.",
-                UserWarning,
-                stacklevel=2,
+        if recovered_map:
+            logger.info(
+                "Frame recovery: %d frames recovered using forward scan.",
+                len(recovered_map),
             )
 
-        # Use transformers transformers.video_utils.VideoMetadata format
         metadata = {
             "total_num_frames": total_frames_num,
             "fps": original_fps,
             "duration": total_duration,
             "video_backend": "opencv_dynamic_openpangu",
-            "frames_indices": frames_indices,
+            "frames_indices": valid_frame_indices,
             "do_sample_frames": False,
         }
         return frames, metadata

From f277b23c01eb4956ca97a7822d4d73b86e15f436 Mon Sep 17 00:00:00 2001
From: Emilie1001 <79921183+Emilie1001@users.noreply.github.com>
Date: Wed, 25 Feb 2026 14:51:51 +0800
Subject: [PATCH 8/9] Remove unused valid_num_frames local

Signed-off-by: Emilie1001 <79921183+Emilie1001@users.noreply.github.com>
---
 vllm/multimodal/video.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index 640851e228b4..6a6cb07b017d 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -818,7 +818,6 @@ def load_bytes(
         frames, valid_frame_indices, recovered_map = cls._read_frames_with_recovery(
             cap, frames_indices, total_frames_num
         )
-        valid_num_frames = len(valid_frame_indices)
 
         if recovered_map:
             logger.info(

From 6f3124713a6d155cf552bef6687a43a5e348f901 Mon Sep 17 00:00:00 2001
From: Emilie1001 <79921183+Emilie1001@users.noreply.github.com>
Date: Wed, 25 Feb 2026 16:36:39 +0800
Subject: [PATCH 9/9] Rename video loader registry key to "openpangu"

Signed-off-by: Emilie1001 <79921183+Emilie1001@users.noreply.github.com>
---
 vllm/multimodal/video.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index 6a6cb07b017d..fb4e19fa6745 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -749,7 +749,7 @@ def load_bytes(
         return out
 
 
-@VIDEO_LOADER_REGISTRY.register("opencv_dynamic_openpangu")
+@VIDEO_LOADER_REGISTRY.register("openpangu")
 class OpenCVDynamicOpenPanguVideoBackend(OpenCVVideoBackend):
     @classmethod
     def load_bytes(