From 56d8dee26a4f094cfa2865a66b1f172b44736ac8 Mon Sep 17 00:00:00 2001
From: chengyufang <cnyvfang@outlook.com>
Date: Wed, 18 Mar 2026 22:00:35 +0800
Subject: [PATCH 1/2] [Bugfix] Fix misuse of the merge_size parameter in
 _get_video_second_idx and _calculate_timestamps

Signed-off-by: chengyufang <cnyvfang@outlook.com>
---
 vllm/model_executor/models/qwen3_vl.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index bf02df7b4968..f7865fa790ea 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -745,17 +745,17 @@ def get_max_video_tokens(
         return num_video_soft_tokens
 
     def _calculate_timestamps(
-        self, indices: list[int] | torch.Tensor, video_fps: float, merge_size: int
+        self, indices: list[int] | torch.Tensor, video_fps: float, temporal_patch_size: int
     ):
         if not isinstance(indices, list):
             indices = indices.tolist()
-        if len(indices) % merge_size != 0:
+        if len(indices) % temporal_patch_size != 0:
             # don't update metadata's frames_indices directly
-            indices = indices + [indices[-1]] * (merge_size - len(indices) % merge_size)
+            indices = indices + [indices[-1]] * (temporal_patch_size - len(indices) % temporal_patch_size)
         timestamps = [idx / video_fps for idx in indices]
         timestamps = [
-            (timestamps[i] + timestamps[i + merge_size - 1]) / 2
-            for i in range(0, len(timestamps), merge_size)
+            (timestamps[i] + timestamps[i + temporal_patch_size - 1]) / 2
+            for i in range(0, len(timestamps), temporal_patch_size)
         ]
         return timestamps
 
@@ -767,7 +767,7 @@ def _get_video_second_idx(
         sampled_num_frames: int | None = None,
     ) -> list[int]:
         video_processor = self.get_video_processor()
-        merge_size = video_processor.merge_size
+        temporal_patch_size = video_processor.temporal_patch_size
         indices = metadata["frames_indices"]
 
         # metadata["fps"] refers to the true fps of the input video.
@@ -806,7 +806,7 @@ def _get_video_second_idx(
                 .astype(int)
                 .tolist()
             )
-        timestamps = self._calculate_timestamps(indices, video_fps, merge_size)
+        timestamps = self._calculate_timestamps(indices, video_fps, temporal_patch_size)
         return timestamps
 
 

From 187c310c8dfd1beb5f16f86b0fc5143610fadb83 Mon Sep 17 00:00:00 2001
From: chengyufang <cnyvfang@outlook.com>
Date: Wed, 18 Mar 2026 22:39:33 +0800
Subject: [PATCH 2/2] Rename the _calculate_timestamps parameter back to
 merge_size to comply with the Transformers implementation.

Signed-off-by: chengyufang <cnyvfang@outlook.com>
---
 vllm/model_executor/models/qwen3_vl.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index f7865fa790ea..4dd5b0631851 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -745,17 +745,17 @@ def get_max_video_tokens(
         return num_video_soft_tokens
 
     def _calculate_timestamps(
-        self, indices: list[int] | torch.Tensor, video_fps: float, temporal_patch_size: int
+        self, indices: list[int] | torch.Tensor, video_fps: float, merge_size: int
     ):
         if not isinstance(indices, list):
             indices = indices.tolist()
-        if len(indices) % temporal_patch_size != 0:
+        if len(indices) % merge_size != 0:
             # don't update metadata's frames_indices directly
-            indices = indices + [indices[-1]] * (temporal_patch_size - len(indices) % temporal_patch_size)
+            indices = indices + [indices[-1]] * (merge_size - len(indices) % merge_size)
         timestamps = [idx / video_fps for idx in indices]
         timestamps = [
-            (timestamps[i] + timestamps[i + temporal_patch_size - 1]) / 2
-            for i in range(0, len(timestamps), temporal_patch_size)
+            (timestamps[i] + timestamps[i + merge_size - 1]) / 2
+            for i in range(0, len(timestamps), merge_size)
         ]
         return timestamps