From 56d8dee26a4f094cfa2865a66b1f172b44736ac8 Mon Sep 17 00:00:00 2001 From: chengyufang Date: Wed, 18 Mar 2026 22:00:35 +0800 Subject: [PATCH 1/2] [Bugfix] Fix misuse of the merge_size parameter in _get_video_second_idx and _calculate_timestamps Signed-off-by: chengyufang --- vllm/model_executor/models/qwen3_vl.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index bf02df7b4968..f7865fa790ea 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -745,17 +745,17 @@ def get_max_video_tokens( return num_video_soft_tokens def _calculate_timestamps( - self, indices: list[int] | torch.Tensor, video_fps: float, merge_size: int + self, indices: list[int] | torch.Tensor, video_fps: float, temporal_patch_size: int ): if not isinstance(indices, list): indices = indices.tolist() - if len(indices) % merge_size != 0: + if len(indices) % temporal_patch_size != 0: # don't update metadata's frames_indices directly - indices = indices + [indices[-1]] * (merge_size - len(indices) % merge_size) + indices = indices + [indices[-1]] * (temporal_patch_size - len(indices) % temporal_patch_size) timestamps = [idx / video_fps for idx in indices] timestamps = [ - (timestamps[i] + timestamps[i + merge_size - 1]) / 2 - for i in range(0, len(timestamps), merge_size) + (timestamps[i] + timestamps[i + temporal_patch_size - 1]) / 2 + for i in range(0, len(timestamps), temporal_patch_size) ] return timestamps @@ -767,7 +767,7 @@ def _get_video_second_idx( sampled_num_frames: int | None = None, ) -> list[int]: video_processor = self.get_video_processor() - merge_size = video_processor.merge_size + temporal_patch_size = video_processor.temporal_patch_size indices = metadata["frames_indices"] # metadata["fps"] refers to the true fps of the input video. @@ -806,7 +806,7 @@ def _get_video_second_idx( .astype(int) .tolist() ) - timestamps = self._calculate_timestamps(indices, video_fps, merge_size) + timestamps = self._calculate_timestamps(indices, video_fps, temporal_patch_size) return timestamps From 187c310c8dfd1beb5f16f86b0fc5143610fadb83 Mon Sep 17 00:00:00 2001 From: chengyufang Date: Wed, 18 Mar 2026 22:39:33 +0800 Subject: [PATCH 2/2] Modify the parameter names to comply with the Transformers implementation specification. Signed-off-by: chengyufang --- vllm/model_executor/models/qwen3_vl.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index f7865fa790ea..4dd5b0631851 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -745,17 +745,17 @@ def get_max_video_tokens( return num_video_soft_tokens def _calculate_timestamps( - self, indices: list[int] | torch.Tensor, video_fps: float, temporal_patch_size: int + self, indices: list[int] | torch.Tensor, video_fps: float, merge_size: int ): if not isinstance(indices, list): indices = indices.tolist() - if len(indices) % temporal_patch_size != 0: + if len(indices) % merge_size != 0: # don't update metadata's frames_indices directly - indices = indices + [indices[-1]] * (temporal_patch_size - len(indices) % temporal_patch_size) + indices = indices + [indices[-1]] * (merge_size - len(indices) % merge_size) timestamps = [idx / video_fps for idx in indices] timestamps = [ - (timestamps[i] + timestamps[i + temporal_patch_size - 1]) / 2 - for i in range(0, len(timestamps), temporal_patch_size) + (timestamps[i] + timestamps[i + merge_size - 1]) / 2 + for i in range(0, len(timestamps), merge_size) ] return timestamps