From 019c549584d658fa739302e948a0b13fd823c116 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=94=AF=E5=8B=A4?= Date: Sat, 24 Jan 2026 11:57:35 +0800 Subject: [PATCH 1/3] fix use audio in video bug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 唯勤 --- .../models/qwen3_omni_moe_thinker.py | 38 +++++++++---------- vllm/v1/worker/gpu_model_runner.py | 19 ++++++---- 2 files changed, 29 insertions(+), 28 deletions(-) diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py index 5ff77c5f584f..d6808efdb89b 100755 --- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py +++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py @@ -23,7 +23,7 @@ """Inference-only Qwen3-Omni-Moe model (thinker part).""" from collections.abc import Callable, Iterable, Mapping, Sequence -from functools import partial +from functools import partial, reduce from typing import Any import numpy as np @@ -1769,9 +1769,11 @@ def embed_input_ids( input_ids: torch.Tensor, multimodal_embeddings: MultiModalEmbeddings | None = None, *, - is_multimodal: torch.Tensor | None = None, + is_multimodals: list[torch.Tensor] | None = None, handle_oov_mm_token: bool = False, ) -> torch.Tensor: + is_multimodal = reduce(torch.logical_or, is_multimodals) + inputs_embeds = self._embed_text_input_ids( input_ids, self.language_model.embed_input_ids, @@ -1785,7 +1787,7 @@ def embed_input_ids( deepstack_input_embeds = None # split the feat dim to obtain multi-scale visual feature has_vision_embeddings = [ - embeddings.shape[-1] != self.config.text_config.hidden_size + embeddings.shape[-1] > 0 and embeddings.shape[-1] != self.config.text_config.hidden_size for embeddings in multimodal_embeddings ] if self.visual.deepstack_visual_indexes is not None and any( @@ -1794,13 +1796,12 @@ def embed_input_ids( multiscale_len = len(self.visual.deepstack_visual_indexes) multimodal_embeddings_multiscale = [] is_vision = torch.zeros_like(is_multimodal) - mm_positions = torch.nonzero(is_multimodal, as_tuple=True)[0] - mm_position_idx = 0 + for index, embeddings in enumerate(multimodal_embeddings): - num_tokens = embeddings.shape[0] - current_positions = mm_positions[ - mm_position_idx : mm_position_idx + num_tokens - ] + if len(embeddings) == 0: + continue + + _is_multimodal = is_multimodals[index] # Vision embeddings if embeddings.shape[-1] != self.config.text_config.hidden_size: @@ -1811,13 +1812,7 @@ def embed_input_ids( ) multimodal_embeddings[index] = embeddings_main multimodal_embeddings_multiscale.append(embeddings_multiscale) - is_vision[current_positions] = True - - # Audio embeddings - else: - is_vision[current_positions] = False - - mm_position_idx += num_tokens + is_vision[_is_multimodal] = True deepstack_input_embeds = inputs_embeds.new_zeros( inputs_embeds.size(0), multiscale_len * inputs_embeds.size(1) @@ -1836,11 +1831,12 @@ def embed_input_ids( ) self._set_deepstack_input_embeds(deepstack_input_embeds) - inputs_embeds = _merge_multimodal_embeddings( - inputs_embeds=inputs_embeds, - multimodal_embeddings=multimodal_embeddings, - is_multimodal=is_multimodal, - ) + for is_multimodal, multimodal_embedding in zip(is_multimodals, multimodal_embeddings): + inputs_embeds = _merge_multimodal_embeddings( + inputs_embeds=inputs_embeds, + multimodal_embeddings=multimodal_embedding, + is_multimodal=is_multimodal, + ) return inputs_embeds diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index b46fc175d4b6..f1369dba90f4 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2345,8 +2345,8 @@ def _execute_mm_encoder( def _gather_mm_embeddings( self, scheduler_output: "SchedulerOutput", - shift_computed_tokens: int = 0, - ) -> tuple[list[torch.Tensor], torch.Tensor]: + shift_computed_tokens: int = 0, + ) -> tuple[list[torch.Tensor], list[torch.Tensor]]: total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens # Swap to the other buffer to avoid race condition with previous @@ -2355,6 +2355,7 @@ def _gather_mm_embeddings( is_mm_embed_buf = self.is_mm_embed_buffers[self.is_mm_embed_idx] mm_embeds = list[torch.Tensor]() + is_mm_embeds = list[torch.Tensor]() is_mm_embed = is_mm_embed_buf.cpu is_mm_embed[:total_num_scheduled_tokens] = False @@ -2415,6 +2416,8 @@ def _gather_mm_embeddings( True if is_embed is None else is_embed ) mm_embeds_req.append(mm_embeds_item) + is_mm_embeds.append(is_mm_embed[:total_num_scheduled_tokens].to(self.device)) + is_mm_embed[:total_num_scheduled_tokens] = False if self.is_multimodal_pruning_enabled and self.uses_mrope: assert req_state.mrope_positions is not None @@ -2432,8 +2435,10 @@ def _gather_mm_embeddings( mm_embeds.extend(mm_embeds_req) req_start_idx += num_scheduled_tokens - - is_mm_embed = is_mm_embed_buf.copy_to_gpu(total_num_scheduled_tokens) + + if not mm_embeds_req: + is_mm_embeds.append(torch.tensor([False] * total_num_scheduled_tokens, device=self.device)) + mm_embeds.append(torch.empty((0, 0), device=self.device)) if should_sync_mrope_positions: self._calc_mrope_positions(scheduler_output) @@ -2443,7 +2448,7 @@ def _gather_mm_embeddings( self._calc_xdrope_positions(scheduler_output) self.xdrope_positions.copy_to_gpu(total_num_scheduled_tokens) - return mm_embeds, is_mm_embed + return mm_embeds, is_mm_embeds def get_model(self) -> nn.Module: # get raw model out of the cudagraph wrapper. @@ -2645,7 +2650,7 @@ def _preprocess( encoder_cache=self.encoder_cache, ) as ec_connector_output: self._execute_mm_encoder(scheduler_output) - mm_embeds, is_mm_embed = self._gather_mm_embeddings(scheduler_output) + mm_embeds, is_mm_embeds = self._gather_mm_embeddings(scheduler_output) # NOTE(woosuk): To unify token ids and soft tokens (vision # embeddings), we always use embeddings (rather than token ids) @@ -2653,7 +2658,7 @@ def _preprocess( inputs_embeds_scheduled = self.model.embed_input_ids( self.input_ids.gpu[:num_scheduled_tokens], multimodal_embeddings=mm_embeds, - is_multimodal=is_mm_embed, + is_multimodals=is_mm_embeds, ) # TODO(woosuk): Avoid the copy. Optimize. From 96e528f95f3b95b285383043131e06fb022837ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=94=AF=E5=8B=A4?= Date: Sat, 24 Jan 2026 16:44:53 +0800 Subject: [PATCH 2/3] fix pre commit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 唯勤 --- .../model_executor/models/qwen3_omni_moe_thinker.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py index d6808efdb89b..ec9e5914e7aa 100755 --- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py +++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py @@ -1773,7 +1773,7 @@ def embed_input_ids( handle_oov_mm_token: bool = False, ) -> torch.Tensor: is_multimodal = reduce(torch.logical_or, is_multimodals) - + inputs_embeds = self._embed_text_input_ids( input_ids, self.language_model.embed_input_ids, @@ -1787,7 +1787,8 @@ def embed_input_ids( deepstack_input_embeds = None # split the feat dim to obtain multi-scale visual feature has_vision_embeddings = [ - embeddings.shape[-1] > 0 and embeddings.shape[-1] != self.config.text_config.hidden_size + embeddings.shape[-1] > 0 + and embeddings.shape[-1] != self.config.text_config.hidden_size for embeddings in multimodal_embeddings ] if self.visual.deepstack_visual_indexes is not None and any( @@ -1796,11 +1797,11 @@ def embed_input_ids( multiscale_len = len(self.visual.deepstack_visual_indexes) multimodal_embeddings_multiscale = [] is_vision = torch.zeros_like(is_multimodal) - + for index, embeddings in enumerate(multimodal_embeddings): if len(embeddings) == 0: continue - + _is_multimodal = is_multimodals[index] # Vision embeddings @@ -1831,7 +1832,9 @@ def embed_input_ids( ) self._set_deepstack_input_embeds(deepstack_input_embeds) - for is_multimodal, multimodal_embedding in zip(is_multimodals, multimodal_embeddings): + for is_multimodal, multimodal_embedding in zip( + is_multimodals, multimodal_embeddings + ): inputs_embeds = _merge_multimodal_embeddings( inputs_embeds=inputs_embeds, multimodal_embeddings=multimodal_embedding, From 36126e81937385a8749a85be2584add7ed638639 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=94=AF=E5=8B=A4?= Date: Sat, 24 Jan 2026 17:01:22 +0800 Subject: [PATCH 3/3] fix pre commit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 唯勤 --- vllm/v1/worker/gpu_model_runner.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index f1369dba90f4..65b11b3399ef 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2345,7 +2345,7 @@ def _execute_mm_encoder( def _gather_mm_embeddings( self, scheduler_output: "SchedulerOutput", - shift_computed_tokens: int = 0, + shift_computed_tokens: int = 0, ) -> tuple[list[torch.Tensor], list[torch.Tensor]]: total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens @@ -2416,7 +2416,9 @@ def _gather_mm_embeddings( True if is_embed is None else is_embed ) mm_embeds_req.append(mm_embeds_item) - is_mm_embeds.append(is_mm_embed[:total_num_scheduled_tokens].to(self.device)) + is_mm_embeds.append( + is_mm_embed[:total_num_scheduled_tokens].to(self.device) + ) is_mm_embed[:total_num_scheduled_tokens] = False if self.is_multimodal_pruning_enabled and self.uses_mrope: @@ -2435,9 +2437,13 @@ def _gather_mm_embeddings( mm_embeds.extend(mm_embeds_req) req_start_idx += num_scheduled_tokens - + if not mm_embeds_req: - is_mm_embeds.append(torch.tensor([False] * total_num_scheduled_tokens, device=self.device)) + is_mm_embeds.append( + torch.tensor( + [False] * total_num_scheduled_tokens, device=self.device + ) + ) mm_embeds.append(torch.empty((0, 0), device=self.device)) if should_sync_mrope_positions: