diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
index 5ff77c5f584f..ec9e5914e7aa 100755
--- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py
+++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
@@ -23,7 +23,7 @@
 """Inference-only Qwen3-Omni-Moe model (thinker part)."""
 
 from collections.abc import Callable, Iterable, Mapping, Sequence
-from functools import partial
+from functools import partial, reduce
 from typing import Any
 
 import numpy as np
@@ -1769,9 +1769,11 @@ def embed_input_ids(
         input_ids: torch.Tensor,
         multimodal_embeddings: MultiModalEmbeddings | None = None,
         *,
-        is_multimodal: torch.Tensor | None = None,
+        is_multimodals: list[torch.Tensor] | None = None,
         handle_oov_mm_token: bool = False,
     ) -> torch.Tensor:
+        is_multimodal = reduce(torch.logical_or, is_multimodals)
+
         inputs_embeds = self._embed_text_input_ids(
             input_ids,
             self.language_model.embed_input_ids,
@@ -1785,7 +1787,8 @@ def embed_input_ids(
         deepstack_input_embeds = None
         # split the feat dim to obtain multi-scale visual feature
         has_vision_embeddings = [
-            embeddings.shape[-1] != self.config.text_config.hidden_size
+            embeddings.shape[-1] > 0
+            and embeddings.shape[-1] != self.config.text_config.hidden_size
             for embeddings in multimodal_embeddings
         ]
         if self.visual.deepstack_visual_indexes is not None and any(
@@ -1794,13 +1797,12 @@ def embed_input_ids(
             multiscale_len = len(self.visual.deepstack_visual_indexes)
             multimodal_embeddings_multiscale = []
             is_vision = torch.zeros_like(is_multimodal)
-            mm_positions = torch.nonzero(is_multimodal, as_tuple=True)[0]
-            mm_position_idx = 0
+
             for index, embeddings in enumerate(multimodal_embeddings):
-                num_tokens = embeddings.shape[0]
-                current_positions = mm_positions[
-                    mm_position_idx : mm_position_idx + num_tokens
-                ]
+                if len(embeddings) == 0:
+                    continue
+
+                _is_multimodal = is_multimodals[index]
 
                 # Vision embeddings
                 if embeddings.shape[-1] != self.config.text_config.hidden_size:
@@ -1811,13 +1813,7 @@ def embed_input_ids(
                     )
                     multimodal_embeddings[index] = embeddings_main
                     multimodal_embeddings_multiscale.append(embeddings_multiscale)
-                    is_vision[current_positions] = True
-
-                # Audio embeddings
-                else:
-                    is_vision[current_positions] = False
-
-                mm_position_idx += num_tokens
+                    is_vision[_is_multimodal] = True
 
             deepstack_input_embeds = inputs_embeds.new_zeros(
                 inputs_embeds.size(0), multiscale_len * inputs_embeds.size(1)
@@ -1836,11 +1832,14 @@ def embed_input_ids(
             )
             self._set_deepstack_input_embeds(deepstack_input_embeds)
 
-        inputs_embeds = _merge_multimodal_embeddings(
-            inputs_embeds=inputs_embeds,
-            multimodal_embeddings=multimodal_embeddings,
-            is_multimodal=is_multimodal,
-        )
+        for is_multimodal, multimodal_embedding in zip(
+            is_multimodals, multimodal_embeddings
+        ):
+            inputs_embeds = _merge_multimodal_embeddings(
+                inputs_embeds=inputs_embeds,
+                multimodal_embeddings=multimodal_embedding,
+                is_multimodal=is_multimodal,
+            )
 
         return inputs_embeds
 
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index b46fc175d4b6..65b11b3399ef 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2346,7 +2346,7 @@ def _gather_mm_embeddings(
         self,
         scheduler_output: "SchedulerOutput",
         shift_computed_tokens: int = 0,
-    ) -> tuple[list[torch.Tensor], torch.Tensor]:
+    ) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
        total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
         # Swap to the other buffer to avoid race condition with
         # previous
@@ -2355,6 +2355,7 @@ def _gather_mm_embeddings(
         is_mm_embed_buf = self.is_mm_embed_buffers[self.is_mm_embed_idx]
 
         mm_embeds = list[torch.Tensor]()
+        is_mm_embeds = list[torch.Tensor]()
         is_mm_embed = is_mm_embed_buf.cpu
         is_mm_embed[:total_num_scheduled_tokens] = False
 
@@ -2415,6 +2416,10 @@ def _gather_mm_embeddings(
                     True if is_embed is None else is_embed
                 )
                 mm_embeds_req.append(mm_embeds_item)
+                is_mm_embeds.append(
+                    is_mm_embed[:total_num_scheduled_tokens].to(self.device)
+                )
+                is_mm_embed[:total_num_scheduled_tokens] = False
 
             if self.is_multimodal_pruning_enabled and self.uses_mrope:
                 assert req_state.mrope_positions is not None
@@ -2433,7 +2438,13 @@ def _gather_mm_embeddings(
             mm_embeds.extend(mm_embeds_req)
             req_start_idx += num_scheduled_tokens
 
-        is_mm_embed = is_mm_embed_buf.copy_to_gpu(total_num_scheduled_tokens)
+        if not mm_embeds_req:
+            is_mm_embeds.append(
+                torch.tensor(
+                    [False] * total_num_scheduled_tokens, device=self.device
+                )
+            )
+            mm_embeds.append(torch.empty((0, 0), device=self.device))
 
         if should_sync_mrope_positions:
             self._calc_mrope_positions(scheduler_output)
@@ -2443,7 +2454,7 @@ def _gather_mm_embeddings(
             self._calc_xdrope_positions(scheduler_output)
             self.xdrope_positions.copy_to_gpu(total_num_scheduled_tokens)
 
-        return mm_embeds, is_mm_embed
+        return mm_embeds, is_mm_embeds
 
     def get_model(self) -> nn.Module:
         # get raw model out of the cudagraph wrapper.
@@ -2645,7 +2656,7 @@ def _preprocess(
                 encoder_cache=self.encoder_cache,
             ) as ec_connector_output:
                 self._execute_mm_encoder(scheduler_output)
-                mm_embeds, is_mm_embed = self._gather_mm_embeddings(scheduler_output)
+                mm_embeds, is_mm_embeds = self._gather_mm_embeddings(scheduler_output)
 
                 # NOTE(woosuk): To unify token ids and soft tokens (vision
                 # embeddings), we always use embeddings (rather than token ids)
@@ -2653,7 +2664,7 @@ def _preprocess(
                 inputs_embeds_scheduled = self.model.embed_input_ids(
                     self.input_ids.gpu[:num_scheduled_tokens],
                     multimodal_embeddings=mm_embeds,
-                    is_multimodal=is_mm_embed,
+                    is_multimodals=is_mm_embeds,
                 )
 
                 # TODO(woosuk): Avoid the copy. Optimize.
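
To make the mask plumbing in this patch easier to follow, below is a minimal standalone sketch (illustration only, not part of the diff): each multimodal item now carries its own full-length boolean mask, the union of those masks via reduce(torch.logical_or, ...) reproduces the old single is_multimodal tensor, and embeddings are merged one item at a time, mirroring the loop over _merge_multimodal_embeddings in the patched embed_input_ids(). All shapes and values here are invented for demonstration.

    # Illustrative sketch of the per-item mask scheme; not part of the patch.
    from functools import reduce

    import torch

    seq_len, hidden = 8, 4
    inputs_embeds = torch.zeros(seq_len, hidden)

    # One boolean mask per multimodal item, each covering the full sequence.
    is_multimodals = [
        torch.tensor([False, True, True, False, False, False, False, False]),
        torch.tensor([False, False, False, False, True, True, True, False]),
    ]
    # Matching embeddings: one row per True position in the item's mask.
    multimodal_embeddings = [torch.ones(2, hidden), 2 * torch.ones(3, hidden)]

    # Union of the per-item masks, as computed at the top of the patched
    # embed_input_ids(); equivalent to the old single is_multimodal tensor.
    is_multimodal = reduce(torch.logical_or, is_multimodals)

    # Merge each item's embeddings at its own mask positions.
    for mask, embeds in zip(is_multimodals, multimodal_embeddings):
        inputs_embeds[mask] = embeds

    print(is_multimodal)        # True at positions 1, 2, 4, 5, 6
    print(inputs_embeds[:, 0])  # 0, 1, 1, 0, 2, 2, 2, 0

Keeping one mask per item is what lets the patched code skip empty items and mark vision positions directly with is_vision[_is_multimodal] = True, instead of tracking a running mm_position_idx offset into a single flattened mask as the removed code did.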