Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 20 additions & 21 deletions vllm/model_executor/models/qwen3_omni_moe_thinker.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
"""Inference-only Qwen3-Omni-Moe model (thinker part)."""

from collections.abc import Callable, Iterable, Mapping, Sequence
from functools import partial
from functools import partial, reduce
from typing import Any

import numpy as np
Expand Down Expand Up @@ -1769,9 +1769,11 @@ def embed_input_ids(
input_ids: torch.Tensor,
multimodal_embeddings: MultiModalEmbeddings | None = None,
*,
is_multimodal: torch.Tensor | None = None,
is_multimodals: list[torch.Tensor] | None = None,
handle_oov_mm_token: bool = False,
) -> torch.Tensor:
is_multimodal = reduce(torch.logical_or, is_multimodals)

inputs_embeds = self._embed_text_input_ids(
input_ids,
self.language_model.embed_input_ids,
Expand All @@ -1785,7 +1787,8 @@ def embed_input_ids(
deepstack_input_embeds = None
# split the feat dim to obtain multi-scale visual feature
has_vision_embeddings = [
embeddings.shape[-1] != self.config.text_config.hidden_size
embeddings.shape[-1] > 0
and embeddings.shape[-1] != self.config.text_config.hidden_size
for embeddings in multimodal_embeddings
]
if self.visual.deepstack_visual_indexes is not None and any(
Expand All @@ -1794,13 +1797,12 @@ def embed_input_ids(
multiscale_len = len(self.visual.deepstack_visual_indexes)
multimodal_embeddings_multiscale = []
is_vision = torch.zeros_like(is_multimodal)
mm_positions = torch.nonzero(is_multimodal, as_tuple=True)[0]
mm_position_idx = 0

for index, embeddings in enumerate(multimodal_embeddings):
num_tokens = embeddings.shape[0]
current_positions = mm_positions[
mm_position_idx : mm_position_idx + num_tokens
]
if len(embeddings) == 0:
continue

_is_multimodal = is_multimodals[index]

# Vision embeddings
if embeddings.shape[-1] != self.config.text_config.hidden_size:
Expand All @@ -1811,13 +1813,7 @@ def embed_input_ids(
)
multimodal_embeddings[index] = embeddings_main
multimodal_embeddings_multiscale.append(embeddings_multiscale)
is_vision[current_positions] = True

# Audio embeddings
else:
is_vision[current_positions] = False

mm_position_idx += num_tokens
is_vision[_is_multimodal] = True

deepstack_input_embeds = inputs_embeds.new_zeros(
inputs_embeds.size(0), multiscale_len * inputs_embeds.size(1)
Expand All @@ -1836,11 +1832,14 @@ def embed_input_ids(
)
self._set_deepstack_input_embeds(deepstack_input_embeds)

inputs_embeds = _merge_multimodal_embeddings(
inputs_embeds=inputs_embeds,
multimodal_embeddings=multimodal_embeddings,
is_multimodal=is_multimodal,
)
for is_multimodal, multimodal_embedding in zip(
is_multimodals, multimodal_embeddings
):
inputs_embeds = _merge_multimodal_embeddings(
inputs_embeds=inputs_embeds,
multimodal_embeddings=multimodal_embedding,
is_multimodal=is_multimodal,
)

return inputs_embeds

Expand Down
21 changes: 16 additions & 5 deletions vllm/v1/worker/gpu_model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -2346,7 +2346,7 @@ def _gather_mm_embeddings(
self,
scheduler_output: "SchedulerOutput",
shift_computed_tokens: int = 0,
) -> tuple[list[torch.Tensor], torch.Tensor]:
) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens

# Swap to the other buffer to avoid race condition with previous
Expand All @@ -2355,6 +2355,7 @@ def _gather_mm_embeddings(
is_mm_embed_buf = self.is_mm_embed_buffers[self.is_mm_embed_idx]

mm_embeds = list[torch.Tensor]()
is_mm_embeds = list[torch.Tensor]()
is_mm_embed = is_mm_embed_buf.cpu
is_mm_embed[:total_num_scheduled_tokens] = False

Expand Down Expand Up @@ -2415,6 +2416,10 @@ def _gather_mm_embeddings(
True if is_embed is None else is_embed
)
mm_embeds_req.append(mm_embeds_item)
is_mm_embeds.append(
is_mm_embed[:total_num_scheduled_tokens].to(self.device)
)
is_mm_embed[:total_num_scheduled_tokens] = False

if self.is_multimodal_pruning_enabled and self.uses_mrope:
assert req_state.mrope_positions is not None
Expand All @@ -2433,7 +2438,13 @@ def _gather_mm_embeddings(
mm_embeds.extend(mm_embeds_req)
req_start_idx += num_scheduled_tokens

is_mm_embed = is_mm_embed_buf.copy_to_gpu(total_num_scheduled_tokens)
if not mm_embeds_req:
is_mm_embeds.append(
torch.tensor(
[False] * total_num_scheduled_tokens, device=self.device
)
)
mm_embeds.append(torch.empty((0, 0), device=self.device))

if should_sync_mrope_positions:
self._calc_mrope_positions(scheduler_output)
Expand All @@ -2443,7 +2454,7 @@ def _gather_mm_embeddings(
self._calc_xdrope_positions(scheduler_output)
self.xdrope_positions.copy_to_gpu(total_num_scheduled_tokens)

return mm_embeds, is_mm_embed
return mm_embeds, is_mm_embeds

def get_model(self) -> nn.Module:
# get raw model out of the cudagraph wrapper.
Expand Down Expand Up @@ -2645,15 +2656,15 @@ def _preprocess(
encoder_cache=self.encoder_cache,
) as ec_connector_output:
self._execute_mm_encoder(scheduler_output)
mm_embeds, is_mm_embed = self._gather_mm_embeddings(scheduler_output)
mm_embeds, is_mm_embeds = self._gather_mm_embeddings(scheduler_output)

# NOTE(woosuk): To unify token ids and soft tokens (vision
# embeddings), we always use embeddings (rather than token ids)
# as input to the multimodal model, even when the input is text.
inputs_embeds_scheduled = self.model.embed_input_ids(
self.input_ids.gpu[:num_scheduled_tokens],
multimodal_embeddings=mm_embeds,
is_multimodal=is_mm_embed,
is_multimodals=is_mm_embeds,
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interface change breaks other multimodal models

High Severity

The parameter name in gpu_model_runner.py changed from is_multimodal to is_multimodals, but only qwen3_omni_moe_thinker.py was updated to accept the new parameter name. Other multimodal models (e.g., clip.py, eagle2_5_vl.py, gemma3_mm.py, ernie45_vl.py, qwen2_5_omni_thinker.py) still expect is_multimodal (singular). When the runner calls embed_input_ids(is_multimodals=...) on these models, a TypeError will be raised for an unexpected keyword argument.

Additional Locations (1)

Fix in Cursor Fix in Web

)
Comment on lines 2664 to 2668
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

This change, which passes is_multimodals to embed_input_ids, introduces a breaking change to the SupportsMultiModal interface. Currently, only Qwen3OmniMoeThinkerForConditionalGeneration is updated to handle this new parameter. Other multimodal models in the codebase that expect is_multimodal: torch.Tensor will fail at runtime.

To address this, you could either:

  1. Update all other multimodal models to accept the is_multimodals parameter.
  2. Implement a backward-compatibility mechanism. For instance, you could inspect the signature of self.model.embed_input_ids and, if it doesn't accept is_multimodals, compute a single combined mask using reduce(torch.logical_or, is_mm_embeds) and pass it as is_multimodal.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As the critical comment points out, this commit would need to modify both the _gather_mm_embeddings interface and the embed_input_ids interface, which means a lot of model code across the codebase would have to be updated as well. I'm not sure whether I should do that in this PR.


# TODO(woosuk): Avoid the copy. Optimize.
Expand Down