Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni.py
Original file line number Diff line number Diff line change
Expand Up @@ -377,6 +377,29 @@ def forward(
)

def get_mrope_input_positions(
    self,
    input_tokens: list[int],
    mm_features: list[MultiModalFeatureSpec] | None = None,
    # The V1 runner supplies the arguments below explicitly. The thinker
    # derives them from mm_features (via gather_kwargs), so none of them
    # is read in this method.
    hf_config: PretrainedConfig | None = None,
    image_grid_thw: list[list[int]] | torch.Tensor | None = None,
    video_grid_thw: list[list[int]] | torch.Tensor | None = None,
    second_per_grid_ts: list[float] | None = None,
    context_len: int = 0,
    seq_len: int | None = None,
    audio_feature_lengths: torch.Tensor | None = None,
    use_audio_in_video: bool = False,
) -> tuple[torch.Tensor, int]:
    """Return M-RoPE input positions for the active model stage.

    When ``self.model_stage`` is ``"thinker"`` the call is forwarded to
    ``self.thinker.get_mrope_input_positions`` with ``input_tokens`` and
    ``mm_features`` (an empty list is substituted for ``None``).

    For every other stage the result is a ``(3, len(input_tokens))``
    tensor whose three rows each hold ``0 .. len(input_tokens) - 1``,
    paired with the integer ``0``.

    Args:
        input_tokens: Token ids of the sequence; only its length is used
            outside the thinker stage.
        mm_features: Multimodal feature specs handed to the thinker.
        hf_config, image_grid_thw, video_grid_thw, second_per_grid_ts,
        context_len, seq_len, audio_feature_lengths, use_audio_in_video:
            Accepted for signature compatibility; not read here.

    Returns:
        A ``(positions, value)`` tuple. For non-thinker stages ``value``
        is always ``0`` (presumably the M-RoPE position delta -- confirm
        against the caller).
    """
    if self.model_stage != "thinker":
        # Non-thinker stages: the same sequential index on all three rows.
        num_tokens = len(input_tokens)
        sequential = torch.arange(num_tokens)
        return sequential.unsqueeze(0).expand(3, -1), 0

    features = [] if mm_features is None else mm_features
    return self.thinker.get_mrope_input_positions(input_tokens, features)

def _get_mrope_input_positions_v1(
self,
input_tokens: list[int],
mm_features: list[MultiModalFeatureSpec] | None = None,
Expand Down
1 change: 1 addition & 0 deletions vllm_omni/worker_v2/model_states/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
# AND update the corresponding test in tests/worker_v2/test_init_model_state.py.
_OMNI_ARCHITECTURES: set[str] = {
"Qwen3OmniMoeForConditionalGeneration",
"Qwen2_5OmniForConditionalGeneration",
"MammothModa2ForConditionalGeneration",
"MiMoAudioForConditionalGeneration",
"MammothModa2ARForConditionalGeneration",
Expand Down
35 changes: 16 additions & 19 deletions vllm_omni/worker_v2/omni_model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
get_uniform_token_count,
)

from vllm_omni.model_executor.models.output_templates import OmniOutput
from vllm_omni.worker_v2.model_states import init_omni_model_state
from vllm_omni.worker_v2.model_states.omni_model_state import OmniModelState

Expand Down Expand Up @@ -66,6 +67,12 @@ def load_model(self, *args: Any, **kwargs: Any) -> None:
# is safe for these models.
self._exclude_full_graph = self._model_returns_tuple or hasattr(self.model, "_last_captured_layers")

# Models that set has_preprocess obtain their embeddings through
# run_preprocess() rather than the encoder_runner (whose buffer size
# would mismatch), so multimodal encoder inputs are switched off here.
if getattr(self.model, "has_preprocess", False) and self.supports_mm_inputs:
self.supports_mm_inputs = False
self.encoder_cache = None

# ------------------------------------------------------------------
# CUDA Graph: conditionally exclude FULL mode
# ------------------------------------------------------------------
Expand Down Expand Up @@ -265,26 +272,16 @@ def execute_model(
self.kv_connector.pre_forward(scheduler_output)
model_output = self.model(**model_inputs)

# ★ TUPLE INTERCEPT: handle models that return (hidden, aux_dict).
# torch.compile may prevent in-model side-effects like
# self._last_captured_layers = ... from taking effect,
# so the tuple may surface here even when the model tries to
# store captured layers internally.
if isinstance(model_output, tuple) and len(model_output) == 2:
first, second = model_output
if isinstance(first, torch.Tensor):
self._last_aux_output = second
# Store captured layers on the model so
# make_omni_output can retrieve them.
if hasattr(self.model, "_last_captured_layers"):
self.model._last_captured_layers = second
hidden_states = first
else:
self._last_aux_output = None
hidden_states = model_output
# Extract hidden_states from model output.
self._last_aux_output = None
if isinstance(model_output, OmniOutput):
hidden_states = model_output.text_hidden_states
elif isinstance(model_output, tuple) and len(model_output) == 2:
hidden_states, self._last_aux_output = model_output
if hasattr(self.model, "_last_captured_layers"):
self.model._last_captured_layers = self._last_aux_output
else:
self._last_aux_output = None
hidden_states = model_output
raise TypeError(f"Unexpected model output type: {type(model_output)}")

# ★ POST-FORWARD: per-request postprocess
if not dummy_run and isinstance(hidden_states, torch.Tensor):
Expand Down