diff --git a/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni.py b/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni.py
index 067c08e3c7d..3a43d7b8c48 100644
--- a/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni.py
+++ b/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni.py
@@ -377,6 +377,35 @@ def forward(
         )
 
     def get_mrope_input_positions(
+        self,
+        input_tokens: list[int],
+        mm_features: list[MultiModalFeatureSpec] | None = None,
+        # V1 runner passes these explicitly; thinker extracts them
+        # from mm_features via gather_kwargs, so they're unused here.
+        hf_config: PretrainedConfig | None = None,
+        image_grid_thw: list[list[int]] | torch.Tensor | None = None,
+        video_grid_thw: list[list[int]] | torch.Tensor | None = None,
+        second_per_grid_ts: list[float] | None = None,
+        context_len: int = 0,
+        seq_len: int | None = None,
+        audio_feature_lengths: torch.Tensor | None = None,
+        use_audio_in_video: bool = False,
+    ) -> tuple[torch.Tensor, int]:
+        """Return M-RoPE input positions for ``input_tokens``.
+
+        In the "thinker" stage the call is forwarded to ``self.thinker`` with
+        ``mm_features`` (``None`` is replaced by ``[]``). Any other stage gets
+        ``arange(len(input_tokens))`` expanded to 3 rows, paired with ``0``.
+        """
+        if self.model_stage == "thinker":
+            if mm_features is None:
+                mm_features = []
+            return self.thinker.get_mrope_input_positions(input_tokens, mm_features)
+        seq_len_ = len(input_tokens)
+        positions = torch.arange(seq_len_).unsqueeze(0).expand(3, -1)
+        return positions, 0
+
+    def _get_mrope_input_positions_v1(
         self,
         input_tokens: list[int],
         mm_features: list[MultiModalFeatureSpec] | None = None,
diff --git a/vllm_omni/worker_v2/model_states/__init__.py b/vllm_omni/worker_v2/model_states/__init__.py
index 9b72ced5567..79baf4f6ad0 100644
--- a/vllm_omni/worker_v2/model_states/__init__.py
+++ b/vllm_omni/worker_v2/model_states/__init__.py
@@ -19,6 +19,7 @@
 # AND update the corresponding test in tests/worker_v2/test_init_model_state.py.
 _OMNI_ARCHITECTURES: set[str] = {
     "Qwen3OmniMoeForConditionalGeneration",
+    "Qwen2_5OmniForConditionalGeneration",
     "MammothModa2ForConditionalGeneration",
     "MiMoAudioForConditionalGeneration",
     "MammothModa2ARForConditionalGeneration",
diff --git a/vllm_omni/worker_v2/omni_model_runner.py b/vllm_omni/worker_v2/omni_model_runner.py
index fb0082ba998..31fb83dc947 100644
--- a/vllm_omni/worker_v2/omni_model_runner.py
+++ b/vllm_omni/worker_v2/omni_model_runner.py
@@ -30,6 +30,7 @@
     get_uniform_token_count,
 )
 
+from vllm_omni.model_executor.models.output_templates import OmniOutput
 from vllm_omni.worker_v2.model_states import init_omni_model_state
 from vllm_omni.worker_v2.model_states.omni_model_state import OmniModelState
 
@@ -66,6 +67,12 @@ def load_model(self, *args: Any, **kwargs: Any) -> None:
         # is safe for these models.
         self._exclude_full_graph = self._model_returns_tuple or hasattr(self.model, "_last_captured_layers")
 
+        # Preprocess models get embeddings via run_preprocess(), not
+        # encoder_runner (whose buffer size would mismatch).
+        if getattr(self.model, "has_preprocess", False) and self.supports_mm_inputs:
+            self.supports_mm_inputs = False
+            self.encoder_cache = None
+
     # ------------------------------------------------------------------
     # CUDA Graph: conditionally exclude FULL mode
     # ------------------------------------------------------------------
@@ -265,26 +272,17 @@ def execute_model(
                 self.kv_connector.pre_forward(scheduler_output)
                 model_output = self.model(**model_inputs)
 
-                # ★ TUPLE INTERCEPT: handle models that return (hidden, aux_dict).
-                # torch.compile may prevent in-model side-effects like
-                # self._last_captured_layers = ... from taking effect,
-                # so the tuple may surface here even when the model tries to
-                # store captured layers internally.
-                if isinstance(model_output, tuple) and len(model_output) == 2:
-                    first, second = model_output
-                    if isinstance(first, torch.Tensor):
-                        self._last_aux_output = second
-                        # Store captured layers on the model so
-                        # make_omni_output can retrieve them.
-                        if hasattr(self.model, "_last_captured_layers"):
-                            self.model._last_captured_layers = second
-                        hidden_states = first
-                    else:
-                        self._last_aux_output = None
-                        hidden_states = model_output
+                # Extract hidden_states from model output.
+                self._last_aux_output = None
+                if isinstance(model_output, OmniOutput):
+                    hidden_states = model_output.text_hidden_states
+                elif isinstance(model_output, tuple) and len(model_output) == 2:
+                    hidden_states, self._last_aux_output = model_output
+                    if hasattr(self.model, "_last_captured_layers"):
+                        self.model._last_captured_layers = self._last_aux_output
                 else:
-                    self._last_aux_output = None
+                    # Bare tensors etc. pass through as before; post-forward checks the type.
                     hidden_states = model_output
 
         # ★ POST-FORWARD: per-request postprocess
         if not dummy_run and isinstance(hidden_states, torch.Tensor):