vllm-project · tzhouam · Apr 15, 2026 · Apr 15, 2026 · Apr 15, 2026
@@ -377,6 +377,29 @@ def forward(
         )
 
     def get_mrope_input_positions(
+        self,
+        input_tokens: list[int],
+        mm_features: list[MultiModalFeatureSpec] | None = None,
+        # The parameters below are kept only because the V1 runner passes them explicitly.
+        # The thinker extracts them from mm_features via gather_kwargs, so they are unused here.
+        hf_config: PretrainedConfig | None = None,
+        image_grid_thw: list[list[int]] | torch.Tensor | None = None,
+        video_grid_thw: list[list[int]] | torch.Tensor | None = None,
+        second_per_grid_ts: list[float] | None = None,
+        context_len: int = 0,
+        seq_len: int | None = None,
+        audio_feature_lengths: torch.Tensor | None = None,
+        use_audio_in_video: bool = False,
+    ) -> tuple[torch.Tensor, int]:
+        if self.model_stage == "thinker":
+            if mm_features is None:
+                mm_features = []
+            return self.thinker.get_mrope_input_positions(input_tokens, mm_features)
+        seq_len_ = len(input_tokens)
+        positions = torch.arange(seq_len_).unsqueeze(0).expand(3, -1)
+        return positions, 0
+
+    def _get_mrope_input_positions_v1(
         self,
         input_tokens: list[int],
         mm_features: list[MultiModalFeatureSpec] | None = None,

@@ -19,6 +19,7 @@
 # AND update the corresponding test in tests/worker_v2/test_init_model_state.py.
 _OMNI_ARCHITECTURES: set[str] = {
     "Qwen3OmniMoeForConditionalGeneration",
+    "Qwen2_5OmniForConditionalGeneration",
     "MammothModa2ForConditionalGeneration",
     "MiMoAudioForConditionalGeneration",
     "MammothModa2ARForConditionalGeneration",

@@ -30,6 +30,7 @@
     get_uniform_token_count,
 )
 
+from vllm_omni.model_executor.models.output_templates import OmniOutput
 from vllm_omni.worker_v2.model_states import init_omni_model_state
 from vllm_omni.worker_v2.model_states.omni_model_state import OmniModelState
 
@@ -66,6 +67,12 @@ def load_model(self, *args: Any, **kwargs: Any) -> None:
         # is safe for these models.
         self._exclude_full_graph = self._model_returns_tuple or hasattr(self.model, "_last_captured_layers")
 
+        # Models with has_preprocess get their embeddings via run_preprocess() instead of the
+        # encoder_runner (whose buffer size would mismatch), so disable mm inputs and the encoder cache.
+        if getattr(self.model, "has_preprocess", False) and self.supports_mm_inputs:
+            self.supports_mm_inputs = False
+            self.encoder_cache = None
+
     # ------------------------------------------------------------------
     # CUDA Graph: conditionally exclude FULL mode
     # ------------------------------------------------------------------
@@ -265,26 +272,16 @@ def execute_model(
                 self.kv_connector.pre_forward(scheduler_output)
                 model_output = self.model(**model_inputs)
 
-            # ★ TUPLE INTERCEPT: handle models that return (hidden, aux_dict).
-            # torch.compile may prevent in-model side-effects like
-            # self._last_captured_layers = ... from taking effect,
-            # so the tuple may surface here even when the model tries to
-            # store captured layers internally.
-            if isinstance(model_output, tuple) and len(model_output) == 2:
-                first, second = model_output
-                if isinstance(first, torch.Tensor):
-                    self._last_aux_output = second
-                    # Store captured layers on the model so
-                    # make_omni_output can retrieve them.
-                    if hasattr(self.model, "_last_captured_layers"):
-                        self.model._last_captured_layers = second
-                    hidden_states = first
-                else:
-                    self._last_aux_output = None
-                    hidden_states = model_output
+            # Extract hidden_states from model output.
+            self._last_aux_output = None
+            if isinstance(model_output, OmniOutput):
+                hidden_states = model_output.text_hidden_states
+            elif isinstance(model_output, tuple) and len(model_output) == 2:
+                hidden_states, self._last_aux_output = model_output
+                if hasattr(self.model, "_last_captured_layers"):
+                    self.model._last_captured_layers = self._last_aux_output
             else:
-                self._last_aux_output = None
-                hidden_states = model_output
+                raise TypeError(f"Unexpected model output type: {type(model_output)}")
 
         # ★ POST-FORWARD: per-request postprocess
         if not dummy_run and isinstance(hidden_states, torch.Tensor):