From f0d0f3993bd59299ac9d60a8a870337218257969 Mon Sep 17 00:00:00 2001
From: "jaeeun.kil" <jaeeun.kil@navercorp.com>
Date: Mon, 6 Apr 2026 13:32:01 +0900
Subject: [PATCH 1/4] feat: add HyperCLOVAX-SEED-Omni-8B vision decoder and
 full pipeline support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Stacks on top of #869 (HyperCLOVAX audio decoder).

- Add HCXOmniForCausalLM thinker model (LLM stage, extends HCXVisionV2)
- Add HyperCLOVAXVisionPipeline diffusion model (TA-Tok decoder, 27×27 image tokens)
- Add hcx_omni.yaml 3-stage pipeline config (thinker TP=4 + vision/audio decoders)
- Add thinker2vision_decoder and thinker2audio_decoder stage input processors
- Add fan-out async pipeline topology (stage 0 → stage 1 AND stage 0 → stage 2)
- Add _stage0_is_llm guard in serving_chat to preserve HCX multimodal inputs
- Fix vLLM 0.18.0 compatibility (AttentionBackendEnum, _RUNNER_TASKS, TaskOption)
- Add E2E tests, unit tests, client demo, and benchmark scripts

Co-Authored-By: Hyunjoon Cho <with1015@github.com>

Signed-off-by: jaeeun.kil <jaeeun.kil@navercorp.com>
---
 benchmarks/hcx-omni/README.md                 |   60 +
 benchmarks/hcx-omni/benchmark_hcx_omni.py     |  400 +++++
 benchmarks/hcx-omni/run_benchmark.sh          |   38 +
 examples/online_serving/hcx_omni/README.md    |  235 +++
 .../online_serving/hcx_omni/client_demo.py    |  153 ++
 .../online_serving/hcx_omni/run_server.sh     |   52 +
 pyproject.toml                                |    1 +
 tests/e2e/offline_inference/test_hcx_omni.py  |  169 ++
 tests/e2e/online_serving/test_hcx_omni.py     |  125 ++
 tests/e2e/stage_configs/hcx_omni_ci.yaml      |   93 +
 tests/unit/__init__.py                        |    0
 tests/unit/conftest.py                        |   19 +
 tests/unit/model_executor/__init__.py         |    0
 .../test_hcx_omni_processing.py               |  187 ++
 tools/pre_commit/check_pickle_imports.py      |    2 +
 vllm_omni/config/model.py                     |  594 ++++---
 vllm_omni/diffusion/diffusion_engine.py       |  418 +++--
 vllm_omni/diffusion/ipc.py                    |   47 +-
 .../models/hyperclovax_vision/__init__.py     |   21 +
 .../hyperclovax_vision_transformer.py         |  146 ++
 .../models/hyperclovax_vision/layers.py       |  234 +++
 .../pipeline_hyperclovax_vision.py            |  433 +++++
 .../hyperclovax_vision/transformer_usp.py     |  307 ++++
 .../vision_token_embedder.py                  |  119 ++
 vllm_omni/diffusion/registry.py               |  141 +-
 vllm_omni/diffusion/request.py                |    2 +
 .../diffusion/worker/diffusion_worker.py      |   10 +-
 vllm_omni/engine/arg_utils.py                 |  208 +--
 vllm_omni/engine/input_processor.py           |   32 +
 vllm_omni/entrypoints/async_omni.py           | 1369 +++++++-------
 vllm_omni/entrypoints/async_omni_diffusion.py |  491 +++++
 vllm_omni/entrypoints/async_omni_llm.py       |  225 +++
 vllm_omni/entrypoints/cli/main.py             |    7 +-
 vllm_omni/entrypoints/omni.py                 | 1211 +++++++++++--
 vllm_omni/entrypoints/omni_diffusion.py       |  150 ++
 vllm_omni/entrypoints/omni_llm.py             |  242 +++
 vllm_omni/entrypoints/omni_stage.py           | 1572 +++++++++++++++++
 vllm_omni/entrypoints/openai/serving_chat.py  |  547 +++---
 vllm_omni/entrypoints/stage_utils.py          |  298 +++-
 vllm_omni/entrypoints/zmq_utils.py            |   95 +
 .../models/hcx_omni/__init__.py               |    3 +
 .../models/hcx_omni/hcx_omni.py               |  134 ++
 .../models/hcx_omni/hcx_omni_thinker.py       |  126 ++
 .../qwen2_5_omni/qwen2_5_omni_thinker.py      |  208 +--
 .../qwen3_omni/qwen3_omni_moe_thinker.py      | 1245 ++++---------
 vllm_omni/model_executor/models/registry.py   |   25 +-
 .../stage_configs/hcx_omni.yaml               |  102 ++
 .../hyperclovax_seed_omni.py                  |  145 ++
 vllm_omni/worker/gpu_ar_model_runner.py       |  417 ++---
 49 files changed, 9685 insertions(+), 3173 deletions(-)
 create mode 100644 benchmarks/hcx-omni/README.md
 create mode 100644 benchmarks/hcx-omni/benchmark_hcx_omni.py
 create mode 100755 benchmarks/hcx-omni/run_benchmark.sh
 create mode 100644 examples/online_serving/hcx_omni/README.md
 create mode 100644 examples/online_serving/hcx_omni/client_demo.py
 create mode 100755 examples/online_serving/hcx_omni/run_server.sh
 create mode 100644 tests/e2e/offline_inference/test_hcx_omni.py
 create mode 100644 tests/e2e/online_serving/test_hcx_omni.py
 create mode 100644 tests/e2e/stage_configs/hcx_omni_ci.yaml
 create mode 100644 tests/unit/__init__.py
 create mode 100644 tests/unit/conftest.py
 create mode 100644 tests/unit/model_executor/__init__.py
 create mode 100644 tests/unit/model_executor/test_hcx_omni_processing.py
 create mode 100644 vllm_omni/diffusion/models/hyperclovax_vision/__init__.py
 create mode 100644 vllm_omni/diffusion/models/hyperclovax_vision/hyperclovax_vision_transformer.py
 create mode 100644 vllm_omni/diffusion/models/hyperclovax_vision/layers.py
 create mode 100644 vllm_omni/diffusion/models/hyperclovax_vision/pipeline_hyperclovax_vision.py
 create mode 100644 vllm_omni/diffusion/models/hyperclovax_vision/transformer_usp.py
 create mode 100644 vllm_omni/diffusion/models/hyperclovax_vision/vision_token_embedder.py
 create mode 100644 vllm_omni/engine/input_processor.py
 create mode 100644 vllm_omni/entrypoints/async_omni_diffusion.py
 create mode 100644 vllm_omni/entrypoints/async_omni_llm.py
 create mode 100644 vllm_omni/entrypoints/omni_diffusion.py
 create mode 100644 vllm_omni/entrypoints/omni_llm.py
 create mode 100644 vllm_omni/entrypoints/omni_stage.py
 create mode 100644 vllm_omni/entrypoints/zmq_utils.py
 create mode 100644 vllm_omni/model_executor/models/hcx_omni/__init__.py
 create mode 100644 vllm_omni/model_executor/models/hcx_omni/hcx_omni.py
 create mode 100644 vllm_omni/model_executor/models/hcx_omni/hcx_omni_thinker.py
 create mode 100644 vllm_omni/model_executor/stage_configs/hcx_omni.yaml
 create mode 100644 vllm_omni/model_executor/stage_input_processors/hyperclovax_seed_omni.py

diff --git a/benchmarks/hcx-omni/README.md b/benchmarks/hcx-omni/README.md
new file mode 100644
index 00000000000..85abf6b086e
--- /dev/null
+++ b/benchmarks/hcx-omni/README.md
@@ -0,0 +1,60 @@
+# HyperCLOVAX-SEED-Omni-8B Benchmarks
+
+Measures end-to-end latency and throughput for:
+
+| Mode | Input → Output |
+|------|----------------|
+| T2T  | Text → Text (thinker only) |
+| T2V  | Text → Text + Image |
+| S2S  | Audio + Text → Text + Audio |
+
+## Prerequisites
+
+Start the server first:
+
+```bash
+cd examples/online_serving/hcx_omni
+./run_server.sh --model naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B
+```
+
+## Run Benchmarks
+
+```bash
+# All modes (10 requests each, sequential)
+bash benchmarks/hcx-omni/run_benchmark.sh
+
+# Custom settings via env vars
+NUM_PROMPTS=50 CONCURRENCY=4 MODE=t2v bash benchmarks/hcx-omni/run_benchmark.sh
+
+# S2S with a real audio file
+python benchmarks/hcx-omni/benchmark_hcx_omni.py \
+    --mode s2s --num-prompts 20 --audio-file /path/to/speech.wav
+
+# Save results to JSON
+python benchmarks/hcx-omni/benchmark_hcx_omni.py \
+    --mode all --num-prompts 10 --output-json results.json
+```
+
+## Metrics
+
+```
+t2v Results:
+  mode                : t2v
+  total               : 10
+  success             : 10
+  success_rate        : 100.0%
+  latency_mean        : 18.43s
+  latency_p50         : 17.91s
+  latency_p90         : 21.34s
+  latency_p99         : 22.10s
+  latency_min         : 15.20s
+  latency_max         : 22.10s
+```
+
+Expected latency ranges (A100 80GB × 6):
+
+| Mode | p50 latency | Notes |
+|------|------------|-------|
+| T2T  | ~2–4 s    | Thinker only |
+| T2V  | ~15–25 s  | Thinker + 50-step diffusion |
+| S2S  | ~5–12 s   | Thinker + BigVGAN vocoder |
diff --git a/benchmarks/hcx-omni/benchmark_hcx_omni.py b/benchmarks/hcx-omni/benchmark_hcx_omni.py
new file mode 100644
index 00000000000..baaf4c9210a
--- /dev/null
+++ b/benchmarks/hcx-omni/benchmark_hcx_omni.py
@@ -0,0 +1,400 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Benchmark script for HyperCLOVAX-SEED-Omni-8B online serving.
+
+Measures end-to-end latency and throughput for:
+  - Speech-to-Speech (S2S): audio input → text + audio output
+  - Text-to-Vision  (T2V): text prompt → text + image output
+  - Text-to-Text    (T2T): text prompt → text only (thinker stage only)
+
+Metrics reported per mode:
+  - Latency   : mean / p50 / p90 / p99 / min / max (seconds, wall-clock)
+  - Throughput: requests / second (BenchmarkStats.throughput; not part of summary())
+  - Success rate
+
+Usage:
+    # Start the server first (see run_server.sh), then:
+
+    # All modes (10 requests each)
+    python benchmark_hcx_omni.py --base-url http://localhost:8000/v1 --num-prompts 10
+
+    # S2S only, 50 requests, concurrency 4
+    python benchmark_hcx_omni.py --mode s2s --num-prompts 50 --concurrency 4
+
+    # T2V only
+    python benchmark_hcx_omni.py --mode t2v --num-prompts 20
+
+    # With a real audio file for S2S
+    python benchmark_hcx_omni.py --mode s2s --audio-file /path/to/speech.wav
+"""
+
+import argparse
+import asyncio
+import base64
+import io
+import json
+import statistics
+import time
+from dataclasses import dataclass, field
+
+import aiohttp
+
+# ---------------------------------------------------------------------------
+# Defaults
+# ---------------------------------------------------------------------------
+DEFAULT_BASE_URL = "http://localhost:8000/v1"
+DEFAULT_MODEL = "naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B"
+
+# System prompt required for audio/image generation to activate
+SYSTEM_PROMPT = {
+    "role": "system",
+    "content": [
+        {
+            "type": "text",
+            "text": (
+                "당신은 CLOVA X입니다. 네이버가 만든 AI 어시스턴트로서 "
+                "오디오와 이미지를 인식하고 텍스트, 음성, 이미지를 생성할 수 있습니다."
+            ),
+        }
+    ],
+}
+
+T2V_PROMPTS = [
+    "귀여운 고양이 한 마리가 소파에 앉아 있는 그림을 그려줘.",
+    "밤하늘에 별이 빛나는 산 풍경 이미지를 만들어줘.",
+    "노란 해바라기가 가득한 들판을 그려줘.",
+    "현대적인 카페 인테리어 이미지를 생성해줘.",
+    "귀여운 강아지가 공원에서 뛰노는 그림을 그려줘.",
+    "파란 바다와 흰 모래 해변의 풍경을 그려줘.",
+    "봄날의 벚꽃이 흩날리는 공원 이미지를 만들어줘.",
+    "아늑한 서재에서 책을 읽는 사람의 그림을 그려줘.",
+    "빨간 지붕의 유럽풍 작은 마을 풍경을 그려줘.",
+    "우주 공간에서 지구를 바라보는 우주비행사 그림을 그려줘.",
+]
+
+S2S_PROMPTS = [
+    "이 오디오에서 무슨 내용이 들리나요?",
+    "방금 들은 내용을 한국어로 요약해줘.",
+    "이 소리가 무엇인지 설명해줘.",
+]
+
+T2T_PROMPTS = [
+    "대한민국의 수도는 어디인가요?",
+    "하늘은 왜 파란가요?",
+    "인공지능이란 무엇인가요?",
+    "건강한 식습관을 위한 조언을 해줘.",
+    "파이썬 프로그래밍 언어의 특징은 무엇인가요?",
+]
+
+
+# ---------------------------------------------------------------------------
+# Data classes
+# ---------------------------------------------------------------------------
+@dataclass
+class RequestResult:  # outcome of a single chat-completions request
+    mode: str  # "t2t", "t2v" or "s2s"
+    latency: float  # wall-clock seconds; the run_* helpers store 0.0 when the request raised
+    success: bool  # False when the body had an "error" key or an exception was caught
+    has_audio: bool = False  # set by run_s2s when a choice carries message.audio.data
+    has_image: bool = False  # set by run_t2v when a choice carries a data:image URL
+    text: str = ""  # text content collected from the response
+    error: str = ""  # stringified error for failed requests
+
+
+@dataclass
+class BenchmarkStats:  # aggregated results for one benchmark mode
+    mode: str
+    total: int  # number of requests issued
+    success: int  # number of requests that succeeded
+    latencies: list[float] = field(default_factory=list)  # seconds; run_benchmark appends successful requests only
+
+    @property
+    def success_rate(self) -> float:
+        return self.success / self.total if self.total else 0.0
+
+    @property
+    def throughput(self) -> float:  # successes per second of summed latency; equals real req/s only for sequential runs
+        return self.success / sum(self.latencies) if sum(self.latencies) > 0 else 0.0
+
+    def summary(self) -> dict:  # printable stats; percentiles use the nearest-rank method
+        if not self.latencies:
+            return {"mode": self.mode, "total": self.total, "success": 0}
+        s = sorted(self.latencies)
+        n = len(s)
+        return {
+            "mode": self.mode,
+            "total": self.total,
+            "success": self.success,
+            "success_rate": f"{self.success_rate:.1%}",
+            "latency_mean": f"{statistics.mean(s):.2f}s",
+            "latency_p50": f"{s[(n * 50 + 99) // 100 - 1]:.2f}s",  # index = ceil(n * p / 100) - 1, always in [0, n - 1]
+            "latency_p90": f"{s[(n * 90 + 99) // 100 - 1]:.2f}s",
+            "latency_p99": f"{s[(n * 99 + 99) // 100 - 1]:.2f}s",
+            "latency_min": f"{s[0]:.2f}s",
+            "latency_max": f"{s[-1]:.2f}s",
+        }
+
+
+# ---------------------------------------------------------------------------
+# Audio helpers
+# ---------------------------------------------------------------------------
+def make_sine_wav_b64(duration_sec: float = 1.0, sample_rate: int = 16000) -> str:
+    """Return a base64 16-bit mono WAV: a 440 Hz sine, or silence of equal length if numpy/scipy are missing."""
+    try:
+        import numpy as np
+        import scipy.io.wavfile as wav
+
+        t = np.linspace(0, duration_sec, int(sample_rate * duration_sec), endpoint=False)
+        audio = (np.sin(2 * np.pi * 440 * t) * 0.5).astype(np.float32)
+        buf = io.BytesIO()
+        wav.write(buf, sample_rate, (audio * 32767).astype(np.int16))
+        return base64.b64encode(buf.getvalue()).decode()
+    except ImportError:
+        # Minimal WAV header without numpy (44-byte header + silence)
+        # The caller's sample_rate is kept so this fallback matches the numpy path.
+        n_samples = int(sample_rate * duration_sec)
+        data = b"\x00\x00" * n_samples  # 16-bit silence
+        data_size = len(data)
+        header = (
+            b"RIFF"
+            + (data_size + 36).to_bytes(4, "little")
+            + b"WAVEfmt "
+            + (16).to_bytes(4, "little")
+            + (1).to_bytes(2, "little")  # PCM
+            + (1).to_bytes(2, "little")  # mono
+            + sample_rate.to_bytes(4, "little")
+            + (sample_rate * 2).to_bytes(4, "little")  # byte rate = sample_rate * 2 bytes per sample
+            + (2).to_bytes(2, "little")  # block align (bytes per sample frame)
+            + (16).to_bytes(2, "little")  # bits per sample
+            + b"data"
+            + data_size.to_bytes(4, "little")
+        )
+        return base64.b64encode(header + data).decode()
+
+
+# ---------------------------------------------------------------------------
+# Async request functions
+# ---------------------------------------------------------------------------
+async def send_request(  # POST one chat-completions payload and time the round trip
+    session: aiohttp.ClientSession,
+    base_url: str,
+    model: str,  # not read in this function; callers already place the model name in `payload`
+    payload: dict,
+) -> tuple[float, dict]:  # (elapsed seconds, decoded JSON body)
+    url = f"{base_url}/chat/completions"
+    t0 = time.perf_counter()
+    async with session.post(url, json=payload, timeout=aiohttp.ClientTimeout(total=300)) as resp:  # 300 s total timeout
+        body = await resp.json()  # HTTP status is not checked here; callers look for an "error" key instead
+    latency = time.perf_counter() - t0  # includes reading and decoding the full body
+    return latency, body
+
+
+async def run_t2t(  # text-only request: asks for the "text" modality and records the reply
+    session: aiohttp.ClientSession,
+    base_url: str,
+    model: str,
+    prompt: str,
+) -> RequestResult:
+    payload = {
+        "model": model,
+        "modalities": ["text"],
+        "messages": [
+            SYSTEM_PROMPT,
+            {"role": "user", "content": prompt},
+        ],
+        "max_tokens": 256,
+    }
+    try:
+        latency, body = await send_request(session, base_url, model, payload)
+        if "error" in body:  # error body: count as a failure but keep the measured latency
+            return RequestResult("t2t", latency, False, error=str(body["error"]))
+        text = body["choices"][0]["message"].get("content", "")  # only the first choice is read
+        return RequestResult("t2t", latency, True, text=text)
+    except Exception as e:  # any exception (transport, decoding, missing keys) becomes a failed result with latency 0.0
+        return RequestResult("t2t", 0.0, False, error=str(e))
+
+
+async def run_t2v(  # text-to-vision request: asks for "text" + "image" and checks for an inline image
+    session: aiohttp.ClientSession,
+    base_url: str,
+    model: str,
+    prompt: str,
+) -> RequestResult:
+    payload = {
+        "model": model,
+        "modalities": ["text", "image"],
+        "messages": [
+            SYSTEM_PROMPT,
+            {
+                "role": "user",
+                "content": [{"type": "text", "text": prompt}],
+            },
+        ],
+        "max_tokens": 800,
+    }
+    try:
+        latency, body = await send_request(session, base_url, model, payload)
+        if "error" in body:  # error body: count as a failure but keep the measured latency
+            return RequestResult("t2v", latency, False, error=str(body["error"]))
+
+        has_image = False
+        text = ""
+        for choice in body.get("choices", []):  # every choice is scanned, not just the first
+            msg = choice.get("message", {})
+            content = msg.get("content")
+            if isinstance(content, list):  # list content: look for image_url items
+                for item in content:
+                    if isinstance(item, dict) and item.get("type") == "image_url":
+                        url = item.get("image_url", {}).get("url", "")
+                        if url.startswith("data:image"):  # only inline data URLs count as an image
+                            has_image = True
+            elif isinstance(content, str):  # string content is accumulated as text
+                text += content
+        return RequestResult("t2v", latency, True, has_image=has_image, text=text)  # success=True even when has_image is False
+    except Exception as e:  # any exception becomes a failed result with latency 0.0
+        return RequestResult("t2v", 0.0, False, error=str(e))
+
+
+async def run_s2s(  # speech-to-speech request: sends audio + text, asks for "text" + "audio"
+    session: aiohttp.ClientSession,
+    base_url: str,
+    model: str,
+    prompt: str,
+    audio_b64: str,  # base64 audio, sent with format "wav"
+) -> RequestResult:
+    payload = {
+        "model": model,
+        "modalities": ["text", "audio"],
+        "messages": [
+            SYSTEM_PROMPT,
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "input_audio",
+                        "input_audio": {"data": audio_b64, "format": "wav"},
+                    },
+                    {"type": "text", "text": prompt},
+                ],
+            },
+        ],
+        "max_tokens": 512,
+    }
+    try:
+        latency, body = await send_request(session, base_url, model, payload)
+        if "error" in body:  # error body: count as a failure but keep the measured latency
+            return RequestResult("s2s", latency, False, error=str(body["error"]))
+
+        has_audio = False
+        text = ""
+        for choice in body.get("choices", []):  # every choice is scanned for audio and text
+            msg = choice.get("message", {})
+            audio = msg.get("audio")
+            if audio and audio.get("data"):  # a non-empty audio.data marks the response as having audio
+                has_audio = True
+            content = msg.get("content")
+            if isinstance(content, str) and content and content != "None":  # skips empty text and the literal string "None"
+                text += content
+        return RequestResult("s2s", latency, True, has_audio=has_audio, text=text)  # success=True even when has_audio is False
+    except Exception as e:  # any exception becomes a failed result with latency 0.0
+        return RequestResult("s2s", 0.0, False, error=str(e))
+
+
+# ---------------------------------------------------------------------------
+# Benchmark runner
+# ---------------------------------------------------------------------------
+async def run_benchmark(  # issue num_prompts requests for one mode, at most `concurrency` in flight
+    mode: str,  # "t2t" or "t2v"; any other value is treated as s2s
+    base_url: str,
+    model: str,
+    num_prompts: int,
+    concurrency: int,
+    audio_b64: str,  # base64 audio, only passed to run_s2s
+) -> BenchmarkStats:
+    stats = BenchmarkStats(mode=mode, total=num_prompts, success=0)
+    semaphore = asyncio.Semaphore(concurrency)
+
+    async def bounded(i: int) -> RequestResult:  # `session` is bound below, before any of these coroutines runs
+        async with semaphore:
+            if mode == "t2t":
+                prompt = T2T_PROMPTS[i % len(T2T_PROMPTS)]  # cycle through the prompt list
+                return await run_t2t(session, base_url, model, prompt)
+            elif mode == "t2v":
+                prompt = T2V_PROMPTS[i % len(T2V_PROMPTS)]
+                return await run_t2v(session, base_url, model, prompt)
+            else:  # s2s
+                prompt = S2S_PROMPTS[i % len(S2S_PROMPTS)]
+                return await run_s2s(session, base_url, model, prompt, audio_b64)
+
+    connector = aiohttp.TCPConnector(limit=concurrency)  # connection pool capped at the same limit
+    async with aiohttp.ClientSession(connector=connector) as session:
+        tasks = [bounded(i) for i in range(num_prompts)]
+        results = await asyncio.gather(*tasks)
+
+    for r in results:
+        if r.success:
+            stats.success += 1
+            stats.latencies.append(r.latency)  # failed requests contribute no latency sample
+        else:
+            print(f"  [FAIL] {r.error[:80]}")  # error text truncated to 80 characters
+
+    return stats
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+def main():  # CLI entry point: parse args, run each selected mode, print and optionally save results
+    parser = argparse.ArgumentParser(description="HyperCLOVAX-SEED-Omni-8B benchmark")
+    parser.add_argument("--base-url", default=DEFAULT_BASE_URL)
+    parser.add_argument("--model", default=DEFAULT_MODEL)
+    parser.add_argument(
+        "--mode",
+        choices=["t2t", "t2v", "s2s", "all"],
+        default="all",
+        help="Benchmark mode (default: all)",
+    )
+    parser.add_argument("--num-prompts", type=int, default=10)
+    parser.add_argument("--concurrency", type=int, default=1)
+    parser.add_argument("--audio-file", default=None, help="WAV file for S2S input")
+    parser.add_argument("--output-json", default=None, help="Save results to JSON file")
+    args = parser.parse_args()
+
+    # Prepare audio
+    if args.audio_file:
+        with open(args.audio_file, "rb") as f:
+            audio_b64 = base64.b64encode(f.read()).decode()  # file bytes are sent as-is, base64-encoded
+        print(f"Using audio file: {args.audio_file}")
+    else:
+        audio_b64 = make_sine_wav_b64(1.0)  # synthetic 1-second input when no file is given
+        print("Using synthetic 1s 440Hz sine wave audio")
+
+    modes = ["t2t", "t2v", "s2s"] if args.mode == "all" else [args.mode]  # "all" expands to the three modes
+
+    print(f"\nBenchmark: {args.base_url}")
+    print(f"Model    : {args.model}")
+    print(f"Prompts  : {args.num_prompts} per mode, concurrency={args.concurrency}")
+    print()
+
+    all_stats = []
+    for mode in modes:  # modes run one after another, each in its own asyncio.run call
+        print(f"Running {mode.upper()} ({args.num_prompts} requests)...")
+        stats = asyncio.run(
+            run_benchmark(mode, args.base_url, args.model, args.num_prompts, args.concurrency, audio_b64)
+        )
+        all_stats.append(stats)
+        s = stats.summary()
+        print(f"  {mode.upper()} Results:")
+        for k, v in s.items():
+            print(f"    {k:20s}: {v}")
+        print()
+
+    if args.output_json:
+        with open(args.output_json, "w") as f:
+            json.dump([s.summary() for s in all_stats], f, indent=2, ensure_ascii=False)  # one summary dict per mode
+        print(f"Results saved to {args.output_json}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/hcx-omni/run_benchmark.sh b/benchmarks/hcx-omni/run_benchmark.sh
new file mode 100755
index 00000000000..a93385d149d
--- /dev/null
+++ b/benchmarks/hcx-omni/run_benchmark.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+# HyperCLOVAX-SEED-Omni-8B benchmark wrapper around benchmark_hcx_omni.py.
+# Can be started from any directory: the script cds to the repository root itself.
+
+set -e  # stop at the first failing command
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"  # absolute path of this script's directory
+ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"  # two levels up from the script directory
+cd "$ROOT"
+
+BASE_URL="${BASE_URL:-http://localhost:8000/v1}"  # each setting can be overridden via the environment
+NUM_PROMPTS="${NUM_PROMPTS:-10}"
+CONCURRENCY="${CONCURRENCY:-1}"
+MODE="${MODE:-all}"
+OUTPUT_DIR="$SCRIPT_DIR/results"  # results are written next to this script
+
+mkdir -p "$OUTPUT_DIR"
+TIMESTAMP=$(date +%Y%m%d_%H%M%S)
+OUTPUT_JSON="$OUTPUT_DIR/benchmark_${MODE}_${TIMESTAMP}.json"  # one timestamped file per run
+
+echo "=================================================="
+echo "  HyperCLOVAX-SEED-Omni-8B Benchmark"
+echo "  BASE_URL    : $BASE_URL"
+echo "  MODE        : $MODE"
+echo "  NUM_PROMPTS : $NUM_PROMPTS"
+echo "  CONCURRENCY : $CONCURRENCY"
+echo "  OUTPUT      : $OUTPUT_JSON"
+echo "=================================================="
+
+python benchmarks/hcx-omni/benchmark_hcx_omni.py \
+    --base-url "$BASE_URL" \
+    --mode "$MODE" \
+    --num-prompts "$NUM_PROMPTS" \
+    --concurrency "$CONCURRENCY" \
+    --output-json "$OUTPUT_JSON"
+
+echo ""
+echo "Done. Results: $OUTPUT_JSON"
diff --git a/examples/online_serving/hcx_omni/README.md b/examples/online_serving/hcx_omni/README.md
new file mode 100644
index 00000000000..5c1bb458e84
--- /dev/null
+++ b/examples/online_serving/hcx_omni/README.md
@@ -0,0 +1,235 @@
+# HyperCLOVAX-SEED-Omni-8B with vLLM-Omni
+
+[HyperCLOVAX-SEED-Omni-8B](https://huggingface.co/naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B)
+is an omni-modal model by NAVER Cloud that supports:
+
+| Input  | Output          |
+|--------|-----------------|
+| Text   | Text            |
+| Audio  | Text + Audio    |
+| Image  | Text            |
+| Text   | Text + Image    |
+| Audio  | Text + Audio + Image |
+
+## Architecture
+
+The model uses a 3-stage pipeline:
+
+```
+Stage 0 (Thinker) ──→ Stage 1 (Vision Decoder, diffusion)
+         │
+         └──────────→ Stage 2 (Audio Decoder, unit-BigVGAN)
+```
+
+- **Thinker**: Qwen2.5-VL vision encoder + Qwen2Audio encoder + HyperCLOVAX language model.
+  Outputs text tokens and discrete audio/vision codes in the vocabulary.
+- **Vision Decoder**: Diffusion-based image generation from 729 discrete TA-Tok codes.
+- **Audio Decoder**: Unit-BigVGAN vocoder from CosyVoice2 FSQ discrete audio codes.
+
+## Hardware Requirements
+
+| Setup     | GPUs                                        |
+|-----------|---------------------------------------------|
+| Default   | 6 × GPU ≥24 GB (4 for thinker TP, 1+1 for decoders) |
+| Minimal   | 3 × GPU ≥24 GB (1 for thinker, 1+1 for decoders) |
+
+## Quick Start
+
+### 1. Start the Server
+
+```bash
+# 6-GPU setup (production)
+./run_server.sh --model naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B
+
+# Custom GPU allocation
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 ./run_server.sh
+```
+
+### 2. Run the Client Demo
+
+```bash
+# All modes: text-only, text-to-vision, speech-to-speech
+python client_demo.py --base-url http://localhost:8000/v1
+
+# Speech-to-Speech with your own audio file
+python client_demo.py --mode s2s --audio-file /path/to/speech.wav
+
+# Text-to-Vision
+python client_demo.py --mode t2v --prompt "고양이 그림을 그려줘"
+```
+
+### 3. Use the OpenAI API Directly
+
+**Speech-to-Speech:**
+```bash
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B",
+    "modalities": ["text", "audio"],
+    "messages": [{
+      "role": "user",
+      "content": [
+        {"type": "input_audio", "input_audio": {"data": "<base64-wav>", "format": "wav"}},
+        {"type": "text", "text": "이 오디오에 무슨 내용이 있나요?"}
+      ]
+    }]
+  }'
+```
+
+**Text-to-Vision:**
+```bash
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B",
+    "modalities": ["text", "image"],
+    "messages": [{
+      "role": "user",
+      "content": [
+        {"type": "text", "text": "귀여운 강아지 한 마리가 공원에서 뛰노는 그림을 그려줘."}
+      ]
+    }]
+  }'
+```
+
+## System Prompt (Required for Audio/Image Generation)
+
+The thinker model decides whether to emit discrete audio or image tokens based
+on context. **A system prompt is required** to reliably activate audio/image
+generation. Without it, the model typically responds in text only.
+
+```python
+SYSTEM_PROMPT = {
+    "role": "system",
+    "content": [
+        {
+            "type": "text",
+            "text": (
+                "당신은 CLOVA X입니다. 네이버가 만든 AI 어시스턴트로서 "
+                "오디오와 이미지를 인식하고 텍스트, 음성, 이미지를 생성할 수 있습니다."
+            ),
+        }
+    ],
+}
+```
+
+Include it as the first message in every request that expects audio or image output.
+
+## Mode Activation Conditions
+
+### Speech-to-Speech (S2S)
+
+**Requirements:**
+- `modalities: ["text", "audio"]`
+- Audio input via `input_audio` content block (base64-encoded WAV/MP3)
+- System prompt included
+
+The thinker generates discrete audio unit tokens (`<|audio0000|>` … `<|audio6560|>`)
+in its output, which are routed to the audio decoder (BigVGAN). The audio
+response is in `choices[N].message.audio.data` (base64 WAV).
+
+```python
+response = client.chat.completions.create(
+    model=MODEL,
+    modalities=["text", "audio"],
+    messages=[
+        SYSTEM_PROMPT,
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "input_audio",
+                    "input_audio": {"data": audio_b64, "format": "wav"},
+                },
+                {"type": "text", "text": "이 오디오에 무슨 내용이 있나요?"},
+            ],
+        },
+    ],
+)
+
+# The response may have two choices: one with text, one with audio
+for choice in response.choices:
+    if choice.message.audio:
+        wav_bytes = base64.b64decode(choice.message.audio.data)
+```
+
+### Text-to-Vision (T2V)
+
+**Requirements:**
+- `modalities: ["text", "image"]`
+- Text-only user message (no audio input)
+- System prompt included
+
+The thinker generates 729 discrete vision codes (`<|vision00000|>` … `<|vision65535|>`,
+27×27 TA-Tok tokens), which are routed to the vision decoder (diffusion, 50 steps by
+default). The image is returned in `choices[N].message.content` as an
+`image_url` item with a `data:image/png;base64,...` URL.
+
+```python
+response = client.chat.completions.with_raw_response.create(
+    model=MODEL,
+    modalities=["text", "image"],
+    messages=[
+        SYSTEM_PROMPT,
+        {
+            "role": "user",
+            "content": [{"type": "text", "text": "귀여운 강아지가 공원에서 뛰노는 그림을 그려줘."}],
+        },
+    ],
+)
+
+# Parse the raw JSON body (via .with_raw_response) to access image_url content
+import json
+raw = json.loads(response.text)
+for choice in raw["choices"]:
+    content = choice["message"].get("content", [])
+    if isinstance(content, list):
+        for item in content:
+            if item.get("type") == "image_url":
+                data_url = item["image_url"]["url"]   # "data:image/png;base64,..."
+                img_bytes = base64.b64decode(data_url.split(",", 1)[1])
+```
+
+### Text-to-Text (T2T)
+
+No special requirements. The thinker responds in text only.
+
+```python
+response = client.chat.completions.create(
+    model=MODEL,
+    modalities=["text"],
+    messages=[{"role": "user", "content": "대한민국의 수도는 어디인가요?"}],
+)
+print(response.choices[0].message.content)
+```
+
+## Response Structure
+
+| Mode | `choices[i].message` field | Content |
+|------|--------------------------|---------|
+| T2T  | `content` (str)          | Text response |
+| S2S  | `content` (str)          | Text transcript |
+| S2S  | `audio.data` (str)       | base64 WAV |
+| T2V  | `content` (list)         | `[{"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}]` |
+
+> **Note:** S2S responses typically contain two `choices` entries — one with
+> the text and one with the audio. Iterate over all choices to collect both.
+
+## Stage Config
+
+The default stage config is at
+`vllm_omni/model_executor/stage_configs/hcx_omni.yaml`.
+
+Key parameters:
+
+| Stage | Type      | `model_arch` / `model_class_name`  | GPU   |
+|-------|-----------|------------------------------------|-------|
+| 0     | LLM       | `HCXVisionV2ForCausalLM`           | 0-3   |
+| 1     | Diffusion | `HyperCLOVAXVisionPipeline`        | 4     |
+| 2     | Diffusion | `HyperCLOVAXAudioPipeline`         | 5     |
+
+## Benchmarks
+
+See [`benchmarks/hcx-omni/`](../../../benchmarks/hcx-omni/) for latency and
+throughput measurement scripts.
diff --git a/examples/online_serving/hcx_omni/client_demo.py b/examples/online_serving/hcx_omni/client_demo.py
new file mode 100644
index 00000000000..bdc2ac58a4d
--- /dev/null
+++ b/examples/online_serving/hcx_omni/client_demo.py
@@ -0,0 +1,153 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""HyperCLOVAX-SEED-Omni-8B client demo.
+
+Demonstrates text-only chat, Speech-to-Speech and Text-to-Vision via the
+OpenAI-compatible HTTP API provided by vLLM-Omni.
+
+Usage:
+    # Start the server first (see run_server.sh), then:
+    python client_demo.py --base-url http://localhost:8000/v1
+
+    # With a local audio file:
+    python client_demo.py --audio-file path/to/speech.wav
+
+    # Text-to-Vision only:
+    python client_demo.py --mode t2v --prompt "고양이 그림을 그려줘"
+"""
+
+import argparse
+import base64
+import io
+import sys
+from pathlib import Path
+
+try:
+    from openai import OpenAI
+except ImportError:
+    print("Please install openai: pip install openai")
+    sys.exit(1)
+
+
+def encode_audio_file(path: str) -> str:
+    """Read the audio file at *path* and return its bytes base64-encoded as text."""
+    raw_bytes = Path(path).read_bytes()
+    return base64.b64encode(raw_bytes).decode()
+
+
+def encode_audio_array(array, sample_rate: int = 16000) -> str:
+    """Base64-encode a float audio array (nominal range [-1, 1]) as a 16-bit PCM WAV."""
+    import numpy as np
+    import scipy.io.wavfile as wav
+
+    # Clip first: samples outside [-1, 1] would otherwise overflow in the int16 cast.
+    pcm = (np.clip(np.asarray(array), -1.0, 1.0) * 32767).astype(np.int16)
+    buf = io.BytesIO()
+    wav.write(buf, sample_rate, pcm)
+    return base64.b64encode(buf.getvalue()).decode()
+
+
+def speech_to_speech(client: OpenAI, audio_b64: str, prompt: str = "이 오디오에 무슨 내용이 있나요?"):
+    """Send audio → receive text + audio, scanning every returned choice."""
+    print(f"\n[Speech-to-Speech] prompt: {prompt!r}")
+    response = client.chat.completions.create(
+        model="naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B",
+        modalities=["text", "audio"],
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "input_audio",
+                        "input_audio": {"data": audio_b64, "format": "wav"},
+                    },
+                    {"type": "text", "text": prompt},
+                ],
+            }
+        ],
+    )
+    out_path = Path("/tmp/hcx_omni_response.wav")
+    for choice in response.choices:  # README: text and audio usually arrive in separate choices
+        if choice.message.content:
+            print(f"Text response: {choice.message.content}")
+        if getattr(choice.message, "audio", None):
+            out_path.write_bytes(base64.b64decode(choice.message.audio.data))
+            print(f"Audio saved to: {out_path}")
+    return response
+
+
+def text_to_vision(client: OpenAI, prompt: str = "귀여운 강아지 한 마리가 공원에서 뛰노는 그림을 그려줘."):
+    """Send text → receive text + image; the image is read from ``image_url`` content items."""
+    print(f"\n[Text-to-Vision] prompt: {prompt!r}")
+    response = client.chat.completions.create(
+        model="naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B",
+        modalities=["text", "image"],
+        messages=[{"role": "user", "content": [{"type": "text", "text": prompt}]}],
+    )
+    out_path = Path("/tmp/hcx_omni_generated.png")
+    # Per the README, T2V returns `content` as a list of image_url items holding a
+    # base64 data URL; scan every choice since text and image may be split across them.
+    for choice in response.choices:
+        content = choice.message.content
+        if isinstance(content, str):
+            print(f"Text response: {content}")
+            continue
+        for item in content or []:
+            if item.get("type") == "image_url":
+                out_path.write_bytes(base64.b64decode(item["image_url"]["url"].split(",", 1)[1]))
+                print(f"Image saved to: {out_path}")
+    return response
+
+
+def text_only(client: OpenAI, prompt: str = "대한민국의 수도는 어디인가요?"):
+    """Text-in, text-out chat turn (thinker only); prints and returns the response."""
+    print(f"\n[Text-only] prompt: {prompt!r}")
+    chat = client.chat.completions
+    user_turn = {"role": "user", "content": prompt}
+    model_id = "naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B"
+    response = chat.create(model=model_id, modalities=["text"], messages=[user_turn])
+    answer = response.choices[0].message.content
+    print(f"Response: {answer}")
+    return response
+
+
+def main():
+    """Parse CLI flags and run the selected demo mode(s) against the server."""
+    parser = argparse.ArgumentParser(description="HyperCLOVAX-SEED-Omni-8B demo")
+    parser.add_argument("--base-url", default="http://localhost:8000/v1")
+    parser.add_argument(
+        "--mode",
+        choices=["s2s", "t2v", "text", "all"],
+        default="all",
+        help="Demo mode: s2s=Speech-to-Speech, t2v=Text-to-Vision, text=Text-only, all=run every mode",
+    )
+    parser.add_argument("--audio-file", default=None, help="Path to input audio file")
+    parser.add_argument("--prompt", default=None, help="Text prompt override")
+    args = parser.parse_args()
+    client = OpenAI(api_key="EMPTY", base_url=args.base_url)
+
+    if args.mode in ("text", "all"):
+        text_only(client, prompt=args.prompt or "대한민국의 수도는 어디인가요?")
+
+    if args.mode in ("t2v", "all"):
+        text_to_vision(client, prompt=args.prompt or "귀여운 강아지 한 마리가 공원에서 뛰노는 그림을 그려줘.")
+
+    if args.mode in ("s2s", "all"):
+        if args.audio_file:
+            audio_b64 = encode_audio_file(args.audio_file)
+        else:
+            # Generate synthetic 1-second sine wave
+            try:
+                import numpy as np
+
+                t = np.linspace(0, 1, 16000, endpoint=False)
+                audio_array = np.sin(2 * np.pi * 440 * t).astype(np.float32)
+                audio_b64 = encode_audio_array(audio_array)
+            except ImportError:  # encode_audio_array imports scipy, so either package may be missing
+                print("numpy/scipy not available, skipping S2S demo")
+                return
+        speech_to_speech(client, audio_b64, prompt=args.prompt or "이 오디오에 무슨 내용이 있나요?")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/online_serving/hcx_omni/run_server.sh b/examples/online_serving/hcx_omni/run_server.sh
new file mode 100755
index 00000000000..c3cbafba4b2
--- /dev/null
+++ b/examples/online_serving/hcx_omni/run_server.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+# Launch HyperCLOVAX-SEED-Omni-8B with vLLM-Omni.
+#
+# Requirements:
+#   - 6× GPUs (≥24 GB VRAM each):
+#       GPU 0-3: Thinker (tensor_parallel_size=4)
+#       GPU 4  : Vision decoder
+#       GPU 5  : Audio decoder
+#   - HF model: naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B
+#
+# Usage:
+#   ./run_server.sh [--model MODEL] [--port PORT] [--host HOST] [--stage-configs-path PATH]
+
+set -e
+
+MODEL="${MODEL:-naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B}"
+PORT="${PORT:-8000}"
+HOST="${HOST:-0.0.0.0}"
+STAGE_CONFIG="${STAGE_CONFIG:-}"
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+DEFAULT_STAGE_CONFIG="$SCRIPT_DIR/../../../vllm_omni/model_executor/stage_configs/hcx_omni.yaml"
+
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --model)         MODEL="$2";       shift 2 ;;
+        --port)          PORT="$2";        shift 2 ;;
+        --host)          HOST="$2";        shift 2 ;;
+        --stage-configs-path) STAGE_CONFIG="$2"; shift 2 ;;
+        --help)
+            echo "Usage: $0 [--model MODEL] [--port PORT] [--host HOST] [--stage-configs-path PATH]"
+            exit 0 ;;
+        *) echo "Unknown: $1"; exit 1 ;;
+    esac
+done
+
+[[ -z "$STAGE_CONFIG" ]] && STAGE_CONFIG="$DEFAULT_STAGE_CONFIG"
+
+echo "================================================="
+echo " HyperCLOVAX-SEED-Omni-8B  vLLM-Omni Server"
+echo "================================================="
+echo " Model       : $MODEL"
+echo " Stage config: $STAGE_CONFIG"
+echo " Endpoint    : http://$HOST:$PORT/v1"
+echo "================================================="
+
+python -m vllm_omni.entrypoints.openai.api_server \
+    --model "$MODEL" \
+    --stage-configs-path "$STAGE_CONFIG" \
+    --port "$PORT" \
+    --host "$HOST" \
+    --trust-remote-code
diff --git a/pyproject.toml b/pyproject.toml
index 9b034a7c8e9..c11d08b4e83 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -237,3 +237,4 @@ ue = "ue"
 semantics = "semantics"
 fullset = "fullset"
 Vai = "Vai"
+nd = "nd"
diff --git a/tests/e2e/offline_inference/test_hcx_omni.py b/tests/e2e/offline_inference/test_hcx_omni.py
new file mode 100644
index 00000000000..565a6a0f4e7
--- /dev/null
+++ b/tests/e2e/offline_inference/test_hcx_omni.py
@@ -0,0 +1,169 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""E2E tests for HyperCLOVAX-SEED-Omni-8B.
+
+Tests cover:
+  - Text-only inference (comprehension)
+  - Speech-to-Speech  (audio input → audio output)
+  - Text-to-Vision    (text input  → image output)
+  - Audio-to-Vision   (audio input → image + audio output)
+"""
+
+from pathlib import Path
+
+import pytest
+
+from tests.conftest import (
+    generate_synthetic_audio,
+    generate_synthetic_image,
+    modify_stage_config,
+)
+from tests.utils import hardware_test
+
+MODEL = "naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B"
+
+_CI_YAML = str(Path(__file__).parent.parent / "stage_configs" / "hcx_omni_ci.yaml")
+
+
+def _ci_config(enforce_eager: bool = True) -> str:
+    updates: dict = {
+        "stage_args": {
+            0: {"engine_args.enforce_eager": str(enforce_eager).lower()},
+        }
+    }
+    return modify_stage_config(_CI_YAML, updates=updates)
+
+
+stage_config = _ci_config(enforce_eager=True)
+test_params = [(MODEL, stage_config)]
+
+
+# ------------------------------------------------------------------ #
+# Helper                                                               #
+# ------------------------------------------------------------------ #
+
+
+def _text_prompt(text: str) -> dict:
+    return {
+        "role": "user",
+        "content": [{"type": "text", "text": text}],
+    }
+
+
+def _audio_text_prompt(audio_array, text: str) -> dict:
+    return {
+        "role": "user",
+        "content": [
+            {"type": "input_audio", "input_audio": {"data": audio_array, "format": "wav"}},
+            {"type": "text", "text": text},
+        ],
+    }
+
+
+def _image_text_prompt(image_array, text: str) -> dict:
+    return {
+        "role": "user",
+        "content": [
+            {"type": "image_url", "image_url": {"url": image_array}},
+            {"type": "text", "text": text},
+        ],
+    }
+
+
+# ------------------------------------------------------------------ #
+# Tests                                                                #
+# ------------------------------------------------------------------ #
+
+
+@pytest.mark.advanced_model
+@pytest.mark.omni
+@hardware_test(
+    res={"cuda": "L4"},
+    num_cards={"cuda": 3},
+)
+@pytest.mark.parametrize("omni_runner", test_params, indirect=True)
+def test_text_to_text(omni_runner, omni_runner_handler) -> None:
+    """Text-only (comprehension) request — verifies thinker stage alone."""
+    request_config = {
+        "prompts": "What is the capital of South Korea?",
+        "output_modalities": ["text"],
+    }
+    results = omni_runner.run(request_config)
+    assert results and len(results) > 0
+
+
+@pytest.mark.advanced_model
+@pytest.mark.omni
+@hardware_test(
+    res={"cuda": "L4"},
+    num_cards={"cuda": 3},
+)
+@pytest.mark.parametrize("omni_runner", test_params, indirect=True)
+def test_audio_to_audio(omni_runner, omni_runner_handler) -> None:
+    """Speech-to-Speech: audio input processed by thinker → audio decoder."""
+    audio = generate_synthetic_audio(1, 1, 16000)["np_array"]
+    if len(audio.shape) == 2:
+        audio = audio.squeeze()
+
+    request_config = {
+        "prompts": _audio_text_prompt(audio, "Repeat what you heard."),
+        "output_modalities": ["text", "audio"],
+    }
+    results = omni_runner.run(request_config)
+    assert results and len(results) > 0
+
+
+@pytest.mark.advanced_model
+@pytest.mark.omni
+@hardware_test(
+    res={"cuda": "L4"},
+    num_cards={"cuda": 3},
+)
+@pytest.mark.parametrize("omni_runner", test_params, indirect=True)
+def test_text_to_image(omni_runner, omni_runner_handler) -> None:
+    """Text-to-Vision: text prompt → image generated by vision decoder."""
+    request_config = {
+        "prompts": "Draw a picture of a cat sitting on a sofa.",
+        "output_modalities": ["text", "image"],
+    }
+    results = omni_runner.run(request_config)
+    assert results and len(results) > 0
+
+
+@pytest.mark.advanced_model
+@pytest.mark.omni
+@hardware_test(
+    res={"cuda": "L4"},
+    num_cards={"cuda": 3},
+)
+@pytest.mark.parametrize("omni_runner", test_params, indirect=True)
+def test_image_to_text(omni_runner, omni_runner_handler) -> None:
+    """Image understanding: image input → text description."""
+    image = generate_synthetic_image(224, 224)["np_array"]
+    request_config = {
+        "prompts": _image_text_prompt(image, "Describe this image."),
+        "output_modalities": ["text"],
+    }
+    results = omni_runner.run(request_config)
+    assert results and len(results) > 0
+
+
+@pytest.mark.advanced_model
+@pytest.mark.omni
+@hardware_test(
+    res={"cuda": "L4"},
+    num_cards={"cuda": 3},
+)
+@pytest.mark.parametrize("omni_runner", test_params, indirect=True)
+def test_multimodal_to_multimodal(omni_runner, omni_runner_handler) -> None:
+    """Full omni: audio input → text + audio + image output."""
+    audio = generate_synthetic_audio(1, 1, 16000)["np_array"]
+    if len(audio.shape) == 2:
+        audio = audio.squeeze()
+
+    request_config = {
+        "prompts": _audio_text_prompt(audio, "Listen to the audio and draw what you hear."),
+        "output_modalities": ["text", "audio", "image"],
+    }
+    results = omni_runner.run(request_config)
+    assert results and len(results) > 0
diff --git a/tests/e2e/online_serving/test_hcx_omni.py b/tests/e2e/online_serving/test_hcx_omni.py
new file mode 100644
index 00000000000..491c8371797
--- /dev/null
+++ b/tests/e2e/online_serving/test_hcx_omni.py
@@ -0,0 +1,125 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""E2E online serving tests for HyperCLOVAX-SEED-Omni-8B.
+
+Tests the OpenAI-compatible HTTP API for Speech-to-Speech and
+Text-to-Vision generation.
+"""
+
+import os
+from pathlib import Path
+
+import pytest
+
+from tests.conftest import (
+    OmniServerParams,
+    generate_synthetic_audio,
+    generate_synthetic_image,
+)
+from tests.utils import hardware_test
+
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0"
+
+MODEL = "naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B"
+_CI_YAML = str(Path(__file__).parent.parent / "stage_configs" / "hcx_omni_ci.yaml")
+
+test_params = [OmniServerParams(model=MODEL, stage_config_path=_CI_YAML)]
+
+SYSTEM_PROMPT = {
+    "role": "system",
+    "content": [
+        {
+            "type": "text",
+            "text": (
+                "당신은 CLOVA X입니다. 네이버가 만든 AI 어시스턴트로서 "
+                "오디오와 이미지를 인식하고 텍스트, 음성, 이미지를 생성할 수 있습니다."
+            ),
+        }
+    ],
+}
+
+
+@pytest.mark.advanced_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "L4"}, num_cards={"cuda": 3})
+@pytest.mark.parametrize("omni_server", test_params, indirect=True)
+def test_speech_to_speech(omni_server, omni_server_handler) -> None:
+    """Speech-to-Speech: audio input → text + audio response."""
+    audio = generate_synthetic_audio(1, 1, 16000)["np_array"]
+    if len(audio.shape) == 2:
+        audio = audio.squeeze()
+
+    messages = [
+        SYSTEM_PROMPT,
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "input_audio",
+                    "input_audio": {"data": audio, "format": "wav"},
+                },
+                {"type": "text", "text": "이 오디오에서 무슨 내용이 들리나요?"},
+            ],
+        },
+    ]
+    request_config = {
+        "messages": messages,
+        "modalities": ["text", "audio"],
+        "stream": False,
+    }
+    response = omni_server.chat(request_config)
+    assert response is not None
+
+
+@pytest.mark.advanced_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "L4"}, num_cards={"cuda": 3})
+@pytest.mark.parametrize("omni_server", test_params, indirect=True)
+def test_text_to_vision(omni_server, omni_server_handler) -> None:
+    """Text-to-Vision: text prompt → text + image response."""
+    messages = [
+        SYSTEM_PROMPT,
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "고양이 한 마리가 소파에 앉아 있는 그림을 그려줘."},
+            ],
+        },
+    ]
+    request_config = {
+        "messages": messages,
+        "modalities": ["text", "image"],
+        "stream": False,
+    }
+    response = omni_server.chat(request_config)
+    assert response is not None
+
+
+@pytest.mark.advanced_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "L4"}, num_cards={"cuda": 3})
+@pytest.mark.parametrize("omni_server", test_params, indirect=True)
+def test_image_understanding(omni_server, omni_server_handler) -> None:
+    """Image understanding: base64 image data URL input → text description."""
+    image_b64 = generate_synthetic_image(224, 224)["base64"]  # NOTE(review): assumes a "base64" entry — confirm
+    messages = [
+        SYSTEM_PROMPT,
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/png;base64,{image_b64}"},
+                },
+                {"type": "text", "text": "이 이미지에 무엇이 있나요?"},
+            ],
+        },
+    ]
+    request_config = {
+        "messages": messages,
+        "modalities": ["text"],
+        "stream": False,
+    }
+    response = omni_server.chat(request_config)
+    assert response is not None
diff --git a/tests/e2e/stage_configs/hcx_omni_ci.yaml b/tests/e2e/stage_configs/hcx_omni_ci.yaml
new file mode 100644
index 00000000000..f455422689e
--- /dev/null
+++ b/tests/e2e/stage_configs/hcx_omni_ci.yaml
@@ -0,0 +1,93 @@
+# Stage config for HyperCLOVAX-SEED-Omni-8B CI tests.
+# Verified on 3x 24GB GPU (L4/RTX3090/RTX4090).
+# Stage 0 (thinker): the default config uses tensor_parallel_size=4; CI runs it on a single GPU (TP=1).
+
+runtime:
+  connectors:
+    shared_memory_connector:
+      extra:
+        shm_threshold_bytes: 65536
+      name: SharedMemoryConnector
+  defaults:
+    max_inflight: 1
+    window_size: -1
+  edges:
+    - { from: 0, to: 1, window_size: -1 }
+    - { from: 0, to: 2, window_size: -1 }
+  enabled: true
+
+stage_args:
+  - stage_id: 0
+    stage_type: llm
+    runtime:
+      process: true
+      devices: "0"
+    engine_args:
+      model_stage: thinker
+      model_arch: HCXVisionV2ForCausalLM
+      worker_type: ar
+      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
+      tensor_parallel_size: 1
+      max_model_len: 4096
+      max_num_batched_tokens: 4096
+      max_num_seqs: 1
+      gpu_memory_utilization: 0.8
+      enforce_eager: true
+      trust_remote_code: true
+      engine_output_type: latent
+      enable_prefix_caching: false
+      limit_mm_per_prompt:
+        audio: 1
+        image: 1
+      load_format: dummy
+    is_comprehension: true
+    final_output: true
+    final_output_type: text
+    default_sampling_params:
+      temperature: 0.1
+      top_p: 1.0
+      top_k: -1
+      max_tokens: 128
+      seed: 42
+      detokenize: true
+      repetition_penalty: 1.0
+
+  - stage_id: 1
+    stage_type: diffusion
+    runtime:
+      process: true
+      devices: "1"
+      max_batch_size: 1
+    engine_args:
+      engine_output_type: image
+      gpu_memory_utilization: 0.75
+      model_class_name: HyperCLOVAXVisionPipeline
+      model_stage: decoder/vision
+      model_subdir: decoder/vision
+      trust_remote_code: true
+      enforce_eager: true
+    engine_input_source:
+      - 0
+    final_output: true
+    final_output_type: image
+    custom_process_input_func: vllm_omni.model_executor.stage_input_processors.hyperclovax_seed_omni.thinker2vision_decoder
+
+  - stage_id: 2
+    stage_type: diffusion
+    runtime:
+      process: true
+      devices: "2"
+      max_batch_size: 1
+    engine_args:
+      engine_output_type: audio
+      gpu_memory_utilization: 0.4
+      model_class_name: HyperCLOVAXAudioPipeline
+      model_stage: decoder/audio
+      model_subdir: decoder/audio/NCZSCosybigvganDecoder.mar
+      trust_remote_code: true
+      enforce_eager: true
+    engine_input_source:
+      - 0
+    final_output: true
+    final_output_type: audio
+    custom_process_input_func: vllm_omni.model_executor.stage_input_processors.hyperclovax_seed_omni.thinker2audio_decoder
diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py
new file mode 100644
index 00000000000..42656022e64
--- /dev/null
+++ b/tests/unit/conftest.py
@@ -0,0 +1,19 @@
+"""conftest.py for unit tests — stubs out heavy vllm_omni init."""
+
+import sys
+import types
+
+# Provide a lightweight stub for vllm_omni so that submodule imports
+# (e.g. vllm_omni.model_executor.stage_input_processors) don't trigger the
+# full package __init__.py which requires a complete vLLM installation.
+_stub = types.ModuleType("vllm_omni")
+_stub.__path__ = []  # NOTE(review): with an empty __path__, `vllm_omni.<sub>` resolves only if pre-registered in sys.modules — confirm
+_stub.__spec__ = None
+sys.modules.setdefault("vllm_omni", _stub)  # setdefault: an already-imported vllm_omni is left untouched
+
+# Stub out vllm_omni.inputs.data.OmniTokensPrompt
+_inputs = types.ModuleType("vllm_omni.inputs")
+_inputs_data = types.ModuleType("vllm_omni.inputs.data")
+_inputs_data.OmniTokensPrompt = dict  # type: ignore[attr-defined]
+sys.modules.setdefault("vllm_omni.inputs", _inputs)
+sys.modules.setdefault("vllm_omni.inputs.data", _inputs_data)
diff --git a/tests/unit/model_executor/__init__.py b/tests/unit/model_executor/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tests/unit/model_executor/test_hcx_omni_processing.py b/tests/unit/model_executor/test_hcx_omni_processing.py
new file mode 100644
index 00000000000..71b544a070e
--- /dev/null
+++ b/tests/unit/model_executor/test_hcx_omni_processing.py
@@ -0,0 +1,187 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Unit tests for HCXOmni multimodal token processing.
+
+Tests verify that:
+  1. Discrete audio and image token ID ranges do not overlap.
+  2. The per-image discrete token count is a perfect square (27 × 27 = 729).
+  3. Discrete audio/image token boundaries match config.json values.
+  4. Stage input processors correctly extract discrete tokens from mixed output.
+"""
+
+# Token ID boundaries (from HyperCLOVAX-SEED-Omni-8B config.json)
+DISCRETE_AUDIO_UNIT_0_ID = 128606
+DISCRETE_IMAGE_UNIT_0_ID = 135168
+DISCRETE_AUDIO_VOCAB_SIZE = 6561
+DISCRETE_IMAGE_VOCAB_SIZE = 65536
+DISCRETE_IMAGE_TOKEN_LENGTH = 729  # 27 * 27
+
+
+class TestDiscreteTokenBoundaries:
+    """Verify token ID arithmetic matches config.json."""
+
+    def test_audio_range(self):
+        assert DISCRETE_AUDIO_UNIT_0_ID == 128606
+        assert DISCRETE_AUDIO_UNIT_0_ID + DISCRETE_AUDIO_VOCAB_SIZE - 1 == 135166
+
+    def test_image_range(self):
+        assert DISCRETE_IMAGE_UNIT_0_ID == 135168
+        # Image codebook is 2^16 = 65536
+        assert DISCRETE_IMAGE_VOCAB_SIZE == 65536
+
+    def test_no_overlap(self):
+        audio_end = DISCRETE_AUDIO_UNIT_0_ID + DISCRETE_AUDIO_VOCAB_SIZE
+        assert audio_end < DISCRETE_IMAGE_UNIT_0_ID, "Audio and image token ranges must not overlap"
+
+    def test_image_token_count_is_square(self):
+        """TA-Tok produces 27×27 = 729 tokens per image."""
+        import math
+
+        side = math.isqrt(DISCRETE_IMAGE_TOKEN_LENGTH)
+        assert side * side == DISCRETE_IMAGE_TOKEN_LENGTH
+
+
+class TestExtractDiscreteTokens:
+    """Test discrete-token extraction via the local ``_extract`` re-implementation (no production helper is imported)."""
+
+    def _extract(self, token_ids, start_id, vocab_size):
+        return [tid - start_id for tid in token_ids if start_id <= tid < start_id + vocab_size]
+
+    def test_extract_audio_tokens(self):
+        token_ids = [
+            100,
+            200,  # text
+            DISCRETE_AUDIO_UNIT_0_ID,
+            DISCRETE_AUDIO_UNIT_0_ID + 42,
+            DISCRETE_AUDIO_UNIT_0_ID + 100,
+            300,  # text
+        ]
+        result = self._extract(token_ids, DISCRETE_AUDIO_UNIT_0_ID, DISCRETE_AUDIO_VOCAB_SIZE)
+        assert result == [0, 42, 100]
+
+    def test_extract_image_tokens(self):
+        token_ids = [
+            100,
+            DISCRETE_IMAGE_UNIT_0_ID,
+            DISCRETE_IMAGE_UNIT_0_ID + 255,
+            200,
+        ]
+        result = self._extract(token_ids, DISCRETE_IMAGE_UNIT_0_ID, DISCRETE_IMAGE_VOCAB_SIZE)
+        assert result == [0, 255]
+
+    def test_no_overlap_extraction(self):
+        """Audio extraction must not pick up image tokens and vice versa."""
+        mixed = [
+            DISCRETE_AUDIO_UNIT_0_ID + 5,
+            DISCRETE_IMAGE_UNIT_0_ID + 5,
+        ]
+        audio = self._extract(mixed, DISCRETE_AUDIO_UNIT_0_ID, DISCRETE_AUDIO_VOCAB_SIZE)
+        image = self._extract(mixed, DISCRETE_IMAGE_UNIT_0_ID, DISCRETE_IMAGE_VOCAB_SIZE)
+        assert audio == [5]
+        assert image == [5]
+
+    def test_truncate_and_pad_image(self):
+        """Check list truncation/zero-padding to DISCRETE_IMAGE_TOKEN_LENGTH (pure list arithmetic, no decoder call)."""
+        codes = list(range(DISCRETE_IMAGE_TOKEN_LENGTH + 50))  # too long
+        truncated = codes[:DISCRETE_IMAGE_TOKEN_LENGTH]
+        assert len(truncated) == DISCRETE_IMAGE_TOKEN_LENGTH
+
+        codes_short = list(range(100))  # too short
+        padded = codes_short + [0] * (DISCRETE_IMAGE_TOKEN_LENGTH - len(codes_short))
+        assert len(padded) == DISCRETE_IMAGE_TOKEN_LENGTH
+
+
+class TestStageInputProcessor:
+    """Test thinker2vision_decoder and thinker2audio_decoder processors."""
+
+    def _make_fake_output(self, token_ids: list[int]):
+        """Create a minimal fake EngineCoreOutput-like object."""
+        from types import SimpleNamespace
+
+        output = SimpleNamespace(
+            token_ids=token_ids,
+        )
+        thinker_out = SimpleNamespace(
+            outputs=[output],
+            request_id="test-001",
+            prompt_token_ids=[1, 2, 3],
+        )
+        return thinker_out
+
+    def test_vision_decoder_extracts_image_tokens(self):
+        """thinker2vision_decoder should extract exactly 729 image tokens."""
+        image_codes = list(range(DISCRETE_IMAGE_UNIT_0_ID, DISCRETE_IMAGE_UNIT_0_ID + DISCRETE_IMAGE_TOKEN_LENGTH))
+        audio_codes = list(range(DISCRETE_AUDIO_UNIT_0_ID, DISCRETE_AUDIO_UNIT_0_ID + 20))
+        token_ids = [100, 200] + audio_codes + image_codes + [300]
+
+        thinker_out = self._make_fake_output(token_ids)
+
+        from types import SimpleNamespace
+
+        stage_list = {0: SimpleNamespace(engine_outputs=[thinker_out])}
+
+        from vllm_omni.model_executor.stage_input_processors.hyperclovax_seed_omni import (
+            thinker2vision_decoder,
+        )
+
+        results = thinker2vision_decoder(stage_list, [0])
+        assert len(results) == 1
+        prompt_ids = results[0]["prompt_token_ids"]
+        assert len(prompt_ids) == DISCRETE_IMAGE_TOKEN_LENGTH
+        assert all(0 <= tid < DISCRETE_IMAGE_VOCAB_SIZE for tid in prompt_ids)
+
+    def test_audio_decoder_extracts_audio_tokens(self):
+        """thinker2audio_decoder should extract discrete audio tokens."""
+        audio_codes = list(range(DISCRETE_AUDIO_UNIT_0_ID, DISCRETE_AUDIO_UNIT_0_ID + 50))
+        token_ids = [100, 200] + audio_codes + [300]
+
+        thinker_out = self._make_fake_output(token_ids)
+
+        from types import SimpleNamespace
+
+        stage_list = {0: SimpleNamespace(engine_outputs=[thinker_out])}
+
+        from vllm_omni.model_executor.stage_input_processors.hyperclovax_seed_omni import (
+            thinker2audio_decoder,
+        )
+
+        results = thinker2audio_decoder(stage_list, [0])
+        assert len(results) == 1
+        additional = results[0]["additional_information"]
+        audio_tokens = additional["audio_tokens"][0]
+        assert len(audio_tokens) == 50
+        assert all(0 <= tid < DISCRETE_AUDIO_VOCAB_SIZE for tid in audio_tokens)
+
+    def test_vision_decoder_no_output_if_no_image_tokens(self):
+        """thinker2vision_decoder returns empty list when no image tokens present."""
+        token_ids = [100, 200, 300]  # text only
+
+        thinker_out = self._make_fake_output(token_ids)
+
+        from types import SimpleNamespace
+
+        stage_list = {0: SimpleNamespace(engine_outputs=[thinker_out])}
+
+        from vllm_omni.model_executor.stage_input_processors.hyperclovax_seed_omni import (
+            thinker2vision_decoder,
+        )
+
+        results = thinker2vision_decoder(stage_list, [0])
+        assert results == []
+
+    def test_audio_decoder_no_output_if_no_audio_tokens(self):
+        """thinker2audio_decoder returns empty list when no audio tokens present."""
+        token_ids = [100, 200, 300]  # text only
+
+        thinker_out = self._make_fake_output(token_ids)
+
+        from types import SimpleNamespace
+
+        stage_list = {0: SimpleNamespace(engine_outputs=[thinker_out])}
+
+        from vllm_omni.model_executor.stage_input_processors.hyperclovax_seed_omni import (
+            thinker2audio_decoder,
+        )
+
+        results = thinker2audio_decoder(stage_list, [0])
+        assert results == []
diff --git a/tools/pre_commit/check_pickle_imports.py b/tools/pre_commit/check_pickle_imports.py
index 1c08a1543d2..89384686bb7 100644
--- a/tools/pre_commit/check_pickle_imports.py
+++ b/tools/pre_commit/check_pickle_imports.py
@@ -20,6 +20,8 @@
     "tests/utils.py",
     "vllm_omni/diffusion/distributed/group_coordinator.py",
     "tests/diffusion/attention/test_attention_sp.py",
+    # cloudpickle needed to serialize arbitrary worker_cls types across processes
+    "vllm_omni/entrypoints/omni_llm.py",
 }
 
 PICKLE_RE = re.compile(
diff --git a/vllm_omni/config/model.py b/vllm_omni/config/model.py
index 588efabfc4f..f9e49abc678 100644
--- a/vllm_omni/config/model.py
+++ b/vllm_omni/config/model.py
@@ -1,13 +1,61 @@
-from dataclasses import MISSING, field
+import warnings
+from importlib.util import find_spec
 from typing import Any
 
-from pydantic import ConfigDict, TypeAdapter
-from vllm.config import ModelConfig
-from vllm.config.utils import config
+import torch
+import vllm.envs as envs
+from pydantic import ConfigDict
+from pydantic.dataclasses import dataclass
+
+try:
+    from vllm.attention.backends.registry import AttentionBackendEnum
+except ImportError:
+    from vllm.v1.attention.backends.registry import AttentionBackendEnum
+from vllm.config import ModelConfig, config
+
+try:
+    from vllm.config.model import (
+        _RUNNER_CONVERTS,
+        _RUNNER_TASKS,
+        ConvertOption,
+        ConvertType,
+        RunnerOption,
+        TaskOption,
+        _get_and_verify_dtype,
+        get_served_model_name,
+    )
+except ImportError:
+    # vLLM 0.18.0: _RUNNER_TASKS and TaskOption were removed/renamed
+    _RUNNER_TASKS: dict = {
+        "generate": {"generate", "auto"},
+        "pooling": {"embed", "classify", "reward", "score"},
+    }
+    from vllm.config.model import (  # type: ignore[no-redef]
+        _RUNNER_CONVERTS,
+        ConvertOption,
+        ConvertType,
+        RunnerOption,
+        _get_and_verify_dtype,
+        get_served_model_name,
+    )
+
+    TaskOption = str  # type: ignore[misc,assignment]
+from vllm.config.multimodal import MMCacheType, MMEncoderTPMode, MultiModalConfig
+from vllm.config.pooler import PoolerConfig
 from vllm.logger import init_logger
-from vllm.transformers_utils.config import get_hf_text_config
-from vllm.transformers_utils.model_arch_config_convertor import (
-    ModelArchConfigConvertorBase,
+from vllm.platforms import current_platform
+from vllm.transformers_utils.config import (
+    get_config,
+    get_hf_image_processor_config,
+    get_hf_text_config,
+    get_pooling_config,
+)
+from vllm.transformers_utils.gguf_utils import (
+    maybe_patch_hf_config_from_gguf,
+)
+from vllm.transformers_utils.utils import (
+    is_gguf,
+    maybe_model_redirect,
 )
 
 import vllm_omni.model_executor.models as me_models
@@ -15,103 +63,37 @@
 logger = init_logger(__name__)
 
 
-class OmniModelArchConfigConvertor(ModelArchConfigConvertorBase):
-    """Config convertor for Omni multi-stage models.
-
-    Pre-quantized checkpoints (e.g. modelopt FP8) store quantization
-    config in a stage-specific sub-config (e.g.
-    thinker_config.text_config.quantization_config) with correct relative
-    prefixes.  The legacy hf_quant_config.json sits at the top level with
-    "thinker."-prefixed names that don't match vllm-omni's module names.
-
-    This convertor accepts an optional *stage_config_name* so that only
-    the relevant stage's quantization config is surfaced.
-    """
-
-    def __init__(
-        self,
-        hf_config,
-        hf_text_config,
-        stage_config_name: str | None = None,
-    ):
-        super().__init__(hf_config, hf_text_config)
-        self.stage_config_name = stage_config_name
-
-    def get_quantization_config(self):
-        # When a stage_config_name is set, look for quantization config
-        # in that stage's text_config first (has correct relative prefixes).
-        if self.stage_config_name is not None:
-            stage_cfg = getattr(self.hf_config, self.stage_config_name, None)
-            if stage_cfg is not None:
-                text_cfg = getattr(stage_cfg, "text_config", None)
-                if text_cfg is not None:
-                    quant_cfg = self._normalize_quantization_config(text_cfg)
-                    if quant_cfg is not None:
-                        return quant_cfg
-
-            # For non-thinker stages (talker, code2wav) whose text_config
-            # has no quantization_config, return None so quantization is
-            # not applied to stages that were not quantized.
-            return None
-
-        return super().get_quantization_config()
-
-
-@config(config=ConfigDict(arbitrary_types_allowed=True))
+@config
+@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
 class OmniModelConfig(ModelConfig):
     """Configuration for Omni models, extending the base ModelConfig.
 
-     This configuration class extends the base vLLM ModelConfig with
-     omni-specific fields for multi-stage pipeline processing.
-
-     Attributes:
-         hf_config: The model's HF Transformers config (default: None)
-         hf_text_config: The sub text_config of the model's hf_config (default: None)
-         stage_id: Identifier for the stage in a multi-stage pipeline (default: 0)
-         async_chunk: If set to True, perform async chunk
-         model_stage: Stage type identifier, e.g., "thinker" or "talker"
-             (default: "thinker")
-         model_arch: Model architecture name
-             (default: "Qwen2_5OmniForConditionalGeneration")
-         worker_type: Model Type, e.g., "ar" or "generation"
-         engine_output_type: Optional output type specification for the engine.
-             Used to route outputs to appropriate processors (e.g., "image",
-             "audio", "latents"). If None, output type is inferred.
-         stage_connector_config: Stage connector configuration dictionary.
-             Contains "name" (connector name), "extra" (extra connector config).
-         task_type: Default task type for TTS models (CustomVoice, VoiceDesign, or Base).
-             If not specified, will be inferred from model path.
-
-
-    The correct way to initialize this class is via vLLM config, as most
-    of the logic for handling values is in the ModelConfig's __post_init__.
-
-       Example:
-         >>> config = OmniModelConfig.from_vllm_model_config(
-         ...     vllm_config,
-         ...     stage_id=0,
-         ...     model_stage="thinker",
-         ...     model_arch="Qwen2_5OmniForConditionalGeneration"
-         ... )
+    This configuration class extends the base vLLM ModelConfig with
+    omni-specific fields for multi-stage pipeline processing.
+
+    Attributes:
+        stage_id: Identifier for the stage in a multi-stage pipeline (default: 0)
+        model_stage: Stage type identifier, e.g., "thinker" or "talker"
+            (default: "thinker")
+        model_arch: Model architecture name
+            (default: "Qwen2_5OmniForConditionalGeneration")
+        engine_output_type: Optional output type specification for the engine.
+            Used to route outputs to appropriate processors (e.g., "image",
+            "audio", "latents"). If None, output type is inferred.
+
+    Example:
+        >>> config = OmniModelConfig(
+        ...     stage_id=0,
+        ...     model_stage="thinker",
+        ...     model_arch="Qwen2_5OmniForConditionalGeneration"
+        ... )
     """
 
     stage_id: int = 0
-    async_chunk: bool = False
     model_stage: str = "thinker"
-    model_arch: str | None = None
-    worker_type: str | None = None
+    model_arch: str = "Qwen2_5OmniForConditionalGeneration"
     engine_output_type: str | None = None
     hf_config_name: str | None = None
-    custom_process_next_stage_input_func: str | None = None
-    stage_connector_config: dict[str, Any] = field(
-        default_factory=lambda: {
-            "name": "SharedMemoryConnector",
-            "extra": {},
-        }
-    )
-    omni_kv_config: dict | None = None
-    codec_frame_rate_hz: float | None = None
-    task_type: str | None = None
 
     @property
     def registry(self):
@@ -119,32 +101,7 @@ def registry(self):
 
     @property
     def architectures(self) -> list[str]:
-        if self.model_arch is not None:
-            return [self.model_arch]
-        return super().architectures
-
-    @property
-    def embedding_size(self):
-        if self.hf_config_name is not None:
-            stage_config = getattr(self.hf_config, self.hf_config_name, None)
-            override = getattr(stage_config, "embedding_size", None)
-            if override is not None:
-                return override
-        return super().embedding_size
-
-    def get_model_arch_config(self):
-        # For multi-stage omni models, use a stage-aware convertor so that
-        # only the correct stage's quantization config is surfaced.
-        # Without this, a pre-quantized thinker checkpoint would also
-        # apply quantization to the talker/code2wav stages.
-        if self.hf_config_name is not None:
-            convertor = OmniModelArchConfigConvertor(
-                self.hf_config,
-                self.hf_text_config,
-                stage_config_name=self.hf_config_name,
-            )
-            return convertor.convert()
-        return super().get_model_arch_config()
+        return [self.model_arch]
 
     def draw_hf_text_config(self):
         # transformers' get_text_config method is used to get the text config from thinker_config.
@@ -164,113 +121,290 @@ def draw_hf_text_config(self):
             )
             return get_hf_text_config(self.hf_config)
 
-    def _patch_qwen3_tts(self):
-        """Patches the value of `position_id_per_seconds` in Qwen3's
-        TTS's talker_config into the this class's codec_frame_rate_hz.
-        """
-        talker_cfg = getattr(self.hf_config, "talker_config", None)
-        if isinstance(talker_cfg, dict):
-            pos_per_sec = talker_cfg.get("position_id_per_seconds")
+    def __post_init__(
+        self,
+        # Multimodal config init vars
+        limit_mm_per_prompt: dict[str, int | dict[str, int]] | None,
+        enable_mm_embeds: bool | None,
+        media_io_kwargs: dict[str, dict[str, Any]] | None,
+        mm_processor_kwargs: dict[str, Any] | None,
+        mm_processor_cache_gb: float | None,
+        mm_processor_cache_type: MMCacheType | None,
+        mm_shm_cache_max_object_size_mb: int | None,
+        mm_encoder_tp_mode: MMEncoderTPMode | None,
+        mm_encoder_attn_backend: AttentionBackendEnum | str | None,
+        interleave_mm_strings: bool | None,
+        skip_mm_profiling: bool | None,
+        video_pruning_rate: float | None,
+    ) -> None:
+        # Keep set served_model_name before maybe_model_redirect(self.model)
+        self.served_model_name = get_served_model_name(self.model, self.served_model_name)
+        self.model = maybe_model_redirect(self.model)
+        # The tokenizer is consistent with the model by default.
+        if self.tokenizer is None:
+            self.tokenizer = self.model
+        if self.tokenizer_revision is None:
+            self.tokenizer_revision = self.revision
+        self.tokenizer = maybe_model_redirect(self.tokenizer)
+
+        if isinstance(self.hf_config_path, str):
+            self.hf_config_path = maybe_model_redirect(self.hf_config_path)
+
+        if callable(self.hf_overrides):
+            hf_overrides_kw = {}
+            hf_overrides_fn = self.hf_overrides
+            dict_overrides: dict[str, Any] = {}
         else:
-            pos_per_sec = getattr(talker_cfg, "position_id_per_seconds", None)
-        if pos_per_sec is not None:
-            try:
-                fps = float(pos_per_sec)
-            except Exception:
-                fps = None
-            if fps is not None and fps > 0:
-                self.codec_frame_rate_hz = fps
-
-    def _maybe_override_text_config(self):
-        """Override hf_text_config with omni-specific logic for multi-stage
-        models (e.g., thinker_config, talker_config).
-        """
-        new_hf_text_config = self.draw_hf_text_config()
-        if new_hf_text_config is not self.hf_text_config:
-            self.hf_text_config = new_hf_text_config
-            # Recalculate dependent attributes
-            self.attention_chunk_size = getattr(self.hf_text_config, "attention_chunk_size", None)
-            # Recalculate max_model_len since it depends on hf_text_config
-            self.max_model_len = self.get_and_verify_max_len(self.original_max_model_len)
-            # Reset sliding_window if needed
-            if self.disable_sliding_window and self.hf_text_config is not None:
-                self.hf_text_config.sliding_window = None
-
-    @classmethod
-    def from_vllm_model_config(cls, model_config: ModelConfig, **omni_kwargs):
-        """Create OmniModelConfig from an existing vLLM ModelConfig
-        and additional Omni specific kwargs.
-
-        NOTE: The validation and __post_init__ for ModelConfig is expensive;
-        to avoid calling it a second time, we explicitly retrieve defaults
-        from dataclass attributes for values not passed to omni_kwargs,
-        and use that to initialize a __new__ instance. This is significantly
-        faster than creating the OmniModelConfig directly from the ModelConfig,
-        and saves us from having to pass all kwargs to the OmniModelConfig.
-        """
-        # Add missing defaults to the omni kwargs and ensure values are valid
-        cls.add_defaults_to_omni_kwargs(omni_kwargs)
-        cls._validate_omni_fields(**omni_kwargs)
-
-        # Allocate the new omni config and copy the model config & omni fields
-        omni_cfg = object.__new__(cls)
-        omni_cfg.__dict__.update(model_config.__dict__)
-        omni_cfg.__dict__.update(omni_kwargs)
-
-        # Apply any model specific patches or necessary overrides
-        if (
-            omni_cfg.codec_frame_rate_hz is None
-            and omni_cfg.model_arch == "Qwen3TTSTalkerForConditionalGenerationARVLLM"
-        ):
-            omni_cfg._patch_qwen3_tts()
-
-        omni_cfg._maybe_override_text_config()
-
-        if omni_cfg.hf_config is not None:
-            omni_cfg.hf_config.architectures = omni_cfg.architectures
-
-        return omni_cfg
-
-    @classmethod
-    def _validate_omni_fields(cls, **omni_kwargs):
-        """Validate omni-specific fields; we use TypeAdapters here to quickly
-        validate only omni kwargs to avoid rerunning validation on the
-        ModelConfig.
-
-        NOTE: This assumes add_defaults_to_omni_kwargs has already been called,
-        so that all omni fields are present in the provided omni_kwargs.
-        """
-        omni_fields = set(cls.__dataclass_fields__) - set(ModelConfig.__dataclass_fields__)
-
-        for key, value in omni_kwargs.items():
-            if key not in omni_fields:
-                raise ValueError(f"Unexpected omni kwarg: {key}")
-
-            field_type = cls.__dataclass_fields__[key].type
-            if field_type is not Any:
-                TypeAdapter(field_type).validate_python(value)
-
-        # We should not have any uninitialized keys
-        uninitialized_fields = omni_fields - omni_kwargs.keys()
-        if len(uninitialized_fields):
-            logger.error(f"The following OmniModelConfig keys were not initialized: {uninitialized_fields}")
-
-    @classmethod
-    def add_defaults_to_omni_kwargs(cls, omni_kwargs):
-        """Because we init the OmniModelConfig with __new__ to sidestep expensive
-        validation, we need to be careful to ensure fields with default factories
-        are initialized, otherwise we will get an AttributeError when we use it.
-
-        To work around this issue, we explicitly add defaults to the omni_kwargs
-        dict provided to ensure all fields are defined correctly.
-
-        NOTE: omni_kwargs are mutated in place.
-        """
-        omni_fields = set(cls.__dataclass_fields__) - set(ModelConfig.__dataclass_fields__)
-
-        for field_name in omni_fields - set(omni_kwargs.keys()):
-            field_def = cls.__dataclass_fields__[field_name]
-            if field_def.default_factory is not MISSING:
-                omni_kwargs[field_name] = field_def.default_factory()
-            elif field_def.default is not MISSING:
-                omni_kwargs[field_name] = field_def.default
+            # Separate dict overrides from flat ones
+            # We'll determine how to apply dict overrides after loading the config
+            hf_overrides_kw = {}
+            dict_overrides = {}
+            for key, value in self.hf_overrides.items():
+                if isinstance(value, dict):
+                    dict_overrides[key] = value
+                else:
+                    hf_overrides_kw[key] = value
+            hf_overrides_fn = None
+
+        self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer)
+
+        if (backend := envs.VLLM_ATTENTION_BACKEND) and backend == "FLASHINFER" and find_spec("flashinfer") is None:
+            raise ValueError(
+                "VLLM_ATTENTION_BACKEND is set to FLASHINFER, but flashinfer "
+                "module was not found. See "
+                "https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile "  # noqa: E501
+                "for instructions on how to install it."
+            )
+
+        if self.override_attention_dtype is not None and not current_platform.is_rocm():
+            warnings.warn(
+                "override-attention-dtype is set but not using ROCm platform",
+                stacklevel=2,
+            )
+
+        if self.enable_sleep_mode and not current_platform.is_sleep_mode_available():
+            raise ValueError("Sleep mode is not supported on current platform.")
+
+        hf_config = get_config(
+            self.hf_config_path or self.model,
+            self.trust_remote_code,
+            self.revision,
+            self.code_revision,
+            self.config_format,
+            hf_overrides_kw=hf_overrides_kw,
+            hf_overrides_fn=hf_overrides_fn,
+        )
+        hf_config = maybe_patch_hf_config_from_gguf(
+            self.model,
+            hf_config,
+        )
+
+        self.hf_config = hf_config
+        if dict_overrides:
+            self._apply_dict_overrides(hf_config, dict_overrides)
+        self.hf_text_config = self.draw_hf_text_config()
+        self.attention_chunk_size = getattr(self.hf_text_config, "attention_chunk_size", None)
+        self.encoder_config = self._get_encoder_config()
+        # Try to load image processor config, but allow it to fail for stages that don't need it
+        try:
+            self.hf_image_processor_config = get_hf_image_processor_config(
+                self.model, hf_token=self.hf_token, revision=self.revision
+            )
+        except (OSError, ValueError, IndexError) as e:
+            # Some stages (e.g., code2wav, talker) don't need image processor
+            # Log warning but allow initialization to continue
+            logger.warning(
+                f"Failed to load image processor config for model '{self.model}': {e}. "
+                "This is expected for stages that don't require image processing."
+            )
+            self.hf_image_processor_config = None
+
+        architectures = self.architectures
+        registry = self.registry
+        is_generative_model = registry.is_text_generation_model(architectures, self)
+        is_pooling_model = registry.is_pooling_model(architectures, self)
+
+        def _task_to_convert(task: TaskOption) -> ConvertType:
+            """Map a deprecated ``task`` value to its ``convert`` option."""
+            if task in ("embedding", "embed"):
+                return "embed"
+            if task in ("classify", "reward"):
+                return task
+            if task != "score":
+                return "none"
+            # "score" follows the default pooling task of the architectures.
+            if self._get_default_pooling_task(architectures) == "classify":
+                return "classify"
+            return "embed"
+
+        if self.task is not None:
+            runner: RunnerOption = "auto"
+            convert: ConvertOption = "auto"
+            msg_prefix = (
+                "The 'task' option has been deprecated and will be removed in v0.13.0 or v1.0, whichever comes first."
+            )
+            msg_hint = "Please remove this option."
+
+            is_generative_task = self.task in _RUNNER_TASKS["generate"]
+            is_pooling_task = self.task in _RUNNER_TASKS["pooling"]
+
+            if is_generative_model and is_pooling_model:
+                if is_generative_task:
+                    runner = "generate"
+                    convert = "auto"
+                    msg_hint = (
+                        "Please replace this option with `--runner "
+                        "generate` to continue using this model "
+                        "as a generative model."
+                    )
+                elif is_pooling_task:
+                    runner = "pooling"
+                    convert = "auto"
+                    msg_hint = (
+                        "Please replace this option with `--runner "
+                        "pooling` to continue using this model "
+                        "as a pooling model."
+                    )
+                else:  # task == "auto"
+                    pass
+            elif is_generative_model or is_pooling_model:
+                if is_generative_task:
+                    runner = "generate"
+                    convert = "auto"
+                    msg_hint = "Please remove this option"
+                elif is_pooling_task:
+                    runner = "pooling"
+                    convert = _task_to_convert(self.task)
+                    msg_hint = (
+                        "Please replace this option with `--convert "
+                        f"{convert}` to continue using this model "
+                        "as a pooling model."
+                    )
+                else:  # task == "auto"
+                    pass
+            else:
+                # Neither generative nor pooling model - try to convert if possible
+                if is_pooling_task:
+                    runner = "pooling"
+                    convert = _task_to_convert(self.task)
+                    msg_hint = (
+                        "Please replace this option with `--runner pooling "
+                        f"--convert {convert}` to continue using this model "
+                        "as a pooling model."
+                    )
+                else:
+                    debug_info = {
+                        "architectures": architectures,
+                        "is_generative_model": is_generative_model,
+                        "is_pooling_model": is_pooling_model,
+                    }
+                    raise AssertionError(
+                        "The model should be a generative or "
+                        "pooling model when task is set to "
+                        f"{self.task!r}. Found: {debug_info}"
+                    )
+
+            self.runner = runner
+            self.convert = convert
+
+            msg = f"{msg_prefix} {msg_hint}"
+            warnings.warn(msg, DeprecationWarning, stacklevel=2)
+
+        self.runner_type = self._get_runner_type(architectures, self.runner)
+        self.convert_type = self._get_convert_type(architectures, self.runner_type, self.convert)
+
+        if self.runner_type == "generate" and not is_generative_model:
+            generate_converts = _RUNNER_CONVERTS["generate"]
+            if self.convert_type not in generate_converts:
+                # Currently we don't have any converters for generative models
+                raise ValueError("This model does not support `--runner generate`.")
+        if self.runner_type == "pooling" and not is_pooling_model:
+            pooling_converts = _RUNNER_CONVERTS["pooling"]
+            if self.convert_type not in pooling_converts:
+                convert_option = "<" + "|".join(pooling_converts) + ">"
+                raise ValueError(
+                    "This model does not support `--runner pooling`. "
+                    f"You can pass `--convert {convert_option} to adapt "
+                    "it into a pooling model."
+                )
+
+        # Note: Initialize these attributes early because transformers fallback
+        # may fail to load dynamic modules in child processes
+        model_info, arch = registry.inspect_model_cls(architectures, self)
+        self._model_info = model_info
+        self._architecture = arch
+        logger.info("Resolved architecture: %s", arch)
+
+        # Init pooler config if needed
+        if self.runner_type == "pooling":
+            if self.pooler_config is None:
+                self.pooler_config = PoolerConfig()
+
+            base_config = get_pooling_config(self.model, self.revision)
+            if base_config is not None:
+                # Only set values that are not overridden by the user
+                for k, v in base_config.items():
+                    if getattr(self.pooler_config, k) is None:
+                        setattr(self.pooler_config, k, v)
+
+            default_pooling_type = self._model_info.default_pooling_type
+            if self.pooler_config.pooling_type is None:
+                self.pooler_config.pooling_type = default_pooling_type
+
+        self.dtype: torch.dtype = _get_and_verify_dtype(
+            self.model,
+            self.hf_config,
+            self.dtype,
+            is_pooling_model=self.runner_type == "pooling",
+            revision=self.revision,
+        )
+
+        self.original_max_model_len = self.max_model_len
+        self.max_model_len = self.get_and_verify_max_len(self.max_model_len)
+        # Init multimodal config if needed
+        if self._model_info.supports_multimodal:
+            if mm_encoder_tp_mode == "data" and not self._model_info.supports_multimodal_encoder_tp_data:
+                logger.warning_once(
+                    "This model does not support `--mm-encoder-tp-mode data`. "
+                    "Falling back to `--mm-encoder-tp-mode weights`."
+                )
+                mm_encoder_tp_mode = "weights"
+
+            mm_config_kwargs = dict(
+                limit_per_prompt=limit_mm_per_prompt,
+                enable_mm_embeds=enable_mm_embeds,
+                media_io_kwargs=media_io_kwargs,
+                mm_processor_kwargs=mm_processor_kwargs,
+                mm_processor_cache_gb=mm_processor_cache_gb,
+                mm_processor_cache_type=mm_processor_cache_type,
+                mm_shm_cache_max_object_size_mb=mm_shm_cache_max_object_size_mb,
+                mm_encoder_tp_mode=mm_encoder_tp_mode,
+                mm_encoder_attn_backend=mm_encoder_attn_backend,
+                interleave_mm_strings=interleave_mm_strings,
+                skip_mm_profiling=skip_mm_profiling,
+                video_pruning_rate=video_pruning_rate,
+            )
+
+            mm_config_kwargs = {k: v for k, v in mm_config_kwargs.items() if v is not None}
+
+            self.multimodal_config = MultiModalConfig(**mm_config_kwargs)
+
+        # Multimodal GGUF models must use original repo for mm processing
+        if is_gguf(self.tokenizer) and self.is_multimodal_model:
+            raise ValueError(
+                "Loading a multimodal GGUF model needs to use original "
+                "tokenizer. Please specify the unquantized hf model's "
+                "repo name or path using the --tokenizer argument."
+            )
+
+        if self.disable_sliding_window:
+            # Set after get_and_verify_max_len to ensure that max_model_len
+            # can be correctly capped to sliding window size
+            self.hf_text_config.sliding_window = None
+
+        # Avoid running try_verify_and_update_config multiple times
+        self.config_updated = False
+
+        self._verify_quantization()
+        self._verify_cuda_graph()
+        self._verify_bnb_config()
diff --git a/vllm_omni/diffusion/diffusion_engine.py b/vllm_omni/diffusion/diffusion_engine.py
index fe940d623e5..c28fe0943ce 100644
--- a/vllm_omni/diffusion/diffusion_engine.py
+++ b/vllm_omni/diffusion/diffusion_engine.py
@@ -7,31 +7,25 @@
 import queue
 import threading
 import time
-from collections.abc import Iterable
+import weakref
+from collections.abc import Callable, Iterable
+from dataclasses import dataclass
 from typing import Any
 
-import numpy as np
 import PIL.Image
-import torch
 from vllm.logger import init_logger
 
-from vllm_omni.diffusion.data import (
-    DiffusionOutput,
-    DiffusionRequestAbortedError,
-    OmniDiffusionConfig,
-)
-from vllm_omni.diffusion.executor.abstract import DiffusionExecutor
+from vllm_omni.diffusion.data import SHUTDOWN_MESSAGE, OmniDiffusionConfig
 from vllm_omni.diffusion.registry import (
     DiffusionModelRegistry,
     get_diffusion_post_process_func,
     get_diffusion_pre_process_func,
 )
 from vllm_omni.diffusion.request import OmniDiffusionRequest
-from vllm_omni.diffusion.sched import RequestScheduler, SchedulerInterface, StepScheduler
-from vllm_omni.diffusion.sched.interface import DiffusionRequestStatus
-from vllm_omni.diffusion.worker.utils import RunnerOutput
+from vllm_omni.diffusion.scheduler import Scheduler, scheduler
 from vllm_omni.inputs.data import OmniDiffusionSamplingParams, OmniTextPrompt
 from vllm_omni.outputs import OmniRequestOutput
+from vllm_omni.utils.platform_utils import get_diffusion_worker_class
 
 logger = init_logger(__name__)
 
@@ -43,33 +37,43 @@ def supports_image_input(model_class_name: str) -> bool:
     return bool(getattr(model_cls, "support_image_input", False))
 
 
-def supports_audio_input(model_class_name: str) -> bool:
-    model_cls = DiffusionModelRegistry._try_load_model_cls(model_class_name)
-    if model_cls is None:
-        return False
-    return bool(getattr(model_cls, "support_audio_input", False))
-
-
-def image_color_format(model_class_name: str) -> str:
-    model_cls = DiffusionModelRegistry._try_load_model_cls(model_class_name)
-    return getattr(model_cls, "color_format", "RGB")
-
-
-def supports_audio_output(model_class_name: str) -> bool:
-    model_cls = DiffusionModelRegistry._try_load_model_cls(model_class_name)
-    if model_cls is None:
-        return False
-    return bool(getattr(model_cls, "support_audio_output", False))
+@dataclass
+class BackgroundResources:
+    """Finalizer callable that shuts down the engine's background resources.
+
+    Holds the scheduler and the worker processes, which live outside Python's
+    garbage collector and must be released explicitly. Calling the instance
+    signals shutdown through the scheduler, then joins each live worker and
+    terminates any that has not exited within the timeout.
+    """
+
+    scheduler: Scheduler | None = None
+    processes: list[mp.Process] | None = None
+
+    def __call__(self):
+        """Signal shutdown via the scheduler, then reap the worker processes."""
+        # Prefer the instance's scheduler; fall back to the module-level one.
+        sched = self.scheduler if self.scheduler is not None else scheduler
+        if sched is not None:
+            try:
+                for _ in range(sched.num_workers):
+                    sched.mq.enqueue(SHUTDOWN_MESSAGE)
+                sched.close()
+            except Exception as exc:
+                logger.warning("Failed to send shutdown signal: %s", exc)
+        for proc in self.processes or ():  # `processes` defaults to None
+            if proc.is_alive():
+                proc.join(30)
+            if proc.is_alive():
+                logger.warning("Terminating diffusion worker %s after timeout", proc.name)
+                proc.terminate()
+                proc.join(30)
 
 
 class DiffusionEngine:
     """The diffusion engine for vLLM-Omni diffusion models."""
 
-    def __init__(
-        self,
-        od_config: OmniDiffusionConfig,
-        scheduler: SchedulerInterface | None = None,
-    ):
+    def __init__(self, od_config: OmniDiffusionConfig):
         """Initialize the diffusion engine.
 
         Args:
@@ -86,17 +90,9 @@ def __init__(
             and "sampling_params" in inspect.signature(self.post_process_func).parameters
         )
 
-        executor_class = DiffusionExecutor.get_class(od_config)
-        self.executor = executor_class(od_config)
-        self.step_execution = bool(getattr(od_config, "step_execution", False))
-        self.scheduler: SchedulerInterface = scheduler or (
-            StepScheduler() if self.step_execution else RequestScheduler()
-        )
-        self.scheduler.initialize(od_config)
-        self._rpc_lock = threading.RLock()
-        self.abort_queue: queue.Queue[str] = queue.Queue()
-        self.execute_fn = self.executor.execute_step if self.step_execution else self.executor.execute_request
-
+        self._processes: list[mp.Process] = []
+        self._closed = False
+        self._make_client()
         try:
             self._dummy_run()
         except Exception as e:
@@ -322,10 +318,7 @@ def step(self, request: OmniDiffusionRequest) -> list[OmniRequestOutput]:
             return results
 
     @staticmethod
-    def make_engine(
-        config: OmniDiffusionConfig,
-        scheduler: SchedulerInterface | None = None,
-    ) -> DiffusionEngine:
+    def make_engine(config: OmniDiffusionConfig) -> "DiffusionEngine":
         """Factory method to create a DiffusionEngine instance.
 
         Args:
@@ -334,48 +327,14 @@ def make_engine(
         Returns:
             An instance of DiffusionEngine.
         """
-        return DiffusionEngine(config, scheduler=scheduler)
-
-    def add_req_and_wait_for_response(self, request: OmniDiffusionRequest) -> DiffusionOutput:
-        with self._rpc_lock:
-            target_sched_req_id = self.scheduler.add_request(request)
-
-            # keep scheduling and executing until the target request is finished
-            while True:
-                self._process_aborts_queue()
-                sched_output = self.scheduler.schedule()
-                if sched_output.is_empty:
-                    if target_sched_req_id in sched_output.finished_req_ids:
-                        return self._finalize_finished_request(target_sched_req_id)
-                    if not self.scheduler.has_requests():
-                        raise RuntimeError("Diffusion scheduler has no runnable requests.")
-                    continue
-
-                # NOTE: add_req_and_wait_for_response() is synchronous, and
-                # the scheduler currently enforces _max_batch_size = 1 (see
-                # vllm_omni/diffusion/sched/base_scheduler.py), so we directly
-                # take the single scheduled request here.
-                sched_req_id = sched_output.scheduled_req_ids[0]
-                try:
-                    runner_output = self.execute_fn(sched_output)
-                except Exception as exc:
-                    logger.error("Execution failed for diffusion request %s", sched_req_id, exc_info=True)
-                    runner_output = RunnerOutput(
-                        req_id=sched_req_id,
-                        step_index=None,
-                        finished=True,
-                        result=DiffusionOutput(error=str(exc)),
-                    )
+        return DiffusionEngine(config)
 
-                self._process_aborts_queue()
+    def _make_client(self):
+        # TODO rename it
+        scheduler.initialize(self.od_config)
 
-                finished_req_ids = self.scheduler.update_from_output(sched_output, runner_output)
-                if target_sched_req_id in finished_req_ids:
-                    return self._finalize_finished_request(
-                        target_sched_req_id,
-                        runner_output=runner_output,
-                        missing_result_error="Diffusion execution finished without a final output.",
-                    )
+        # Get the broadcast handle from the initialized scheduler
+        broadcast_handle = scheduler.get_broadcast_handle()
 
     def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> None:
         """Start or stop profiling on all diffusion workers.
@@ -384,41 +343,145 @@ def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> N
             is_start: True to start profiling, False to stop.
             profile_prefix: Optional prefix for trace filename.
         """
-        if is_start:
-            if profile_prefix is None:
-                profile_prefix = f"diffusion_{int(time.time())}"
-            logger.info(f"Starting diffusion profiling with prefix: {profile_prefix}")
-        else:
-            logger.info("Stopping diffusion profiling...")
+        Start torch profiling on all diffusion workers.
+
+        Creates a directory (if needed) and sets up a base filename template
+        for per-rank profiler traces (typically saved as <template>_rank<N>.json).
+
+        Args:
+            trace_filename: Optional base filename (without extension or rank suffix).
+                            If None, generates one using current timestamp.
+        """
+        if trace_filename is None:
+            trace_filename = f"stage_0_diffusion_{int(time.time())}_rank"
+
+        trace_dir = os.environ.get("VLLM_TORCH_PROFILER_DIR", "./profiles")
+
+        # Expand ~ and ~user, then make absolute (robust against cwd changes)
+        trace_dir = os.path.expanduser(trace_dir)
+        trace_dir = os.path.abspath(trace_dir)
 
         try:
-            self.collective_rpc(method="profile", args=(is_start, profile_prefix))
+            os.makedirs(trace_dir, exist_ok=True)
+        except OSError as exc:
+            logger.error(f"Failed to create profiler directory {trace_dir}: {exc}")
+            raise
+
+        # Build final template path (without rank or extension — torch.profiler appends those)
+        full_template = os.path.join(trace_dir, trace_filename)
+
+        expected_pattern = f"{full_template}*.json"
+        logger.info(f"Starting diffusion profiling → {expected_pattern}")
+
+        # Also log the absolute directory once (useful in multi-node or containers)
+        logger.debug(f"Profiler output directory: {trace_dir}")
+
+        # Propagate to all workers
+        try:
+            self.collective_rpc(method="start_profile", args=(full_template,))
         except Exception as e:
-            action = "start" if is_start else "stop"
-            logger.error(f"Failed to {action} profiling on workers", exc_info=True)
-            if is_start:
-                raise RuntimeError(f"Could not {action} profiler: {e}") from e
+            logger.error("Failed to start profiling on workers", exc_info=True)
+            raise RuntimeError(f"Could not start profiler: {e}") from e
+
+    def stop_profile(self) -> dict:
+        """
+        Stop profiling on all workers and collect the final trace/table paths.
+
+        The worker (torch_profiler.py) now handles trace export, compression to .gz,
+        and deletion of the original .json file. This method only collects and
+        reports the paths returned by the workers.
+
+        Returns:
+            dict with keys:
+            - "traces": list of final trace file paths (usually .json.gz)
+            - "tables": list of table strings (one per rank)
+        """
+        logger.info("Stopping diffusion profiling and collecting results...")
+
+        try:
+            # NOTE(review): timeout is in seconds, so 60000 is ~16.7 h — confirm the unit; export + compression + table can be slow
+            results = self.collective_rpc(method="stop_profile", timeout=60000)
+        except Exception:
+            logger.error("Failed to stop profiling on workers", exc_info=True)
+            return {"traces": [], "tables": []}
+
+        output_files = {"traces": [], "tables": []}
+        successful_traces = 0
+
+        if not results:
+            logger.warning("No profiling results returned from any rank")
+            return output_files
+
+        for rank, res in enumerate(results):
+            if not isinstance(res, dict):
+                logger.warning(f"Rank {rank}: invalid result format (got {type(res)})")
+                continue
+
+            # 1. Trace file — should be .json.gz if compression succeeded
+            trace_path = res.get("trace")
+            if trace_path:
+                # We trust the worker — it created/compressed the file
+                logger.info(f"[Rank {rank}] Final trace: {trace_path}")
+                output_files["traces"].append(trace_path)
+                successful_traces += 1
+
+                # Optional: warn if the path has an unexpected extension (neither .json.gz nor .json)
+                if not trace_path.endswith((".json.gz", ".json")):
+                    logger.warning(f"Rank {rank}: unusual trace path extension: {trace_path}")
+
+            # 2. Profiler table — plain-text string (skipped when absent or empty)
+            table = res.get("table")
+            if table:
+                output_files["tables"].append(table)
+
+        # Final summary logging
+        num_ranks = len(results)
+        if successful_traces > 0:
+            final_paths_str = ", ".join(output_files["traces"][:3])
+            if len(output_files["traces"]) > 3:
+                final_paths_str += f" ... (+{len(output_files['traces']) - 3} more)"
+
+            logger.info(
+                f"Profiling stopped. Collected {successful_traces} trace file(s) "
+                f"from {num_ranks} rank(s). "
+                f"Final trace paths: {final_paths_str}"
+            )
+        elif output_files["traces"]:
+            logger.info(
+                f"Profiling stopped but no traces were successfully collected. "
+                f"Reported paths: {', '.join(output_files['traces'][:3])}"
+                f"{' ...' if len(output_files['traces']) > 3 else ''}"
+            )
+        else:
+            logger.info("Profiling stopped — no trace files were collected from any rank.")
+
+        if output_files["tables"]:
+            logger.debug(f"Collected {len(output_files['tables'])} profiling table(s)")
+
+        return output_files
 
     def _dummy_run(self):
         """A dummy run to warm up the model."""
+        prompt = "dummy run"
+        # NOTE: num_inference_steps=1 causes timestep and temb to be None in the pipeline
         num_inference_steps = 1
-        height = 512
-        width = 512
+        height = 256
+        width = 256
         if supports_image_input(self.od_config.model_class_name):
             # Provide a dummy image input if the model supports it
-            color_format = image_color_format(self.od_config.model_class_name)
-            dummy_image = PIL.Image.new(color_format, (width, height))
-        else:
-            dummy_image = None
 
-        if supports_audio_input(self.od_config.model_class_name):
-            audio_sr = 16000
-            audio_duration_sec = 4
-            audio_array = np.random.randn(audio_sr * audio_duration_sec).astype(np.float32)
-            dummy_audio = audio_array[audio_sr * 1 : audio_sr * 3]
+            dummy_image = PIL.Image.new("RGB", (width, height), color=(0, 0, 0))
         else:
             dummy_audio = None
 
+        # Collect dummy extra tokens from the pipeline class if available.
+        # Some pipelines (e.g. HyperCLOVAXVisionPipeline) require tokens in
+        # req.extra that are normally populated by stage input processors.
+        model_cls = DiffusionModelRegistry._try_load_model_cls(self.od_config.model_class_name)
+        dummy_extra = {}
+        if model_cls is not None and hasattr(model_cls, "get_dummy_extra"):
+            dummy_extra = model_cls.get_dummy_extra()
+
         prompt: OmniTextPrompt = {
             "prompt": "dummy run",
             "multi_modal_data": {"image": dummy_image, "audio": dummy_audio},
@@ -426,6 +489,7 @@ def _dummy_run(self):
         req = OmniDiffusionRequest(
             prompts=[prompt],
             request_ids=["dummy_req_id"],
+            extra=dummy_extra,
             sampling_params=OmniDiffusionSamplingParams(
                 height=height,
                 width=width,
@@ -441,14 +505,12 @@ def _dummy_run(self):
             ),
         )
         logger.info("dummy run to warm up the model")
-        request = self.pre_process_func(req) if self.pre_process_func is not None else req
-        output = self.add_req_and_wait_for_response(request)
-        if output.error:
-            raise RuntimeError(f"Dummy run failed: {output.error}")
+        requests = self.pre_process_func([req]) if self.pre_process_func is not None else [req]
+        self.add_req_and_wait_for_response(requests)
 
     def collective_rpc(
         self,
-        method: str,
+        method: str | Callable,
         timeout: float | None = None,
         args: tuple = (),
         kwargs: dict | None = None,
@@ -457,7 +519,7 @@ def collective_rpc(
         """Call a method on worker processes and get results immediately.
 
         Args:
-            method: The method name (str) to execute on workers
+            method: The method name (str) to execute on workers (callables are accepted by the signature but rejected by an assert)
             timeout: Optional timeout in seconds
             args: Positional arguments for the method
             kwargs: Keyword arguments for the method
@@ -466,91 +528,61 @@ def collective_rpc(
         Returns:
             Single result if unique_reply_rank is provided, otherwise list of results
         """
-        assert isinstance(method, str), "Only string method names are supported for now"
+        if self._closed:
+            raise RuntimeError("DiffusionEngine is closed.")
 
         deadline = None if timeout is None else time.monotonic() + timeout
-        acquired = False
-        try:
-            if deadline is None:
-                self._rpc_lock.acquire()
-                acquired = True
-            else:
-                lock_timeout = max(0, deadline - time.monotonic())
-                acquired = self._rpc_lock.acquire(timeout=lock_timeout)
-            if not acquired:
-                raise TimeoutError(f"RPC call to {method} timed out waiting for engine lock.")
-
-            rpc_timeout = None if deadline is None else max(0, deadline - time.monotonic())
-            if deadline is not None and rpc_timeout <= 0:
-                raise TimeoutError(f"RPC call to {method} timed out.")
-
-            return self.executor.collective_rpc(
-                method=method,
-                timeout=rpc_timeout,
-                args=args,
-                kwargs=kwargs,
-                unique_reply_rank=unique_reply_rank,
-            )
-        finally:
-            if acquired:
-                self._rpc_lock.release()
-
-    def close(self) -> None:
-        if hasattr(self, "scheduler"):
-            self.scheduler.close()
-        if hasattr(self, "executor"):
-            self.executor.shutdown()
+        kwargs = kwargs or {}
+
+        assert isinstance(method, str)
+        send_method = method
+
+        # Prepare RPC request message
+        rpc_request = {
+            "type": "rpc",
+            "method": send_method,
+            "args": args,
+            "kwargs": kwargs,
+            "output_rank": unique_reply_rank,
+        }
 
-    def abort(self, request_id: str | Iterable[str]) -> None:
-        request_ids = [request_id] if isinstance(request_id, str) else list(request_id)
-        for req_id in request_ids:
-            self.abort_queue.put(req_id)
+        try:
+            # Broadcast RPC request to all workers via unified message queue
+            scheduler.mq.enqueue(rpc_request)
 
-    def _process_aborts_queue(self) -> None:
-        if self.abort_queue.empty():
-            return
+            # Determine which workers we expect responses from
+            num_responses = 1 if unique_reply_rank is not None else self.od_config.num_gpus
 
-        request_ids: list[str] = []
-        while not self.abort_queue.empty():
-            ids = self.abort_queue.get_nowait()
-            request_ids.extend((ids,) if isinstance(ids, str) else ids)
+            responses = []
+            for _ in range(num_responses):
+                dequeue_timeout = None if deadline is None else (deadline - time.monotonic())
+                try:
+                    if scheduler.result_mq is None:
+                        raise RuntimeError("Result queue not initialized")
 
-        self._abort_requests(request_ids)
+                    response = scheduler.result_mq.dequeue(timeout=dequeue_timeout)
 
-    def _abort_requests(self, request_ids: str | Iterable[str]) -> None:
-        request_ids = [request_ids] if isinstance(request_ids, str) else list(request_ids)
+                    # Check if response indicates an error
+                    if isinstance(response, dict) and response.get("status") == "error":
+                        raise RuntimeError(
+                            f"Worker failed with error '{response.get('error')}', "
+                            "please check the stack trace above for the root cause"
+                        )
 
-        sched_req_ids: list[str] = []
-        for request_id in dict.fromkeys(request_ids):
-            sched_req_id = self.scheduler.get_sched_req_id(request_id)
-            if sched_req_id is not None:
-                sched_req_ids.append(sched_req_id)
+                    responses.append(response)
+                except TimeoutError as e:
+                    raise TimeoutError(f"RPC call to {method} timed out.") from e
 
-        for sched_req_id in dict.fromkeys(sched_req_ids):
-            if self.scheduler.get_request_state(sched_req_id) is not None:
-                self.scheduler.finish_requests(sched_req_id, DiffusionRequestStatus.FINISHED_ABORTED)
+            return responses[0] if unique_reply_rank is not None else responses
 
-    def _finalize_finished_request(
-        self,
-        sched_req_id: str,
-        runner_output: RunnerOutput | None = None,
-        missing_result_error: str = "Diffusion scheduler finished target request without execution output.",
-    ) -> DiffusionOutput:
-        state = self.scheduler.get_request_state(sched_req_id)
-        popped_state = self.scheduler.pop_request_state(sched_req_id)
-        state = state or popped_state
-
-        if state is None:
-            raise RuntimeError(f"Diffusion scheduler lost state for request {sched_req_id}.")
-
-        if state.status == DiffusionRequestStatus.FINISHED_ABORTED:
-            request_id = state.req.request_ids[0] if state.req.request_ids else sched_req_id
-            return DiffusionOutput(
-                aborted=True,
-                abort_message=f"Request {request_id} aborted.",
-            )
+        except Exception as e:
+            logger.error(f"RPC call failed: {e}")
+            raise
 
-        if runner_output is not None and runner_output.result is not None:
-            return runner_output.result
+    def close(self) -> None:
+        self._finalizer()
 
-        return DiffusionOutput(error=missing_result_error)
+    def abort(self, request_id: str | Iterable[str]) -> None:
+        # TODO implement it
+        logger.warning("DiffusionEngine abort is not implemented yet")
+        pass
diff --git a/vllm_omni/diffusion/ipc.py b/vllm_omni/diffusion/ipc.py
index 6a96533fd40..3282ccdf0e8 100644
--- a/vllm_omni/diffusion/ipc.py
+++ b/vllm_omni/diffusion/ipc.py
@@ -17,7 +17,7 @@
 
 from vllm_omni.diffusion.data import DiffusionOutput
 
-_SHM_TENSOR_THRESHOLD = 1_000_000  # 1 MB
+_SHM_TENSOR_THRESHOLD = 0  # Always use SHM for CUDA tensor safety
 
 
 def _tensor_to_shm(tensor: torch.Tensor) -> dict[str, Any]:
@@ -25,19 +25,24 @@ def _tensor_to_shm(tensor: torch.Tensor) -> dict[str, Any]:
 
     The shared memory segment remains alive after this call (the local fd is
     closed, but the segment persists until ``_tensor_from_shm`` unlinks it).
+
+    BFloat16 and other numpy-incompatible dtypes are stored as raw uint8 bytes
+    and reconstructed using the stored ``torch_dtype``.
     """
     from multiprocessing import shared_memory
 
     import numpy as np
 
+    orig_dtype = tensor.dtype
     tensor = tensor.detach().cpu().contiguous()
-    original_dtype = tensor.dtype
-    # NumPy does not support bfloat16; promote to float32 for the SHM
-    # transfer and record the original dtype so _tensor_from_shm can
-    # convert back.  The round-trip is lossless for bfloat16 values.
-    if original_dtype == torch.bfloat16:
-        tensor = tensor.to(torch.float32)
-    arr = tensor.numpy()
+    # BFloat16 (and some other dtypes) are not natively supported by numpy.
+    # Use a raw uint8 byte view so data can be round-tripped without precision loss.
+    try:
+        arr = tensor.numpy()
+        use_raw_bytes = False
+    except TypeError:
+        arr = tensor.view(torch.uint8).numpy()
+        use_raw_bytes = True
     nbytes = arr.nbytes
     shm = shared_memory.SharedMemory(create=True, size=nbytes)
     shm_arr = np.ndarray(arr.shape, dtype=arr.dtype, buffer=shm.buf[:nbytes])
@@ -46,9 +51,10 @@ def _tensor_to_shm(tensor: torch.Tensor) -> dict[str, Any]:
         "__tensor_shm__": True,
         "name": shm.name,
         "shape": list(tensor.shape),
-        "torch_dtype": str(original_dtype),
+        "torch_dtype": str(orig_dtype),
         "numpy_dtype": str(arr.dtype),
         "nbytes": nbytes,
+        "raw_bytes": use_raw_bytes,
     }
     shm.close()
     return handle
@@ -63,19 +69,22 @@ def _tensor_from_shm(handle: dict[str, Any]) -> torch.Tensor:
     shm = shared_memory.SharedMemory(name=handle["name"])
     try:
         np_dtype = np.dtype(handle["numpy_dtype"])
-        arr = np.ndarray(handle["shape"], dtype=np_dtype, buffer=shm.buf[: handle["nbytes"]])
-        tensor = torch.from_numpy(arr.copy())
-        # Restore the original dtype if it differs from the numpy-compatible
-        # dtype used for the SHM transfer (e.g. bfloat16 → float32 → bfloat16).
-        torch_dtype_str = handle.get("torch_dtype", "")
-        if torch_dtype_str:
-            original_dtype = getattr(torch, torch_dtype_str.replace("torch.", ""), None)
-            if original_dtype is not None and tensor.dtype != original_dtype:
-                tensor = tensor.to(original_dtype)
+        if handle.get("raw_bytes"):
+            # Data was stored as raw uint8 bytes (e.g. BFloat16 round-trip).
+            byte_arr = np.ndarray(handle["nbytes"], dtype=np.uint8, buffer=shm.buf[: handle["nbytes"]])
+            raw = torch.from_numpy(byte_arr.copy())
+        else:
+            arr = np.ndarray(handle["shape"], dtype=np_dtype, buffer=shm.buf[: handle["nbytes"]])
+            raw = torch.from_numpy(arr.copy())
     finally:
         shm.close()
         shm.unlink()
-    return tensor
+    # Restore the original torch dtype (handles BF16 raw-byte round-trip).
+    torch_dtype_str = handle["torch_dtype"].replace("torch.", "")
+    torch_dtype = getattr(torch, torch_dtype_str)
+    if raw.dtype != torch_dtype or handle.get("raw_bytes"):
+        raw = raw.view(torch_dtype).reshape(handle["shape"])
+    return raw
 
 
 def _pack_tensor_if_large(val: torch.Tensor) -> torch.Tensor | dict:
diff --git a/vllm_omni/diffusion/models/hyperclovax_vision/__init__.py b/vllm_omni/diffusion/models/hyperclovax_vision/__init__.py
new file mode 100644
index 00000000000..fbf54a827d8
--- /dev/null
+++ b/vllm_omni/diffusion/models/hyperclovax_vision/__init__.py
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""HyperCLOVAX Vision Decoder diffusion model components."""
+
+from vllm_omni.diffusion.models.hyperclovax_vision.hyperclovax_vision_transformer import (
+    HyperCLOVAXVisionTransformer2DModel,
+)
+from vllm_omni.diffusion.models.hyperclovax_vision.pipeline_hyperclovax_vision import (
+    HyperCLOVAXVisionPipeline,
+    get_hyperclovax_vision_post_process_func,
+)
+from vllm_omni.diffusion.models.hyperclovax_vision.vision_token_embedder import (
+    VisionTokenEmbedder,
+)
+
+__all__ = [
+    "HyperCLOVAXVisionPipeline",
+    "HyperCLOVAXVisionTransformer2DModel",
+    "VisionTokenEmbedder",
+    "get_hyperclovax_vision_post_process_func",
+]
diff --git a/vllm_omni/diffusion/models/hyperclovax_vision/hyperclovax_vision_transformer.py b/vllm_omni/diffusion/models/hyperclovax_vision/hyperclovax_vision_transformer.py
new file mode 100644
index 00000000000..eea0bc1c5c5
--- /dev/null
+++ b/vllm_omni/diffusion/models/hyperclovax_vision/hyperclovax_vision_transformer.py
@@ -0,0 +1,146 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adapted from NAVER Cloud Corp. vision-decoder-api
+
+"""
+HyperCLOVAX Vision Transformer for vision token to image generation.
+
+This module implements the VisionTransformer diffusion model that converts
+vision token embeddings to latent representations for image generation.
+"""
+
+import torch
+import torch.nn as nn
+
+from vllm_omni.diffusion.data import OmniDiffusionConfig
+
+from .layers import (
+    EmbedAND,
+    LastLayer,
+    MLPEmbedder,
+    SingleStreamBlock,
+    timestep_embedding,
+)
+
+
+class HyperCLOVAXVisionTransformer2DModel(nn.Module):
+    """
+    Vision Transformer for vision token to image generation.
+
+    This transformer processes vision token embeddings concatenated with
+    noisy latents to predict noise for the diffusion process.
+
+    Architecture:
+        - Input projection: (in_channels + context_in_dim) -> hidden_size
+        - Time embedding: 256 -> hidden_size
+        - Vector embedding: vec_in_dim -> hidden_size
+        - Position embedding: EmbedAND with 3D axes
+        - Single stream blocks: depth_single_blocks (default 35) parallel attention+MLP blocks
+        - Output layer: hidden_size -> out_channels
+
+    Args:
+        od_config: OmniDiffusionConfig containing model configuration
+        in_channels: Number of latent channels (default: 16)
+        vec_in_dim: Vision pooler output dimension (default: 1536)
+        context_in_dim: Vision hidden state dimension (default: 1536)
+        hidden_size: Transformer hidden dimension (default: 1920)
+        mlp_ratio: MLP expansion ratio (default: 4.0)
+        num_heads: Number of attention heads (default: 24)
+        depth_single_blocks: Number of single stream blocks (default: 35)
+        axes_dim: Position embedding axes dimensions; must sum to hidden_size // num_heads (default: (8, 36, 36))
+        theta: RoPE theta parameter (default: 10000)
+        use_patchify: Whether to use 2x2 patchification (default: False)
+    """
+
+    def __init__(
+        self,
+        od_config: OmniDiffusionConfig,
+        in_channels: int = 16,
+        vec_in_dim: int = 1536,
+        context_in_dim: int = 1536,
+        hidden_size: int = 1920,
+        mlp_ratio: float = 4.0,
+        num_heads: int = 24,
+        depth_single_blocks: int = 35,
+        axes_dim: tuple[int, int, int] = (8, 36, 36),
+        theta: int = 10_000,
+        use_patchify: bool = False,
+    ):
+        super().__init__()
+
+        self.od_config = od_config
+        self.in_channels = in_channels
+        self.context_in_dim = context_in_dim
+        self.out_channels = in_channels
+        self.hidden_size = hidden_size
+        self.num_heads = num_heads
+        self.use_patchify = use_patchify
+        self.depth_single_blocks = depth_single_blocks
+
+        if hidden_size % num_heads != 0:
+            raise ValueError(f"Hidden size {hidden_size} must be divisible by num_heads {num_heads}")
+
+        pe_dim = hidden_size // num_heads
+        axes_dim_list = list(axes_dim)
+        if sum(axes_dim_list) != pe_dim:
+            raise ValueError(f"Got {axes_dim_list} but expected positional dim {pe_dim}")
+
+        # Position embedding
+        self.pe_embedder = EmbedAND(dim=pe_dim, theta=theta, axes_dim=axes_dim_list)
+
+        # Input projections
+        self.img_in = nn.Linear(in_channels + context_in_dim, hidden_size, bias=True)
+        self.time_in = MLPEmbedder(in_dim=256, hidden_dim=hidden_size)
+        self.vector_in = MLPEmbedder(vec_in_dim, hidden_size)
+
+        # Single stream blocks
+        self.single_blocks = nn.ModuleList(
+            [SingleStreamBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio) for _ in range(depth_single_blocks)]
+        )
+
+        # Output layer
+        self.final_layer = LastLayer(hidden_size, 1, self.out_channels)
+
+    def forward(
+        self,
+        img: torch.Tensor,
+        img_ids: torch.Tensor,
+        timesteps: torch.Tensor,
+        y: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Forward pass of the transformer.
+
+        Args:
+            img: Input tensor (B, L, in_channels + context_in_dim)
+                 Concatenation of noisy latents and vision spatial features
+            img_ids: Position IDs tensor (B, L, 3)
+            timesteps: Sigma/timestep tensor (B,) in [0, 1]
+            y: Vision pooler output tensor (B, vec_in_dim)
+
+        Returns:
+            Output tensor (B, L, out_channels) - predicted noise
+        """
+        if img.ndim != 3:
+            raise ValueError("Input img tensor must have 3 dimensions.")
+
+        # Project input
+        img = self.img_in(img)
+
+        # Time and vector embedding
+        vec = self.time_in(
+            timestep_embedding(timesteps, 256).to(dtype=self.time_in.in_layer.weight.dtype, device=img.device)
+        )
+        vec = vec + self.vector_in(y)
+
+        # Position embedding
+        pe = self.pe_embedder(img_ids)
+
+        # Single stream blocks
+        for block in self.single_blocks:
+            img = block(img, vec=vec, pe=pe)
+
+        # Final projection
+        img = self.final_layer(img, vec)
+
+        return img
diff --git a/vllm_omni/diffusion/models/hyperclovax_vision/layers.py b/vllm_omni/diffusion/models/hyperclovax_vision/layers.py
new file mode 100644
index 00000000000..e5018af8954
--- /dev/null
+++ b/vllm_omni/diffusion/models/hyperclovax_vision/layers.py
@@ -0,0 +1,234 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adapted from NAVER Cloud Corp. vision-decoder-api
+
+"""
+Common layers for HyperCLOVAX Vision Decoder.
+
+This module contains utility layers used in the VisionTransformer:
+- RoPE (Rotary Position Embedding)
+- EmbedAND (N-dimensional position embedding)
+- MLPEmbedder (MLP for timestep and vector embeddings)
+- RMSNorm (Root Mean Square Layer Normalization)
+- QKNorm (Query-Key normalization)
+- Modulation (Adaptive layer normalization modulation)
+- SingleStreamBlock (Parallel attention and MLP block)
+- LastLayer (Final projection layer)
+"""
+
+import math
+from dataclasses import dataclass
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+
+# Flash Attention probe, evaluated once at import: needs CUDA with compute capability major >= 8
+try:
+    from torch.nn.attention import SDPBackend, sdpa_kernel
+
+    def _flash_attn_available() -> bool:
+        if not torch.cuda.is_available():
+            return False
+        major, _ = torch.cuda.get_device_capability()  # no device argument: only one device is checked
+        return major >= 8  # Flash Attention requires Ampere or newer
+
+    FLASH_ATTN_AVAILABLE = _flash_attn_available()  # NOTE(review): queries CUDA at import time - confirm OK for forked workers
+except ImportError:
+    FLASH_ATTN_AVAILABLE = False  # torch.nn.attention could not be imported
+    sdpa_kernel = None  # placeholders so both names always exist at module level
+    SDPBackend = None
+
+
+def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor:
+    """Return float32 RoPE rotation matrices (b, n, dim // 2, 2, 2) for positions ``pos`` (b, n)."""
+    if dim % 2 != 0:  # explicit raise: a bare assert would be stripped under ``python -O``
+        raise ValueError(f"rope dim must be even, got {dim}")
+    scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim
+    omega = 1.0 / (theta**scale)  # one inverse frequency per channel pair, kept in float64
+    out = torch.einsum("...n,d->...nd", pos, omega)
+    out = torch.stack([torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1)
+    return rearrange(out, "b n d (i j) -> b n d i j", i=2, j=2).float()
+
+
+def apply_rope(xq: torch.Tensor, xk: torch.Tensor, freqs_cis: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+    """Rotate query and key by ``freqs_cis``; math runs in float32, results keep the input dtypes."""
+    def _rotate(x: torch.Tensor) -> torch.Tensor:
+        pairs = x.float().reshape(*x.shape[:-1], -1, 1, 2)  # last dim D -> (D // 2, 1, 2)
+        mixed = freqs_cis[..., 0] * pairs[..., 0] + freqs_cis[..., 1] * pairs[..., 1]
+        return mixed.reshape(*x.shape).type_as(x)
+    return _rotate(xq), _rotate(xk)
+
+
+def attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, pe: torch.Tensor) -> torch.Tensor:
+    """Apply RoPE to q/k, run SDPA, and merge heads: (B, H, L, D) -> (B, L, H * D)."""
+    q, k = apply_rope(q, k, pe)
+
+    # Pin the flash kernel only when it can run: it needs half-precision CUDA inputs.
+    # Pinned with float32 inputs, SDPA fails with "no available kernel"; otherwise PyTorch picks.
+    flash_ok = FLASH_ATTN_AVAILABLE and q.is_cuda and q.dtype in (torch.float16, torch.bfloat16)
+    if flash_ok:
+        with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
+            x = F.scaled_dot_product_attention(q, k, v)
+    else:
+        x = F.scaled_dot_product_attention(q, k, v)
+    return rearrange(x, "B H L D -> B L (H D)")
+
+
+@torch.no_grad()
+def timestep_embedding(
+    t: torch.Tensor,
+    dim: int,
+    max_period: float = 10000,
+    time_factor: float = 1000.0,
+) -> torch.Tensor:
+    """Return (N, dim) sinusoidal features [cos | sin] of ``time_factor * t``; odd ``dim`` gets a zero column."""
+    scaled = time_factor * t
+    n_freqs = dim // 2
+    steps = torch.arange(start=0, end=n_freqs, dtype=torch.float32)
+    freqs = torch.exp(-math.log(max_period) * steps / n_freqs).to(scaled.device)
+    phase = scaled[:, None].float() * freqs[None]
+    parts = [torch.cos(phase), torch.sin(phase)]
+    if dim % 2:
+        parts.append(torch.zeros_like(parts[0][:, :1]))  # pad odd dims with one zero column
+    out = torch.cat(parts, dim=-1)
+    return out.to(scaled) if torch.is_floating_point(scaled) else out
+
+
+class EmbedAND(nn.Module):
+    """Concatenates per-axis rotary embeddings for N-dimensional position ids."""
+
+    def __init__(self, dim: int, theta: int, axes_dim: list[int]):
+        super().__init__()
+        # Plain attributes only; no parameters are created here.
+        self.dim, self.theta = dim, theta
+        self.axes_dim = axes_dim
+
+    def forward(self, ids: torch.Tensor) -> torch.Tensor:
+        # One rope table per trailing axis of ``ids``; axis i uses axes_dim[i] channels.
+        tables = []
+        for axis in range(ids.shape[-1]):
+            tables.append(rope(ids[..., axis], self.axes_dim[axis], self.theta))
+        stacked = torch.cat(tables, dim=-3)
+        return stacked.unsqueeze(1)  # singleton dim at index 1
+
+
+class MLPEmbedder(nn.Module):
+    """Two-layer MLP (Linear -> SiLU -> Linear) for timestep and vector embeddings."""
+
+    def __init__(self, in_dim: int, hidden_dim: int):
+        super().__init__()
+        self.in_layer = nn.Linear(in_features=in_dim, out_features=hidden_dim, bias=True)
+        self.silu = nn.SiLU()
+        self.out_layer = nn.Linear(in_features=hidden_dim, out_features=hidden_dim, bias=True)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.out_layer(self.silu(self.in_layer(x)))  # (..., in_dim) -> (..., hidden_dim)
+
+
+class RMSNorm(nn.Module):
+    """RMS layer norm over the last dim (eps=1e-6) with a learned per-channel ``scale``."""
+
+    def __init__(self, dim: int):
+        super().__init__()
+        self.scale = nn.Parameter(torch.ones(dim))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # Normalise in float32, then return to the caller's dtype before applying ``scale``.
+        x32 = x.float()
+        inv_rms = torch.rsqrt(x32.pow(2).mean(dim=-1, keepdim=True) + 1e-6)
+        return (x32 * inv_rms).to(dtype=x.dtype) * self.scale
+
+
+class QKNorm(nn.Module):
+    """Applies independent RMSNorm layers to queries and keys."""
+
+    def __init__(self, dim: int):
+        super().__init__()
+        self.query_norm = RMSNorm(dim)
+        self.key_norm = RMSNorm(dim)
+
+    def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        # ``v`` is only the dtype/device reference for the outputs; it is not normalised.
+        normed_q, normed_k = self.query_norm(q), self.key_norm(k)
+        return normed_q.to(v), normed_k.to(v)
+
+
+@dataclass
+class ModulationOut:
+    shift: torch.Tensor  # added after the scaled pre-norm in SingleStreamBlock.forward
+    scale: torch.Tensor  # applied as (1 + scale) to the pre-normed input
+    gate: torch.Tensor  # multiplies the block output before the residual add
+
+
+class Modulation(nn.Module):
+    """Projects the conditioning vector into (shift, scale, gate) triples for adaptive norm."""
+
+    def __init__(self, dim: int, double: bool):
+        super().__init__()
+        self.is_double = double
+        self.multiplier = 6 if double else 3  # one triple, or two when ``double``
+        self.lin = nn.Linear(dim, self.multiplier * dim, bias=True)
+
+    def forward(self, vec: torch.Tensor) -> tuple[ModulationOut, ModulationOut | None]:
+        # SiLU + linear, add a singleton dim at index 1, then split into ``multiplier`` chunks.
+        projected = self.lin(F.silu(vec))[:, None, :]
+        chunks = projected.chunk(self.multiplier, dim=-1)
+        first = ModulationOut(*chunks[:3])
+        return first, (ModulationOut(*chunks[3:]) if self.is_double else None)
+
+
+class SingleStreamBlock(nn.Module):
+    """Transformer block that runs attention and the MLP in parallel off one fused projection."""
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        qk_scale: float | None = None,
+    ):
+        super().__init__()
+        head_dim = hidden_size // num_heads
+        self.hidden_dim = hidden_size
+        self.hidden_size = hidden_size
+        self.num_heads = num_heads
+        self.scale = qk_scale or head_dim**-0.5  # stored only; not passed to attention()
+        self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
+
+        # linear1 emits q, k, v and the MLP hidden in one matmul; linear2 consumes [attn | act(mlp)]
+        self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim)
+        self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size)
+        self.norm = QKNorm(head_dim)
+        self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.mlp_act = nn.GELU(approximate="tanh")
+        self.modulation = Modulation(hidden_size, double=False)
+
+    def forward(self, x: torch.Tensor, vec: torch.Tensor, pe: torch.Tensor) -> torch.Tensor:
+        mod = self.modulation(vec)[0]  # double=False, so the second element is None
+        normed = self.pre_norm(x)
+        fused = self.linear1((1 + mod.scale) * normed + mod.shift)
+        qkv, mlp = fused.split([3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
+        q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
+        q, k = self.norm(q, k, v)
+        # attention() applies RoPE and returns (B, L, H * D), ready to concatenate with the MLP branch
+        attn = attention(q, k, v, pe=pe)
+        merged = torch.cat((attn, self.mlp_act(mlp)), 2)
+        return x + mod.gate * self.linear2(merged)
+
+
+class LastLayer(nn.Module):
+    """Output head: conditioning-modulated LayerNorm followed by a linear projection."""
+
+    def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
+        super().__init__()
+        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
+        self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True))
+
+    def forward(self, x: torch.Tensor, vec: torch.Tensor) -> torch.Tensor:
+        # vec (B, hidden) -> shift/scale (B, 1, hidden), broadcast over dim 1 of x
+        shift, scale = (part.unsqueeze(1) for part in self.adaLN_modulation(vec).chunk(2, dim=1))
+        modulated = (1 + scale) * self.norm_final(x) + shift
+        return self.linear(modulated)
diff --git a/vllm_omni/diffusion/models/hyperclovax_vision/pipeline_hyperclovax_vision.py b/vllm_omni/diffusion/models/hyperclovax_vision/pipeline_hyperclovax_vision.py
new file mode 100644
index 00000000000..85bc4334395
--- /dev/null
+++ b/vllm_omni/diffusion/models/hyperclovax_vision/pipeline_hyperclovax_vision.py
@@ -0,0 +1,433 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adapted from NAVER Cloud Corp. vision-decoder-api
+
+"""
+HyperCLOVAX Vision Pipeline for vLLM-Omni.
+
+This pipeline converts vision tokens to images using a VisionTransformer
+diffusion model. It supports:
+- Vision token embedding
+- Flow matching diffusion
+- Autoguidance (optional transformer2)
+- xDiT USP sequence parallelism
+"""
+
+import json
+import logging
+import os
+from collections.abc import Iterable
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from diffusers import AutoencoderKL
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
+from einops import rearrange, repeat
+from vllm.model_executor.models.utils import AutoWeightsLoader
+from vllm.transformers_utils.config import get_hf_file_to_dict
+
+from vllm_omni.diffusion.data import DiffusionOutput, OmniDiffusionConfig
+from vllm_omni.diffusion.distributed.utils import get_local_device
+from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader
+from vllm_omni.diffusion.request import OmniDiffusionRequest
+
+from .hyperclovax_vision_transformer import HyperCLOVAXVisionTransformer2DModel
+from .vision_token_embedder import VisionTokenEmbedder
+
+logger = logging.getLogger(__name__)
+
+
+def get_hyperclovax_vision_post_process_func(od_config: OmniDiffusionConfig):
+    """
+    Build the post-processing callable for the HyperCLOVAX Vision pipeline.
+
+    The model directory is resolved first (downloaded when ``od_config.model`` is not
+    an existing path) so ``vae/config.json`` can size the image processor.
+    Falls back to a factor of 8 when that file or its key is missing.
+
+    Returns a function that converts model output tensors to PIL images.
+    """
+    model_path = od_config.model
+    if not os.path.exists(model_path):
+        from vllm_omni.model_executor.model_loader.weight_utils import (
+            download_weights_from_hf_specific,
+        )
+
+        model_path = download_weights_from_hf_specific(model_path, None, ["*"])
+
+    # VaeImageProcessor expects the spatial downsampling ratio (e.g. 8), not the
+    # latent scaling_factor (e.g. 0.13025); it is 2^(len(block_out_channels) - 1).
+    vae_spatial_factor = 8  # default for AutoencoderKL (2^3 downsampling)
+    vae_config_path = os.path.join(model_path, "vae/config.json")
+    if os.path.exists(vae_config_path):
+        with open(vae_config_path) as f:
+            block_channels = json.load(f).get("block_out_channels")
+        if block_channels:
+            vae_spatial_factor = 2 ** (len(block_channels) - 1)
+
+    image_processor = VaeImageProcessor(vae_scale_factor=vae_spatial_factor)
+
+    def post_process_func(images: torch.Tensor):
+        """Convert tensor images to PIL images."""
+        return image_processor.postprocess(images)
+
+    return post_process_func
+
+
+class HyperCLOVAXVisionPipeline(nn.Module):
+    """
+    HyperCLOVAX Vision Pipeline for vision token to image generation.
+
+    This pipeline:
+    1. Embeds vision tokens using VisionTokenEmbedder
+    2. Runs flow matching diffusion with VisionTransformer
+    3. Decodes latents to images using VAE
+    4. Optionally applies autoguidance with transformer2
+
+    Args:
+        od_config: OmniDiffusionConfig containing model configuration
+        prefix: Prefix for weight loading (default: "")
+    """
+
+    @staticmethod
+    def get_dummy_extra() -> dict:
+        """Return the ``extra`` payload for the warmup dummy run: one all-zero row of vision tokens."""
+        # token_length=729, vocab_size=65536 per token_embedder/config.json
+        # NOTE(review): 729 is hard-coded here; confirm it tracks the configured token_length.
+        dummy_tokens = np.zeros((1, 729), dtype=np.int64)
+        return {"vision_tokens": dummy_tokens}
+
+    def __init__(
+        self,
+        *,
+        od_config: OmniDiffusionConfig,
+        prefix: str = "",
+    ):
+        """Load scheduler and VAE, build the embedder/transformer(s), and register weight sources."""
+        super().__init__()
+        self.od_config = od_config
+        self.device = get_local_device()
+
+        model = od_config.model
+        local_files_only = os.path.exists(model)
+
+        def _load_component_config(subfolder: str) -> dict:
+            """Read ``<subfolder>/config.json`` locally or from the Hub; empty dict when absent."""
+            if os.path.isdir(model):
+                cfg_path = os.path.join(model, subfolder, "config.json")
+                if os.path.exists(cfg_path):
+                    with open(cfg_path) as f:
+                        return json.load(f)
+                return {}
+            return get_hf_file_to_dict(f"{subfolder}/config.json", model) or {}
+
+        def _build_transformer_kwargs(cfg: dict) -> dict:
+            """Map a transformer config dict to constructor kwargs, filling in defaults."""
+            return {
+                "in_channels": cfg.get("in_channels", 16),
+                "vec_in_dim": cfg.get("vec_in_dim", 1536),
+                "context_in_dim": cfg.get("context_in_dim", 1536),
+                "hidden_size": cfg.get("hidden_size", 1920),
+                "mlp_ratio": cfg.get("mlp_ratio", 4.0),
+                "num_heads": cfg.get("num_heads", 24),
+                "depth_single_blocks": cfg.get("depth_single_blocks", 35),
+                "axes_dim": tuple(cfg.get("axes_dim", [8, 36, 36])),
+                "theta": cfg.get("theta", 10000),
+                "use_patchify": cfg.get("use_patchify", False),
+            }
+
+        transformer_cfg = _load_component_config("transformer")
+
+        # 1. Load scheduler
+        self.scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
+            model, subfolder="scheduler", local_files_only=local_files_only
+        )
+
+        # 2. Load VAE
+        self.vae = AutoencoderKL.from_pretrained(model, subfolder="vae", local_files_only=local_files_only).to(
+            self.device
+        )
+
+        # 3. Initialize token embedder — read dims from token_embedder/config.json
+        token_embedder_cfg = _load_component_config("token_embedder")
+        self.token_embedder = VisionTokenEmbedder(
+            vocab_size=token_embedder_cfg.get("vocab_size", 65536),
+            embedding_dim=token_embedder_cfg.get("embedding_dim", 1536),
+            token_length=token_embedder_cfg.get("token_length", 729),
+        )
+
+        # 4. Initialize transformer
+        self.transformer = HyperCLOVAXVisionTransformer2DModel(
+            od_config=od_config,
+            **_build_transformer_kwargs(transformer_cfg),
+        )
+
+        # 5. Initialize transformer2 for autoguidance (if available)
+        transformer2_exists = False
+        if os.path.isdir(model):
+            # Local path: check filesystem
+            transformer2_exists = os.path.exists(os.path.join(model, "transformer2"))
+        else:
+            # Remote HF repo: check if transformer2 subfolder exists
+            try:
+                from huggingface_hub import HfFileSystem
+
+                transformer2_exists = HfFileSystem().exists(f"{model}/transformer2")
+            except Exception as exc:
+                # Best-effort probe: keep autoguidance off, but report why instead of hiding it.
+                logger.warning("Could not check %s for transformer2, autoguidance disabled: %s", model, exc)
+
+        if transformer2_exists:
+            transformer2_cfg = _load_component_config("transformer2")
+            if not transformer2_cfg:
+                transformer2_cfg = transformer_cfg
+            self.transformer2 = HyperCLOVAXVisionTransformer2DModel(
+                od_config=od_config,
+                **_build_transformer_kwargs(transformer2_cfg),
+            )
+        else:
+            self.transformer2 = None
+
+        # Weight sources for vLLM loader
+        self.weights_sources = [
+            DiffusersPipelineLoader.ComponentSource(
+                model_or_path=od_config.model,
+                subfolder="transformer",
+                revision=None,
+                prefix="transformer.",
+                fall_back_to_pt=True,
+            ),
+            DiffusersPipelineLoader.ComponentSource(
+                model_or_path=od_config.model,
+                subfolder="token_embedder",
+                revision=None,
+                prefix="token_embedder.",
+                fall_back_to_pt=True,
+            ),
+        ]
+
+        # Add transformer2 weights if available
+        if self.transformer2 is not None:
+            self.weights_sources.append(
+                DiffusersPipelineLoader.ComponentSource(
+                    model_or_path=od_config.model,
+                    subfolder="transformer2",
+                    revision=None,
+                    prefix="transformer2.",
+                    fall_back_to_pt=True,
+                )
+            )
+
+        # VAE configuration: spatial factor follows the VAE depth (8 for four blocks), not a constant
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.vae_scaling_factor = getattr(self.vae.config, "scaling_factor", 1.0)
+        self.vae_shift_factor = getattr(self.vae.config, "shift_factor", 0.0)
+
+        # Apply USP parallelization if configured
+        if od_config.parallel_config.sequence_parallel_size > 1:
+            from .transformer_usp import XDIT_AVAILABLE, parallelize_transformer
+            # parallelize_transformer is a silent no-op without xDiT, so gate the success log on the flag
+            if XDIT_AVAILABLE:
+                self.transformer = parallelize_transformer(self.transformer)
+                if self.transformer2 is not None:
+                    self.transformer2 = parallelize_transformer(self.transformer2)
+                logger.info("USP parallelization applied successfully")
+            else:
+                logger.warning("xDiT not available, skipping USP parallelization")
+
+        self.to(self.device)
+
+    def _prepare_latents(
+        self,
+        batch_size: int,
+        height: int,
+        width: int,
+        dtype: torch.dtype | None = None,
+        generator: torch.Generator | None = None,
+    ) -> torch.Tensor:
+        """Sample Gaussian noise of shape (batch, 16, height // f, width // f), f = ``vae_scale_factor``."""
+        if not dtype:
+            dtype = self.od_config.dtype
+        # The channel count is fixed at 16 here (VAE has 16 latent channels).
+        shape = (
+            batch_size,
+            16,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+        return torch.randn(shape, device=self.device, dtype=dtype, generator=generator)
+
+    def _prepare_img_ids(
+        self,
+        batch_size: int,
+        img_h: int,
+        img_w: int,
+    ) -> torch.Tensor:
+        """Return float32 (batch, img_h * img_w, 3) position ids: channel 0 is zero, 1 = row, 2 = column."""
+        img_ids = torch.zeros(img_h, img_w, 3)
+        img_ids[..., 1] = torch.arange(img_h)[:, None]
+        img_ids[..., 2] = torch.arange(img_w)[None, :]
+        # Stay in float32: bf16/fp16 cannot represent every integer index (bf16 is exact only up to 256).
+        return repeat(img_ids, "h w c -> b (h w) c", b=batch_size).to(device=self.device, dtype=torch.float32)
+
+    def _prepare_vision_spatial(
+        self,
+        vision_hidden: torch.Tensor,
+        img_h: int,
+        img_w: int,
+    ) -> torch.Tensor:
+        """
+        Resample vision features onto the latent grid.
+
+        Args:
+            vision_hidden: (B, L, C) features; L is treated as a square grid (e.g. 729 = 27 x 27)
+            img_h: Target grid height
+            img_w: Target grid width
+
+        Returns:
+            (B, img_h * img_w, C) features after bilinear interpolation
+        """
+        side = int(vision_hidden.shape[1] ** 0.5)  # tokens per grid edge
+
+        # (B, L, C) -> (B, C, side, side) so F.interpolate sees a 2-D feature map
+        grid = rearrange(vision_hidden, "b (h w) c -> b c h w", h=side, w=side)
+        resized = F.interpolate(grid, size=(img_h, img_w), mode="bilinear", align_corners=False)
+
+        return rearrange(resized, "b c h w -> b (h w) c")  # back to a row-major token sequence
+
+    def _decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
+        """Rescale latents (divide by the scaling factor, add the shift factor) and decode them with the VAE."""
+        # Both factors were read from the VAE config in __init__ (defaults 1.0 and 0.0).
+        unscaled = latents / self.vae_scaling_factor + self.vae_shift_factor
+        return self.vae.decode(unscaled).sample
+
+    def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput:
+        """
+        Generate images from vision tokens.
+
+        Args:
+            req: OmniDiffusionRequest providing:
+                - extra["vision_tokens"]: Vision token IDs (B, L) or (L,)
+                - sampling_params.height: Output image height (default: 768)
+                - sampling_params.width: Output image width (default: 768)
+                - sampling_params.num_inference_steps: Number of diffusion steps (default: 50)
+                - sampling_params.guidance_scale: Autoguidance scale (default: 0.0)
+                - sampling_params.seed / generator: Random seed or generator (optional)
+
+        Returns:
+            DiffusionOutput with the decoded image tensor, or with ``error`` set when tokens are missing
+        """
+        # Extract vision tokens from request
+        vision_tokens = req.extra.get("vision_tokens")
+        if vision_tokens is None:
+            return DiffusionOutput(output=None, error="vision_tokens required in req.extra")
+
+        # Convert to tensor if needed
+        if isinstance(vision_tokens, list):
+            vision_tokens = torch.tensor(vision_tokens, dtype=torch.long)
+        elif isinstance(vision_tokens, np.ndarray):
+            vision_tokens = torch.from_numpy(vision_tokens).long()
+
+        if vision_tokens.ndim == 1:
+            vision_tokens = vision_tokens.unsqueeze(0)
+
+        vision_tokens = vision_tokens.to(self.device)
+        batch_size = vision_tokens.shape[0]
+
+        # Get parameters from request sampling_params
+        sp = req.sampling_params
+        height = sp.height if sp.height else 768
+        width = sp.width if sp.width else 768
+        num_steps = sp.num_inference_steps if sp.num_inference_steps else 50
+        guidance_scale = sp.guidance_scale if sp.guidance_scale else 0.0
+
+        # Setup generator for reproducibility
+        generator = sp.generator
+        if generator is None and sp.seed is not None:
+            generator = torch.Generator(device=self.device).manual_seed(sp.seed)
+
+        dtype = self.od_config.dtype
+
+        # 1. Vision Token Embedding
+        vision_cond = self.token_embedder(vision_tokens)
+        vision_hidden = vision_cond["vision_last_hidden_state"].to(dtype)
+        vision_pooler = vision_cond["vision_pooler_output"].to(dtype)
+
+        # 2. Prepare latents
+        latents = self._prepare_latents(batch_size, height, width, dtype=dtype, generator=generator)
+
+        # 3. Prepare position IDs
+        img_h = height // self.vae_scale_factor
+        img_w = width // self.vae_scale_factor
+        img_ids = self._prepare_img_ids(batch_size, img_h, img_w)
+
+        # 4. Prepare vision spatial features
+        vision_spatial = self._prepare_vision_spatial(vision_hidden, img_h, img_w)
+
+        # 5. Set timesteps
+        self.scheduler.set_timesteps(num_steps, device=self.device)
+
+        # Determine if using autoguidance
+        use_autoguidance = self.transformer2 is not None and guidance_scale > 0
+
+        # 6. Denoising loop
+        for i, t in enumerate(self.scheduler.timesteps):
+            # Prepare input: concatenate latents with vision spatial
+            if self.transformer.use_patchify:
+                x_t = rearrange(latents, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
+            else:
+                x_t = rearrange(latents, "b c h w -> b (h w) c")
+
+            x_t = torch.cat((x_t, vision_spatial), dim=2)
+
+            # The transformer expects sigma in [0, 1]. In diffusers, scheduler.timesteps is sigma scaled
+            # by num_train_timesteps, so read scheduler.sigmas[i]; the two agree when that factor is 1.
+            sigma_value = float(self.scheduler.sigmas[i])
+            sigma = torch.full((batch_size,), sigma_value, device=self.device, dtype=torch.float32)
+
+            # Forward pass
+            pred = self.transformer(
+                img=x_t,
+                img_ids=img_ids,
+                timesteps=sigma,
+                y=vision_pooler,
+            )
+
+            # Apply autoguidance
+            if use_autoguidance:
+                pred2 = self.transformer2(
+                    img=x_t,
+                    img_ids=img_ids,
+                    timesteps=sigma,
+                    y=vision_pooler,
+                )
+                pred = pred + guidance_scale * (pred - pred2)
+
+            # Unpatchify prediction
+            if self.transformer.use_patchify:
+                pred = rearrange(
+                    pred,
+                    "b (h w) (c ph pw) -> b c (h ph) (w pw)",
+                    h=img_h // 2,
+                    w=img_w // 2,
+                    ph=2,
+                    pw=2,
+                )
+            else:
+                pred = rearrange(pred, "b (h w) c -> b c h w", h=img_h, w=img_w)
+
+            # Scheduler step
+            latents = self.scheduler.step(pred, t, latents, generator=generator).prev_sample
+
+        # 7. Decode latents
+        images = self._decode_latents(latents)
+
+        return DiffusionOutput(output=images)
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        """Delegate loading of (name, tensor) pairs to AutoWeightsLoader and return its result."""
+        # The loader is constructed around this whole pipeline module.
+        return AutoWeightsLoader(self).load_weights(weights)
diff --git a/vllm_omni/diffusion/models/hyperclovax_vision/transformer_usp.py b/vllm_omni/diffusion/models/hyperclovax_vision/transformer_usp.py
new file mode 100644
index 00000000000..c9bbc1b8375
--- /dev/null
+++ b/vllm_omni/diffusion/models/hyperclovax_vision/transformer_usp.py
@@ -0,0 +1,307 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adapted from NAVER Cloud Corp. vision-decoder-api
+
+"""
+VisionTransformer USP Wrapper for xDiT Integration.
+
+This module provides Unified Sequence Parallelism (USP) support for the
+HyperCLOVAX VisionTransformer used in vision token to image generation.
+
+USP enables multi-GPU acceleration by:
+- Splitting input sequences across GPUs (Ulysses parallelism)
+- Using ring attention patterns for long sequences
+- Efficiently gathering outputs after processing
+"""
+
+import functools
+
+import torch
+import torch.nn as nn
+from einops import rearrange
+
+from .layers import timestep_embedding
+
+# xDiT imports
+try:
+    from xfuser.core.distributed import (
+        get_sequence_parallel_rank,
+        get_sequence_parallel_world_size,
+        get_sp_group,
+    )
+    from xfuser.model_executor.layers.usp import USP
+
+    XDIT_AVAILABLE = True
+except ImportError:
+    XDIT_AVAILABLE = False
+
+
+def split_sequence(tensor: torch.Tensor, dim: int = 1) -> torch.Tensor:
+    """
+    Return this rank's contiguous slice of ``tensor`` along ``dim``.
+
+    Args:
+        tensor: Input tensor to split
+        dim: Dimension to split along (default: 1 for sequence dim)
+
+    Returns:
+        The local chunk, or ``tensor`` unchanged when xDiT is missing or SP world size <= 1
+    Raises:
+        ValueError: If ``tensor.shape[dim]`` is not divisible by the SP world size
+    """
+    world_size = get_sequence_parallel_world_size() if XDIT_AVAILABLE else 1
+    if world_size <= 1:
+        return tensor
+    if tensor.shape[dim] % world_size:  # torch.chunk would give uneven (or too few) shards
+        raise ValueError(f"Sequence length {tensor.shape[dim]} is not divisible by SP world size {world_size}")
+    return torch.chunk(tensor, world_size, dim=dim)[get_sequence_parallel_rank()].contiguous()
+
+
+def gather_sequence(tensor: torch.Tensor, dim: int = 1) -> torch.Tensor:
+    """
+    Reassemble a sequence-sharded tensor by all-gathering along ``dim``.
+
+    Args:
+        tensor: Local tensor chunk
+        dim: Dimension to gather along (default: 1 for sequence dim)
+
+    Returns:
+        The gathered tensor, or ``tensor`` itself when xDiT is unavailable or SP world size <= 1
+    """
+    sequence_parallel = XDIT_AVAILABLE and get_sequence_parallel_world_size() > 1
+    if sequence_parallel:
+        return get_sp_group().all_gather(tensor.contiguous(), dim=dim)
+    return tensor
+
+
+def split_rope_embedding(pe: torch.Tensor, seq_len: int) -> torch.Tensor:
+    """
+    Return this rank's slice of a RoPE table along its sequence axis.
+
+    The table is expected as (B, 1, L, D, 2, 2), i.e. the EmbedAND output, so the
+    sequence axis is dim 2.
+
+    Args:
+        pe: Position embedding tensor
+        seq_len: Original sequence length (accepted but not read by this function)
+
+    Returns:
+        Local chunk of position embeddings, or ``pe`` unchanged without sequence parallelism
+    """
+    if not XDIT_AVAILABLE:
+        return pe
+
+    world_size = get_sequence_parallel_world_size()
+    if world_size <= 1:
+        return pe
+    rank = get_sequence_parallel_rank()
+
+    # torch.chunk along dim 2 (sequence), then keep the piece indexed by this rank
+    shards = torch.chunk(pe, world_size, dim=2)
+    return shards[rank].contiguous()
+
+
+def apply_rope_usp(
+    xq: torch.Tensor,
+    xk: torch.Tensor,
+    freqs_cis: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Rotate query/key tensors with precomputed RoPE matrices (USP code path).
+
+    Args:
+        xq: Query tensor (B, H, L, D)
+        xk: Key tensor (B, H, L, D)
+        freqs_cis: RoPE frequencies (B, 1, L, D//2, 2, 2)
+
+    Returns:
+        Tuple of rotated query and key tensors, each with its input's shape and dtype
+    """
+    # Slices of the trailing axis of every 2x2 entry in the table.
+    # With the (B, 1, ...) layout they broadcast over the head dimension.
+    col0, col1 = freqs_cis[..., 0], freqs_cis[..., 1]
+
+    def _rotate(x: torch.Tensor) -> torch.Tensor:
+        # (B, H, L, D) -> (B, H, L, D//2, 1, 2): channel pairs, computed in float32
+        pairs = x.float().reshape(*x.shape[:-1], -1, 1, 2)
+        # out = col0 * first element of the pair + col1 * second element
+        mixed = col0 * pairs[..., 0] + col1 * pairs[..., 1]
+        return mixed.reshape(*x.shape).type_as(x)
+
+    rotated_q = _rotate(xq)
+    rotated_k = _rotate(xk)
+    return rotated_q, rotated_k
+
+
+def parallelize_transformer(transformer: nn.Module) -> nn.Module:
+    """
+    Patch a VisionTransformer instance in place for sequence parallelism.
+
+    The instance's ``forward`` is replaced by a wrapper that:
+    1. Splits input sequences across GPUs
+    2. Runs the blocks with USP attention
+    3. Gathers outputs after processing
+
+    Args:
+        transformer: HyperCLOVAXVisionTransformer2DModel instance
+
+    Returns:
+        The same instance with USP support; returned untouched when xDiT is not importable
+    """
+    if not XDIT_AVAILABLE:
+        return transformer
+
+    original_forward = transformer.forward
+
+    @functools.wraps(transformer.__class__.forward)
+    def usp_forward(
+        self,
+        img: torch.Tensor,
+        img_ids: torch.Tensor,
+        timesteps: torch.Tensor,
+        y: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        USP-enabled forward pass.
+
+        Args:
+            img: Input tensor (B, L, in_channels + context_in_dim)
+            img_ids: Position IDs tensor (B, L, 3)
+            timesteps: Sigma/timestep tensor (B,)
+            y: Vision pooler output tensor (B, vec_in_dim)
+
+        Returns:
+            Output tensor (B, L, out_channels)
+        """
+        sp_world_size = get_sequence_parallel_world_size()
+
+        if sp_world_size <= 1:
+            # No sequence parallelism at call time: defer to the forward captured above
+            return original_forward(img, img_ids, timesteps, y)
+
+        # Split tokens and their position ids along dim 1; timesteps and y are passed through whole
+        img_local = split_sequence(img, dim=1)
+        img_ids_local = split_sequence(img_ids, dim=1)
+
+        # Run transformer with local sequences
+        output_local = _usp_transformer_forward(self, img_local, img_ids_local, timesteps, y)
+
+        # Gather output from all ranks
+        output = gather_sequence(output_local, dim=1)
+
+        return output
+
+    # Bind the wrapper to this instance and install it as an instance attribute
+    usp_forward = usp_forward.__get__(transformer)
+    transformer.forward = usp_forward
+
+    # Also rebind forward on each single block
+    _parallelize_attention_blocks(transformer)
+
+    return transformer
+
+
+def _usp_transformer_forward(
+    transformer: nn.Module,
+    img: torch.Tensor,
+    img_ids: torch.Tensor,
+    timesteps: torch.Tensor,
+    y: torch.Tensor,
+) -> torch.Tensor:
+    """
+    Transformer forward over a local sequence shard.
+
+    Mirrors the model's own forward, but routes every block through
+    ``_usp_single_block_forward`` so attention runs sequence-parallel.
+
+    ``img`` and ``img_ids`` are whatever the caller passes (already split in
+    ``usp_forward``); ``timesteps`` and ``y`` are used as given.
+    Returns the final layer's output for those tokens.
+    """
+    if img.ndim != 3:
+        raise ValueError("Input img tensor must have 3 dimensions.")
+
+    # Project the local tokens to the hidden size
+    tokens = transformer.img_in(img)
+
+    # Time and vector embedding (no splitting needed - these are per-sample)
+    embed_dtype = transformer.time_in.in_layer.weight.dtype
+    t_feat = timestep_embedding(timesteps, 256).to(dtype=embed_dtype, device=tokens.device)
+    cond = transformer.time_in(t_feat) + transformer.vector_in(y)
+
+    # RoPE table for the position ids handed in (the local ones under USP)
+    rotary = transformer.pe_embedder(img_ids)
+
+    for block in transformer.single_blocks:
+        tokens = _usp_single_block_forward(block, tokens, cond, rotary)
+
+    # Final projection; gathering across ranks is left to the caller
+    return transformer.final_layer(tokens, cond)
+
+
+def _usp_single_block_forward(
+    block: nn.Module,
+    x: torch.Tensor,
+    vec: torch.Tensor,
+    pe: torch.Tensor,
+) -> torch.Tensor:
+    """
+    Run one SingleStreamBlock with sequence-parallel attention.
+
+    Same computation as the block's own forward, except that attention goes
+    through xfuser's ``USP`` when sequence parallelism is active.
+
+    Args:
+        block: SingleStreamBlock whose submodules are used
+        x: Hidden states for the tokens held by this rank
+        vec: Conditioning vector fed to the block's modulation
+        pe: RoPE table matching ``x``'s positions
+    """
+    mod = block.modulation(vec)[0]
+    normed = block.pre_norm(x)
+    fused = block.linear1((1 + mod.scale) * normed + mod.shift)
+    qkv, mlp = fused.split([3 * block.hidden_size, block.mlp_hidden_dim], dim=-1)
+
+    q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=block.num_heads)
+    q, k = block.norm(q, k, v)
+
+    use_usp = XDIT_AVAILABLE and get_sequence_parallel_world_size() > 1
+    if not use_usp:
+        # Standard attention with RoPE
+        from .layers import attention
+
+        attn = attention(q, k, v, pe=pe)
+    else:
+        # RoPE is applied to the local q/k first; USP handles the cross-GPU
+        # communication internally and its output is laid out as (B, H, L, D)
+        q, k = apply_rope_usp(q, k, pe)
+        heads_out = USP(q, k, v, dropout_p=0.0, is_causal=False)
+        attn = rearrange(heads_out, "B H L D -> B L (H D)")
+
+    merged = torch.cat((attn, block.mlp_act(mlp)), 2)
+    return x + mod.gate * block.linear2(merged)
+
+
+def _parallelize_attention_blocks(transformer: nn.Module) -> None:
+    """
+    Rebind ``forward`` on each single block to the USP-aware implementation.
+
+    Blocks are patched in place; the previous bound forward is kept on
+    ``_original_forward`` and ``_usp_enabled`` is set to True.
+    """
+    if not hasattr(transformer, "single_blocks"):
+        return
+
+    def _bind_usp_forward(blk: nn.Module):
+        # Built per block so functools.wraps copies metadata from that block's class
+        @functools.wraps(blk.__class__.forward)
+        def usp_block_forward(self, x, vec, pe):
+            return _usp_single_block_forward(self, x, vec, pe)
+
+        return usp_block_forward.__get__(blk)
+
+    # Record the old forward first, then install the bound replacement
+    for block in transformer.single_blocks:
+        block._usp_enabled = True
+        block._original_forward = block.forward
+        block.forward = _bind_usp_forward(block)
diff --git a/vllm_omni/diffusion/models/hyperclovax_vision/vision_token_embedder.py b/vllm_omni/diffusion/models/hyperclovax_vision/vision_token_embedder.py
new file mode 100644
index 00000000000..ff5bc052b3c
--- /dev/null
+++ b/vllm_omni/diffusion/models/hyperclovax_vision/vision_token_embedder.py
@@ -0,0 +1,119 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adapted from NAVER Cloud Corp. vision-decoder-api
+
+"""
+Vision Token Embedder for HyperCLOVAX Vision Decoder.
+
+Converts discrete vision tokens to continuous embeddings.
+"""
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+
+class VisionTokenEmbedder(nn.Module):
+    """
+    Vision Token Embedder that converts discrete vision tokens to embeddings.
+
+    This module embeds vision tokens (discrete vocabulary indices) into
+    continuous vector representations for the VisionTransformer.
+
+    Args:
+        vocab_size: Size of the vision token vocabulary (default: 65536)
+        embedding_dim: Dimension of the embedding vectors (default: 1536)
+        token_length: Number of tokens per image (default: 729 for 27x27); stored only, never enforced here
+    """
+
+    def __init__(
+        self,
+        vocab_size: int = 65536,
+        embedding_dim: int = 1536,
+        token_length: int = 729,
+    ):
+        super().__init__()
+        self.vocab_size = vocab_size
+        self.embedding_dim = embedding_dim
+        self.token_length = token_length
+
+        # Main vocabulary table, zero-initialised until real weights are loaded
+        self.vocab_embeddings = nn.Parameter(torch.zeros(vocab_size, embedding_dim))
+
+        # Single unconditional vector used for classifier-free guidance
+        self.uncond_embedding = nn.Parameter(torch.zeros(1, embedding_dim))
+
+    def load_vocab_embeddings(self, embeddings: torch.Tensor) -> None:
+        """Copy ``embeddings`` into the vocabulary table; raises ValueError on shape mismatch."""
+        if embeddings.shape != (self.vocab_size, self.embedding_dim):
+            raise ValueError(
+                f"Expected embeddings shape ({self.vocab_size}, {self.embedding_dim}), got {embeddings.shape}"
+            )
+        with torch.no_grad():
+            self.vocab_embeddings.copy_(embeddings)
+
+    def forward(self, tokens: torch.Tensor) -> dict[str, torch.Tensor]:
+        """
+        Convert vision tokens to embeddings.
+
+        Args:
+            tokens: Vision token IDs (B, L) where L is typically 729
+
+        Returns:
+            Dictionary with:
+                - vision_last_hidden_state: (B, L, embedding_dim)
+                - vision_pooler_output: (B, embedding_dim) - mean pooled
+        """
+        # Each token id selects one row of the vocabulary table
+        hidden_states = self.vocab_embeddings[tokens]
+
+        # Pooler output is the mean over the token axis (dim=1)
+        pooler_output = hidden_states.mean(dim=1)
+
+        return {
+            "vision_last_hidden_state": hidden_states,
+            "vision_pooler_output": pooler_output,
+        }
+
+    def get_uncond_embeddings(self, batch_size: int, token_length: int) -> dict[str, torch.Tensor]:
+        """
+        Get unconditional embeddings for classifier-free guidance.
+
+        Args:
+            batch_size: Batch size
+            token_length: Number of tokens per sample
+
+        Returns:
+            Dictionary with the same two keys as ``forward``, built from the unconditional vector
+        """
+        uncond_hidden = self.uncond_embedding.expand(batch_size, token_length, -1)
+        uncond_pooler = uncond_hidden.mean(dim=1)
+
+        return {
+            "vision_last_hidden_state": uncond_hidden,
+            "vision_pooler_output": uncond_pooler,
+        }
+
+    @classmethod
+    def from_numpy(cls, npy_path: str, token_length: int = 729) -> "VisionTokenEmbedder":
+        """
+        Create embedder from numpy file.
+
+        Args:
+            npy_path: Path to .npy file containing embeddings
+            token_length: Number of tokens per image (inferred from model architecture;
+                default 729 = 27×27 for HyperCLOVAX-SEED-Omni-8B)
+
+        Returns:
+            VisionTokenEmbedder instance with loaded embeddings
+
+        Raises:
+            ValueError: If the stored array is not 2-D (vocab_size, embedding_dim)
+        """
+        embeddings = torch.from_numpy(np.load(npy_path)).float()
+        if embeddings.ndim != 2:
+            raise ValueError(f"Expected a 2-D embedding array in {npy_path}, got shape {tuple(embeddings.shape)}")
+        vocab_size, embedding_dim = embeddings.shape
+        embedder = cls(vocab_size=vocab_size, embedding_dim=embedding_dim, token_length=token_length)
+        embedder.load_vocab_embeddings(embeddings)
+        return embedder
diff --git a/vllm_omni/diffusion/registry.py b/vllm_omni/diffusion/registry.py
index 0bf8c04517b..d5dd17ce2e8 100644
--- a/vllm_omni/diffusion/registry.py
+++ b/vllm_omni/diffusion/registry.py
@@ -8,11 +8,8 @@
 from vllm.model_executor.models.registry import _LazyRegisteredModel, _ModelRegistry
 
 from vllm_omni.diffusion.data import OmniDiffusionConfig
-from vllm_omni.diffusion.distributed.autoencoders.distributed_vae_executor import DistributedVaeMixin
 from vllm_omni.diffusion.distributed.sp_plan import SequenceParallelConfig, get_sp_plan_from_model
-from vllm_omni.diffusion.forward_context import get_forward_context
 from vllm_omni.diffusion.hooks.sequence_parallel import apply_sequence_parallel
-from vllm_omni.diffusion.utils.tf_utils import find_module_with_attr
 
 logger = init_logger(__name__)
 
@@ -58,31 +55,6 @@
         "pipeline_wan2_2",
         "Wan22Pipeline",
     ),
-    "WanVACEPipeline": (
-        "wan2_2",
-        "pipeline_wan2_2_vace",
-        "Wan22VACEPipeline",
-    ),
-    "LTX2Pipeline": (
-        "ltx2",
-        "pipeline_ltx2",
-        "LTX2Pipeline",
-    ),
-    "LTX2ImageToVideoPipeline": (
-        "ltx2",
-        "pipeline_ltx2_image2video",
-        "LTX2ImageToVideoPipeline",
-    ),
-    "LTX2TwoStagesPipeline": (
-        "ltx2",
-        "pipeline_ltx2",
-        "LTX2TwoStagesPipeline",
-    ),
-    "LTX2ImageToVideoTwoStagesPipeline": (
-        "ltx2",
-        "pipeline_ltx2_image2video",
-        "LTX2ImageToVideoTwoStagesPipeline",
-    ),
     "StableAudioPipeline": (
         "stable_audio",
         "pipeline_stable_audio",
@@ -113,11 +85,6 @@
         "pipeline_sd3",
         "StableDiffusion3Pipeline",
     ),
-    "FluxKontextPipeline": (
-        "flux",
-        "pipeline_flux_kontext",
-        "FluxKontextPipeline",
-    ),
     "HunyuanImage3ForCausalMM": (
         "hunyuan_image3",
         "pipeline_hunyuan_image3",
@@ -143,50 +110,10 @@
         "pipeline_omnigen2",
         "OmniGen2Pipeline",
     ),
-    "HeliosPipeline": (
-        "helios",
-        "pipeline_helios",
-        "HeliosPipeline",
-    ),
-    "HeliosPyramidPipeline": (
-        "helios",
-        "pipeline_helios",
-        "HeliosPipeline",
-    ),
-    "Flux2Pipeline": (
-        "flux2",
-        "pipeline_flux2",
-        "Flux2Pipeline",
-    ),
-    "DreamIDOmniPipeline": (
-        "dreamid_omni",
-        "pipeline_dreamid_omni",
-        "DreamIDOmniPipeline",
-    ),
-    "HunyuanVideo15Pipeline": (
-        "hunyuan_video",
-        "pipeline_hunyuan_video_1_5",
-        "HunyuanVideo15Pipeline",
-    ),
-    "HunyuanVideo15ImageToVideoPipeline": (
-        "hunyuan_video",
-        "pipeline_hunyuan_video_1_5_i2v",
-        "HunyuanVideo15I2VPipeline",
-    ),
-    "MagiHumanPipeline": (
-        "magi_human",
-        "pipeline_magi_human",
-        "MagiHumanPipeline",
-    ),
-    "OmniVoicePipeline": (
-        "omnivoice",
-        "pipeline_omnivoice",
-        "OmniVoicePipeline",
-    ),
-    "OmniVoice": (
-        "omnivoice",
-        "pipeline_omnivoice",
-        "OmniVoicePipeline",
+    "HyperCLOVAXVisionPipeline": (
+        "hyperclovax_vision",
+        "pipeline_hyperclovax_vision",
+        "HyperCLOVAXVisionPipeline",
     ),
 }
 
@@ -201,6 +128,13 @@
     }
 )
 
+_VAE_PATCH_PARALLEL_ALLOWLIST = {
+    # Only enable for models we have validated end-to-end.
+    "StableDiffusion3Pipeline",
+    "ZImagePipeline",
+    "NextStep11Pipeline",
+}
+
 _NO_CACHE_ACCELERATION = {
     # Pipelines that do not support cache acceleration (cache_dit / tea_cache).
     "NextStep11Pipeline",
@@ -232,14 +166,17 @@ def initialize_model(
         model = model_class(od_config=od_config)
 
         vae_pp_size = od_config.parallel_config.vae_patch_parallel_size
-        is_distributed_vae = hasattr(model, "vae") and isinstance(model.vae, DistributedVaeMixin)
-        if vae_pp_size > 1 and not is_distributed_vae:
+        if vae_pp_size > 1 and od_config.model_class_name not in _VAE_PATCH_PARALLEL_ALLOWLIST:
             logger.warning(
-                "vae_patch_parallel_size=%d is set but VAE patch parallelism is NOT enabled for %s; ignoring.",
+                "vae_patch_parallel_size=%d is set but VAE patch parallelism is only enabled for %s; ignoring.",
                 vae_pp_size,
-                od_config.model_class_name,
+                sorted(_VAE_PATCH_PARALLEL_ALLOWLIST),
             )
-        if vae_pp_size > 1 and is_distributed_vae and not od_config.vae_use_tiling:
+        if (
+            vae_pp_size > 1
+            and od_config.model_class_name in _VAE_PATCH_PARALLEL_ALLOWLIST
+            and not od_config.vae_use_tiling
+        ):
             logger.info(
                 "vae_patch_parallel_size=%d requires vae_use_tiling; automatically enabling it.",
                 vae_pp_size,
@@ -252,8 +189,20 @@ def initialize_model(
         if hasattr(model, "vae") and hasattr(model.vae, "use_tiling"):
             model.vae.use_tiling = od_config.vae_use_tiling
 
-        if is_distributed_vae:
-            model.vae.set_parallel_size(vae_pp_size)
+        if (
+            vae_pp_size > 1
+            and hasattr(model, "vae")
+            and od_config.model_class_name in _VAE_PATCH_PARALLEL_ALLOWLIST
+            and od_config.vae_use_tiling
+        ):
+            from vllm_omni.diffusion.distributed.parallel_state import get_dit_group
+            from vllm_omni.diffusion.distributed.vae_patch_parallel import maybe_wrap_vae_decode_with_patch_parallelism
+
+            maybe_wrap_vae_decode_with_patch_parallelism(
+                model,
+                vae_patch_parallel_size=vae_pp_size,
+                group_getter=get_dit_group,
+            )
 
         # Apply sequence parallelism if enabled
         # This follows diffusers' pattern where enable_parallelism() is called
@@ -292,12 +241,7 @@ def _apply_sequence_parallel_if_enabled(model, od_config: OmniDiffusionConfig) -
 
         for attr in transformer_attrs:
             if not hasattr(model, attr):
-                # Some pipeline like LTX2TwoStagesPipeline have recursive
-                # modules that have the transformer
-                module = find_module_with_attr(model, attr)
-                if module is None:
-                    continue
-                model = module
+                continue
 
             transformer = getattr(model, attr)
             if transformer is None:
@@ -326,11 +270,6 @@ def _apply_sequence_parallel_if_enabled(model, od_config: OmniDiffusionConfig) -
             apply_sequence_parallel(transformer, sp_config, plan)
             applied_count += 1
 
-        # update forward context sp_plan_hooks_applied
-        ctx = get_forward_context()
-        ctx.sp_plan_hooks_applied = applied_count > 0
-        logger.debug(f"Setting sp_plan_hooks_applied={ctx.sp_plan_hooks_applied} in ``ForwardContext``!")
-
         if applied_count == 0:
             logger.warning(
                 f"Sequence parallelism is enabled (sp_size={sp_size}) but no transformer with _sp_plan found. "
@@ -352,18 +291,12 @@ def _apply_sequence_parallel_if_enabled(model, od_config: OmniDiffusionConfig) -
     "ZImagePipeline": "get_post_process_func",
     "OvisImagePipeline": "get_ovis_image_post_process_func",
     "WanPipeline": "get_wan22_post_process_func",
-    "WanVACEPipeline": "get_wan22_vace_post_process_func",
-    "LTX2Pipeline": "get_ltx2_post_process_func",
-    "LTX2TwoStagesPipeline": "get_ltx2_post_process_func",
-    "LTX2ImageToVideoPipeline": "get_ltx2_post_process_func",
-    "LTX2ImageToVideoTwoStagesPipeline": "get_ltx2_post_process_func",
     "StableAudioPipeline": "get_stable_audio_post_process_func",
     "WanImageToVideoPipeline": "get_wan22_i2v_post_process_func",
     "LongCatImagePipeline": "get_longcat_image_post_process_func",
     "BagelPipeline": "get_bagel_post_process_func",
     "LongCatImageEditPipeline": "get_longcat_image_post_process_func",
     "StableDiffusion3Pipeline": "get_sd3_image_post_process_func",
-    "FluxKontextPipeline": "get_flux_kontext_post_process_func",
     "Flux2KleinPipeline": "get_flux2_klein_post_process_func",
     "NextStep11Pipeline": "get_nextstep11_post_process_func",
     "FluxPipeline": "get_flux_post_process_func",
@@ -376,6 +309,7 @@ def _apply_sequence_parallel_if_enabled(model, od_config: OmniDiffusionConfig) -
     "MagiHumanPipeline": "get_magi_human_post_process_func",
     "OmniVoicePipeline": "get_omnivoice_post_process_func",
     "DreamIDOmniPipeline": "get_dreamid_omni_post_process_func",
+    "HyperCLOVAXVisionPipeline": "get_hyperclovax_vision_post_process_func",
 }
 
 _DIFFUSION_PRE_PROCESS_FUNCS = {
@@ -388,13 +322,8 @@ def _apply_sequence_parallel_if_enabled(model, od_config: OmniDiffusionConfig) -
     "LongCatImageEditPipeline": "get_longcat_image_edit_pre_process_func",
     "QwenImageLayeredPipeline": "get_qwen_image_layered_pre_process_func",
     "WanPipeline": "get_wan22_pre_process_func",
-    "WanVACEPipeline": "get_wan22_vace_pre_process_func",
     "WanImageToVideoPipeline": "get_wan22_i2v_pre_process_func",
     "OmniGen2Pipeline": "get_omnigen2_pre_process_func",
-    "HeliosPipeline": "get_helios_pre_process_func",
-    "HeliosPyramidPipeline": "get_helios_pre_process_func",
-    "HunyuanVideo15ImageToVideoPipeline": "get_hunyuan_video_15_i2v_pre_process_func",
-    "MagiHumanPipeline": "get_magi_human_pre_process_func",
 }
 
 
diff --git a/vllm_omni/diffusion/request.py b/vllm_omni/diffusion/request.py
index 4d4328d2513..997046d8790 100644
--- a/vllm_omni/diffusion/request.py
+++ b/vllm_omni/diffusion/request.py
@@ -28,6 +28,8 @@ class OmniDiffusionRequest:
     request_ids: list[str] = field(default_factory=list)
     request_id: str | None = None
     kv_sender_info: dict | None = None
+    # Additional data from stage input processors (e.g. vision_tokens, audio_tokens)
+    extra: dict = field(default_factory=dict)
 
     def __post_init__(self):
         """Initialize dependent fields after dataclass initialization."""
diff --git a/vllm_omni/diffusion/worker/diffusion_worker.py b/vllm_omni/diffusion/worker/diffusion_worker.py
index 160309e0d8d..d88901f7e57 100644
--- a/vllm_omni/diffusion/worker/diffusion_worker.py
+++ b/vllm_omni/diffusion/worker/diffusion_worker.py
@@ -17,7 +17,7 @@
 
 import torch
 import zmq
-from vllm.config import CompilationConfig, DeviceConfig, VllmConfig, set_current_vllm_config
+from vllm.config import CompilationConfig, VllmConfig, set_current_vllm_config
 from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
 from vllm.logger import init_logger
 from vllm.profiler.wrapper import CudaProfilerWrapper, WorkerProfiler
@@ -106,12 +106,8 @@ def init_device(self) -> None:
         self.device = current_omni_platform.get_torch_device(rank)
         current_omni_platform.set_device(self.device)
 
-        # Create vllm_config for parallel configuration. Pass explicit device_config
-        # so DeviceConfig does not rely on current_platform in worker subprocesses.
-        vllm_config = VllmConfig(
-            compilation_config=CompilationConfig(),
-            device_config=DeviceConfig(device=self.device),
-        )
+        # Create vllm_config for parallel configuration
+        vllm_config = VllmConfig(compilation_config=CompilationConfig())
         vllm_config.parallel_config.tensor_parallel_size = self.od_config.parallel_config.tensor_parallel_size
         vllm_config.parallel_config.data_parallel_size = self.od_config.parallel_config.data_parallel_size
         vllm_config.parallel_config.enable_expert_parallel = self.od_config.parallel_config.enable_expert_parallel
diff --git a/vllm_omni/engine/arg_utils.py b/vllm_omni/engine/arg_utils.py
index 5b69d6b1f0c..2f01b5bbd0f 100644
--- a/vllm_omni/engine/arg_utils.py
+++ b/vllm_omni/engine/arg_utils.py
@@ -1,13 +1,10 @@
-import argparse
-import dataclasses
-import json
-import os
-import tempfile
 from dataclasses import dataclass, field
 from typing import Any
 
 from vllm.engine.arg_utils import EngineArgs
 from vllm.logger import init_logger
+from vllm.transformers_utils.config import get_hf_text_config
+from vllm.v1.engine.async_llm import AsyncEngineArgs
 
 from vllm_omni.config import OmniModelConfig
 from vllm_omni.engine.output_modality import OutputModality
@@ -72,14 +69,11 @@ def _register_omni_hf_configs() -> None:
         if _CONFIG_REGISTRY is not None and model_type not in _CONFIG_REGISTRY:
             _CONFIG_REGISTRY[model_type] = config_cls
 
-
 def register_omni_models_to_vllm():
     from vllm.model_executor.models import ModelRegistry
 
     from vllm_omni.model_executor.models.registry import _OMNI_MODELS
 
-    _register_omni_hf_configs()
-
     supported_archs = ModelRegistry.get_supported_archs()
     for arch, (mod_folder, mod_relname, cls_name) in _OMNI_MODELS.items():
         if arch not in supported_archs:
@@ -134,7 +128,7 @@ class OmniEngineArgs(EngineArgs):
 
     stage_id: int = 0
     model_stage: str = "thinker"
-    model_arch: str | None = None
+    model_arch: str = "Qwen2_5OmniForConditionalGeneration"
     engine_output_type: str | None = None
     hf_config_name: str | None = None
     custom_process_next_stage_input_func: str | None = None
@@ -151,15 +145,23 @@ class OmniEngineArgs(EngineArgs):
     log_stats: bool = False
     custom_pipeline_args: dict[str, Any] | None = None
 
-    def __post_init__(self) -> None:
-        load_omni_general_plugins()
-        super().__post_init__()
-
-    @classmethod
-    def from_cli_args(cls, args: argparse.Namespace) -> "OmniEngineArgs":
-        attrs = [attr.name for attr in dataclasses.fields(cls)]
-        engine_args = cls(**{attr: getattr(args, attr) for attr in attrs if hasattr(args, attr)})
-        return engine_args
+    def draw_hf_text_config(self, config_dict: dict) -> Any:
+        """Return the text config of the stage named by ``config_dict["hf_config_name"]``.
+
+        ``get_text_config()`` is called on that attribute of ``config_dict["hf_config"]``;
+        on AttributeError a warning is logged and ``get_hf_text_config`` is used instead.
+        """
+        hf_config, hf_config_name = config_dict["hf_config"], config_dict["hf_config_name"]
+        try:
+            text_config = getattr(hf_config, hf_config_name).get_text_config()
+        except AttributeError:
+            # Stage attribute (or its get_text_config) is unavailable: use the default lookup
+            logger.warning(
+                f"Config attribute '{hf_config_name}' not found in hf_config, "
+                "falling back to default get_hf_text_config"
+            )
+            text_config = get_hf_text_config(hf_config)
+        return text_config
 
     def _ensure_omni_models_registered(self):
         if hasattr(self, "_omni_models_registered"):
@@ -168,30 +170,6 @@ def _ensure_omni_models_registered(self):
         self._omni_models_registered = True
         return True
 
-    def _patch_empty_hf_config(self, model_type: str) -> None:
-        """For models with empty config.json (e.g. CosyVoice3), create a
-        patched config in a temp directory with model_type set so that
-        transformers AutoConfig.from_pretrained can resolve the config class.
-        Sets self.hf_config_path to point to the patched directory."""
-        try:
-            from transformers import PretrainedConfig
-
-            config_dict, _ = PretrainedConfig.get_config_dict(self.model)
-            if config_dict.get("model_type"):
-                return  # config.json already has model_type, no patching needed
-        except Exception:
-            return  # can't load config, let vLLM handle the error
-
-        # Create a temp dir with a patched config.json
-        temp_dir = tempfile.mkdtemp(prefix="omni_hf_config_")
-        config_dict["model_type"] = model_type
-        config_dict.setdefault("architectures", [self.model_arch])
-        with open(os.path.join(temp_dir, "config.json"), "w") as f:
-            json.dump(config_dict, f)
-        self.hf_config_path = temp_dir
-        self._temp_config_dir = temp_dir
-        logger.info("Patched empty HF config with model_type=%s at %s", model_type, temp_dir)
-
     def create_model_config(self) -> OmniModelConfig:
         """Create an OmniModelConfig from these engine arguments.
         Returns:
@@ -200,103 +178,67 @@ def create_model_config(self) -> OmniModelConfig:
         # register omni models to avoid model not found error
         self._ensure_omni_models_registered()
 
-        # Build stage_connector_config from stage_connector_spec
-        stage_connector_config = {
-            "name": self.stage_connector_spec.get("name", "SharedMemoryConnector"),
-            "extra": self.stage_connector_spec.get("extra", {}).copy(),
-        }
-        stage_connector_config["extra"]["stage_id"] = self.stage_id
+        # First, get the base ModelConfig from the parent class
+        base_config = super().create_model_config()
 
-        # If model_arch is specified, inject it into hf_overrides so vLLM can
-        # resolve the architecture even when config.json lacks 'architectures'.
-        # Also inject model_type so AutoConfig can resolve the correct config
-        # class for models with empty or missing config.json (e.g. CosyVoice3).
-        if self.model_arch:
-            if self.hf_overrides is None:
-                self.hf_overrides = {}
-            if isinstance(self.hf_overrides, dict):
-                self.hf_overrides.setdefault("architectures", [self.model_arch])
-                if "model_type" not in self.hf_overrides:
-                    model_type = _ARCH_TO_MODEL_TYPE.get(self.model_arch)
-                    if model_type is not None:
-                        self.hf_overrides.setdefault("model_type", model_type)
-
-            # For models whose HF config.json is empty or lacks model_type
-            # (e.g. CosyVoice3), AutoConfig.from_pretrained fails because it
-            # cannot determine which config class to use from the empty dict.
-            # hf_overrides alone is not enough since transformers reads
-            # model_type from config_dict before applying overrides.
-            # Workaround: create a patched config.json in a temp directory
-            # and point hf_config_path to it so vLLM reads model_type from it.
-            if not self.hf_config_path:
-                model_type = _ARCH_TO_MODEL_TYPE.get(self.model_arch)
-                if model_type is not None:
-                    self._patch_empty_hf_config(model_type)
-
-        # Auto-detect tokenizer for models that store it in a subdirectory
-        # rather than the root (e.g. CosyVoice3 uses CosyVoice-BlankEN/).
-        if not self.tokenizer and self.model:
-            model_path = self.model
-            if os.path.isdir(model_path) and not os.path.isfile(os.path.join(model_path, "tokenizer_config.json")):
-                for subfolder in sorted(os.listdir(model_path)):
-                    candidate = os.path.join(model_path, subfolder)
-                    if os.path.isdir(candidate) and os.path.isfile(os.path.join(candidate, "tokenizer_config.json")):
-                        self.tokenizer = candidate
-                        logger.info("Auto-detected tokenizer at %s", candidate)
-                        break
-            elif not os.path.isdir(model_path):
-                subfolder = _TOKENIZER_SUBFOLDER_MAP.get(self.model_arch)
-                if subfolder:
-                    # Download just the tokenizer files from the subfolder
-                    try:
-                        from huggingface_hub import snapshot_download
-
-                        local_dir = snapshot_download(
-                            model_path,
-                            allow_patterns=[
-                                f"{subfolder}/tokenizer*",
-                                f"{subfolder}/special_tokens*",
-                                f"{subfolder}/vocab*",
-                                f"{subfolder}/merges*",
-                                f"{subfolder}/added_tokens*",
-                            ],
-                        )
-                        candidate = os.path.join(local_dir, subfolder)
-                        if os.path.isdir(candidate):
-                            self.tokenizer = candidate
-                            logger.info("Downloaded tokenizer from %s/%s", model_path, subfolder)
-                    except Exception as e:
-                        logger.warning("Failed to download tokenizer subfolder: %s", e)
-
-        # Build the vLLM config first, then use it to create the Omni config.
-        try:
-            model_config = super().create_model_config()
-        finally:
-            # Clean up temp config dir if we created one
-            if hasattr(self, "_temp_config_dir"):
-                import shutil
+        # Create OmniModelConfig by copying all base config attributes
+        # and adding the new omni-specific fields
+        config_dict = base_config.__dict__.copy()
+        # FIXME(Isotr0py): This is a temporary workaround for multimodal_config
+        config_dict = {
+            **(getattr(mm := config_dict.pop("multimodal_config", None), "__dict__", mm or {})),
+            **config_dict,
+        }
 
-                shutil.rmtree(self._temp_config_dir, ignore_errors=True)
-                del self._temp_config_dir
+        # Add the new omni-specific fields
+        config_dict["stage_id"] = self.stage_id
+        config_dict["model_stage"] = self.model_stage
+        config_dict["model_arch"] = self.model_arch
+        config_dict["engine_output_type"] = self.engine_output_type
+        config_dict["hf_config_name"] = self.hf_config_name
+        if self.hf_config_name is not None:
+            config_dict["hf_text_config"] = self.draw_hf_text_config(config_dict)
+        # Create and return the OmniModelConfig instance
+        omni_config = OmniModelConfig(**config_dict)
+        omni_config.hf_config.architectures = omni_config.architectures
 
-        omni_config = OmniModelConfig.from_vllm_model_config(
-            model_config=model_config,
-            # All kwargs below are Omni specific
-            stage_id=self.stage_id,
-            async_chunk=self.async_chunk,
-            model_stage=self.model_stage,
-            model_arch=self.model_arch,
-            worker_type=self.worker_type,
-            engine_output_type=self.engine_output_type,
-            hf_config_name=self.hf_config_name,
-            custom_process_next_stage_input_func=self.custom_process_next_stage_input_func,
-            stage_connector_config=stage_connector_config,
-            omni_kv_config=self.omni_kv_config,
-            task_type=self.task_type,
-        )
         return omni_config
 
     @property
     def output_modality(self) -> OutputModality:
         """Parse engine_output_type into a type-safe OutputModality flag."""
         return OutputModality.from_string(self.engine_output_type)
+
+
+@dataclass
+class AsyncOmniEngineArgs(AsyncEngineArgs):
+    """AsyncEngineArgs variant carrying the omni multi-stage pipeline fields.
+
+    Adds per-stage settings (stage id, stage name, architecture, connector
+    spec, ...) on top of the base async engine arguments.
+    """
+
+    stage_id: int = 0
+    model_stage: str = "thinker"
+    model_arch: str = "Qwen2_5OmniForConditionalGeneration"
+    engine_output_type: str | None = None
+    hf_config_name: str | None = None
+    custom_process_next_stage_input_func: str | None = None
+    stage_connector_spec: dict[str, Any] = field(default_factory=dict)
+    async_chunk: bool = False
+    omni_kv_config: dict | None = None
+    quantization_config: Any | None = None
+    worker_type: str | None = None
+
+    def __post_init__(self) -> None:
+        load_omni_general_plugins()  # runs before the base-class post-init
+        super().__post_init__()
+
+    def create_engine_config(self, usage_context=None, **kwargs):
+        """Build the engine config; a dict ``hf_overrides`` gets a default ``architectures`` entry."""
+        if self.model_arch and self.hf_overrides is None:
+            self.hf_overrides = {}
+        overrides = self.hf_overrides
+        if self.model_arch and isinstance(overrides, dict):
+            overrides.setdefault("architectures", [self.model_arch])
+        return super().create_engine_config(usage_context=usage_context, **kwargs)
diff --git a/vllm_omni/engine/input_processor.py b/vllm_omni/engine/input_processor.py
new file mode 100644
index 00000000000..e0bfd1016dd
--- /dev/null
+++ b/vllm_omni/engine/input_processor.py
@@ -0,0 +1,32 @@
+"""OmniInputProcessor: extends vLLM InputProcessor with OmniInputPreprocessor."""
+
+from vllm.config import VllmConfig
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
+from vllm.renderers import BaseRenderer
+from vllm.v1.engine.input_processor import InputProcessor
+
+from vllm_omni.inputs.preprocess import OmniInputPreprocessor
+
+
+class OmniInputProcessor(InputProcessor):
+    """vLLM ``InputProcessor`` whose preprocessor is an ``OmniInputPreprocessor``.
+
+    Construction is delegated to the base class; afterwards the
+    ``input_preprocessor`` attribute is overwritten with an
+    ``OmniInputPreprocessor`` built from the same config and registry.
+    """
+
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        renderer: BaseRenderer | None = None,
+        *,
+        mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
+    ) -> None:
+        super().__init__(vllm_config, renderer, mm_registry=mm_registry)
+        # Forward ``self.renderer`` (the value present on the instance after the
+        # base initializer ran) rather than the raw ``renderer`` argument.
+        omni_preprocessor = OmniInputPreprocessor(
+            vllm_config, renderer=self.renderer, mm_registry=mm_registry
+        )
+        self.input_preprocessor = omni_preprocessor
diff --git a/vllm_omni/entrypoints/async_omni.py b/vllm_omni/entrypoints/async_omni.py
index 129ef3c99d8..0ec7c339406 100644
--- a/vllm_omni/entrypoints/async_omni.py
+++ b/vllm_omni/entrypoints/async_omni.py
@@ -1,68 +1,98 @@
-"""
-AsyncOmni - Refactored async orchestrator using AsyncOmniEngine.
-
-This is the new implementation that uses AsyncOmniEngine (which manages
-StageEngineCoreClient instances) instead of OmniStage with worker processes.
-"""
-
-from __future__ import annotations
-
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import asyncio
+import copy
 import time
-from collections.abc import AsyncGenerator, Iterable, Mapping, Sequence
-from typing import TYPE_CHECKING, Any
+import weakref
+from collections.abc import AsyncGenerator, Iterable, Sequence
+from typing import Any
 
-from vllm import TokensPrompt
-from vllm.engine.protocol import EngineClient, StreamingInput
+from vllm.config import VllmConfig
+from vllm.inputs.preprocess import InputPreprocessor
 from vllm.logger import init_logger
-from vllm.lora.request import LoRARequest
-from vllm.outputs import PoolingRequestOutput
 from vllm.plugins.io_processors import get_io_processor
-from vllm.pooling_params import PoolingParams
-from vllm.renderers.inputs.preprocess import extract_prompt_components
-from vllm.sampling_params import RequestOutputKind, SamplingParams
-from vllm.tasks import SupportedTask
+from vllm.sampling_params import SamplingParams
+from vllm.tokenizers import TokenizerLike
 from vllm.v1.engine.exceptions import EngineDeadError
 
+from vllm_omni.config import OmniModelConfig
+from vllm_omni.diffusion.data import DiffusionParallelConfig
+from vllm_omni.distributed.omni_connectors.adapter import compute_talker_prompt_ids_length, try_send_via_connector
+from vllm_omni.distributed.ray_utils.utils import try_close_ray
+from vllm_omni.engine.input_processor import OmniInputProcessor
 from vllm_omni.entrypoints.client_request_state import ClientRequestState
-from vllm_omni.entrypoints.omni_base import OmniBase
-from vllm_omni.metrics.stats import OrchestratorAggregator as OrchestratorMetrics
+from vllm_omni.entrypoints.omni import OmniBase
+from vllm_omni.entrypoints.omni_stage import OmniStage
+from vllm_omni.entrypoints.stage_utils import SHUTDOWN_TASK, OmniStageTaskType
+from vllm_omni.entrypoints.stage_utils import maybe_load_from_ipc as _load
+from vllm_omni.entrypoints.utils import (
+    get_final_stage_id_for_e2e,
+)
+from vllm_omni.inputs.data import OmniPromptType, OmniSamplingParams
+
+# Internal imports (our code)
+from vllm_omni.lora.request import LoRARequest
+from vllm_omni.metrics import OrchestratorAggregator
 from vllm_omni.outputs import OmniRequestOutput
 
-if TYPE_CHECKING:
-    from vllm.inputs.preprocess import InputPreprocessor
-    from vllm.tokenizers import TokenizerLike
-    from vllm.v1.engine import PauseMode
+logger = init_logger(__name__)
 
-    from vllm_omni.inputs.data import OmniPromptType, OmniSamplingParams
 
-logger = init_logger(__name__)
-_FINAL_OUTPUT_IDLE_SLEEP_S = 0.001
+def _weak_close_cleanup_async(stage_list, stage_in_queues, stage_out_queues, ray_pg, output_handler, zmq_ctx=None):
+    """Best-effort teardown for AsyncOmni's weakref finalizer; queue/stage failures are logged, not raised."""
+    if stage_list:
+        for q in stage_in_queues:
+            try:
+                q.put_nowait(SHUTDOWN_TASK)
+            except Exception as e:
+                logger.warning(f"Failed to send shutdown signal to stage input queue: {e}")
+        # Close after signalling every stage; a failing close() must not abort the remaining teardown.
+        for q in (*stage_in_queues, *stage_out_queues):
+            close_fn = getattr(q, "close", None)
+            try:
+                if callable(close_fn):
+                    close_fn()
+            except Exception as e:
+                logger.warning(f"Failed to close stage queue: {e}")
+        for stage in stage_list:
+            try:
+                stage.stop_stage_worker()
+            except Exception as e:
+                logger.warning(f"Failed to stop stage worker: {e}")
+    try_close_ray(ray_pg)
+    if output_handler is not None:
+        output_handler.cancel()
+    if zmq_ctx is not None:
+        zmq_ctx.term()
 
 
-class AsyncOmni(EngineClient, OmniBase):
-    """Asynchronous unified entry point for multi-stage pipelines using AsyncOmniEngine.
+class AsyncOmni(OmniBase):
+    """Asynchronous unified entry point supporting multi-stage pipelines for LLM and Diffusion models.
 
-    This is the refactored version that uses AsyncOmniEngine instead of
-    OmniStage workers. It provides the same interface as AsyncOmni but with
-    a cleaner architecture.
+    Similar to the Omni class, but provides an asynchronous interface supporting
+    asynchronous LLM and Diffusion models.
 
     Args:
         model: Model name or path to load.
-        **kwargs: Additional keyword arguments.
+        **kwargs: Arbitrary keyword arguments.
             - stage_configs_path: Optional path to YAML file containing stage
-              configurations. If None, configurations are resolved from model
-              pipeline factory.
-            - log_stats: Whether to enable statistics logging.
-            - stage_init_timeout: Timeout for per-stage initialization.
-            - init_timeout: Total timeout for orchestrator startup.
-            - async_chunk: Whether to enable async chunk mode.
-            - output_modalities: Requested output modalities.
+              configurations. If None, configurations are loaded from the model.
+            - log_stats: Whether to enable statistics logging; when enabled, stats will
+              be written to files with stage-specific suffixes.
+            - stage_init_timeout: Per-stage init watchdog (seconds). Measured from
+              when the previous stage finished (possibly a prior Omni run with GPU
+              reuse/overlap) to when the current stage starts to initialize.
+            - shm_threshold_bytes: Threshold in bytes for using shared memory
+              for IPC. Objects larger than this threshold will use shared memory.
+            - worker_backend: Backend for worker processes. Default is "multi_process".
+            - ray_address: Address of Ray cluster for Ray backend, if using Ray backend.
+            - batch_timeout: Timeout in seconds for batching requests within a stage
+            - init_timeout: Timeout in seconds for waiting for all stages to initialize
             - Additional keyword arguments passed to stage engines.
 
     Example:
-        >>> async_omni = AsyncOmni(model="Qwen/Qwen2.5-Omni-7B")
-        >>> async for output in async_omni.generate(
+        >>> async_llm = AsyncOmni(model="Qwen/Qwen2.5-Omni-7B")
+        >>> async for output in async_llm.generate(
         ...     prompt="Hello",
         ...     request_id="req-1",
         ...     sampling_params_list=[SamplingParams(), SamplingParams()]
@@ -70,698 +100,781 @@ class AsyncOmni(EngineClient, OmniBase):
         ...     print(output)
     """
 
-    def __init__(self, *args: Any, model: str = "", **kwargs: Any) -> None:
-        OmniBase.__init__(self, model=model, **kwargs)
+    def __init__(self, model: str, **kwargs: Any) -> None:
+        # Pause/resume control: generate() waits on _pause_cond while _paused is True
         self._pause_cond: asyncio.Condition = asyncio.Condition()
         self._paused: bool = False
-        self._is_sleeping: bool = False
-        self.final_output_task: asyncio.Task | None = None
 
-        self.config_path = self.engine.config_path
-        self.stage_configs = self.engine.stage_configs
-        self.tts_max_instructions_length = kwargs.get("tts_max_instructions_length", None)
-        self.input_processor = self.engine.input_processor
+        # Per-request client state keyed by request_id, plus the output-handler task slot
+        self.request_states: dict[str, ClientRequestState] = {}
+        self.output_handler: asyncio.Task | None = None
+
+        super().__init__(model, **kwargs)
+
+        # Register weak reference cleanup (called on garbage collection, or explicitly via shutdown())
+        self._weak_finalizer = weakref.finalize(
+            self,
+            _weak_close_cleanup_async,
+            self.stage_list,
+            self._stage_in_queues,
+            self._stage_out_queues,
+            self._ray_pg,
+            self.output_handler,  # NOTE(review): captured by value; presumably still None here - confirm
+            self._zmq_ctx,
+        )
 
-        stage_index = self._get_comprehension_stage_index()
-        if stage_index is None:
-            self.io_processor = None
+    def _create_default_diffusion_stage_cfg(self, kwargs: dict[str, Any]) -> list[dict[str, Any]]:
+        """Build the default one-element stage config list (stage 0, diffusion) from ``kwargs``."""
+        # TODO: here is different from the Omni class. We should merge the two in the future.
+        cache_backend = kwargs.get("cache_backend", "none")
+        cache_config = self._normalize_cache_config(cache_backend, kwargs.get("cache_config", None))
+
+        devices = "0"  # extended below to "0,1,...,num_devices-1"
+        if "parallel_config" in kwargs:
+            parallel_config = kwargs["parallel_config"]
+            num_devices = kwargs["parallel_config"].world_size
+            for i in range(1, num_devices):
+                devices += f",{i}"
         else:
-            vllm_config = self.engine.stage_vllm_configs[stage_index]
-            io_processor_plugin = vllm_config.model_config.io_processor_plugin
-            renderer = self.renderer
-            if renderer is None:
-                from vllm.renderers import renderer_from_config
-
-                renderer = renderer_from_config(vllm_config)
-            self.io_processor = get_io_processor(vllm_config, renderer, io_processor_plugin)
-
-    def _get_comprehension_stage_index(self) -> int | None:
-        fallback_idx: int | None = None
-        for idx, stage_client in enumerate(self.engine.stage_clients):
-            stage_vllm_config = self.engine.stage_vllm_configs[idx]
-            if stage_vllm_config is None:
-                continue
-            if fallback_idx is None:
-                fallback_idx = idx
-            if stage_client.is_comprehension:
-                return idx
-        return fallback_idx
-
-    @property
-    def renderer(self):
-        """Return the renderer from the engine input processor when available."""
-        if self.input_processor is None:
-            return None
-        return self.input_processor.renderer
-
-    @property
-    def vllm_config(self):
-        """Return the vLLM config for the comprehension stage when present."""
-        stage_index = self._get_comprehension_stage_index()
-        if stage_index is None:
-            return None
-        return self.engine.stage_vllm_configs[stage_index]
-
-    async def get_vllm_config(self) -> Any:
-        """Compatibility helper for call sites expecting async vllm config access."""
-        return self.vllm_config
-
-    def get_diffusion_od_config(self) -> Any | None:
-        """Return the diffusion-stage config when the pipeline has one."""
-        for stage_client in self.engine.stage_clients:
-            if getattr(stage_client, "stage_type", None) != "diffusion":
-                continue
-
-            od_config = getattr(stage_client, "od_config", None)
-            if od_config is not None:
-                return od_config
-
-            inner_engine = getattr(stage_client, "_engine", None)
-            od_config = getattr(inner_engine, "od_config", None)
-            if od_config is not None:
-                return od_config
+            ulysses_degree = kwargs.get("ulysses_degree") or 1
+            ring_degree = kwargs.get("ring_degree") or 1
+            sequence_parallel_size = kwargs.get("sequence_parallel_size")  # None -> derived below
+            tensor_parallel_size = kwargs.get("tensor_parallel_size") or 1
+            cfg_parallel_size = kwargs.get("cfg_parallel_size") or 1
+            use_hsdp = kwargs.get("use_hsdp", False)
+            hsdp_shard_size = kwargs.get("hsdp_shard_size", -1)
+            hsdp_replicate_size = kwargs.get("hsdp_replicate_size", 1)
+            if sequence_parallel_size is None:
+                sequence_parallel_size = ulysses_degree * ring_degree
+
+            # Device count is the product of the SP/TP/CFG sizes, except for standalone HSDP
+            other_parallel_size = sequence_parallel_size * tensor_parallel_size * cfg_parallel_size
+            if use_hsdp and other_parallel_size == 1 and hsdp_shard_size > 0:
+                # Standalone HSDP: num_devices is determined by HSDP dimensions
+                num_devices = hsdp_shard_size * hsdp_replicate_size
+            else:
+                num_devices = other_parallel_size
+
+            for i in range(1, num_devices):
+                devices += f",{i}"
+            parallel_config = DiffusionParallelConfig(
+                pipeline_parallel_size=1,
+                data_parallel_size=1,
+                tensor_parallel_size=tensor_parallel_size,
+                sequence_parallel_size=sequence_parallel_size,
+                ulysses_degree=ulysses_degree,
+                ring_degree=ring_degree,
+                cfg_parallel_size=cfg_parallel_size,
+                use_hsdp=use_hsdp,
+                hsdp_shard_size=hsdp_shard_size,
+                hsdp_replicate_size=hsdp_replicate_size,
+            )
+        default_stage_cfg = [
+            {
+                "stage_id": 0,
+                "stage_type": "diffusion",
+                "runtime": {
+                    "process": True,
+                    "devices": devices,
+                    "max_batch_size": 1,
+                },
+                "engine_args": {
+                    "parallel_config": parallel_config,
+                    "vae_use_slicing": kwargs.get("vae_use_slicing", False),
+                    "vae_use_tiling": kwargs.get("vae_use_tiling", False),
+                    "cache_backend": cache_backend,
+                    "cache_config": cache_config,
+                    "enable_cache_dit_summary": kwargs.get("enable_cache_dit_summary", False),
+                    "enable_cpu_offload": kwargs.get("enable_cpu_offload", False),
+                    "enable_layerwise_offload": kwargs.get("enable_layerwise_offload", False),
+                    "enforce_eager": kwargs.get("enforce_eager", False),
+                    "diffusion_load_format": kwargs.get("diffusion_load_format", "default"),
+                    "custom_pipeline_args": kwargs.get("custom_pipeline_args", None),
+                },
+                "final_output": True,
+                "final_output_type": "image",
+            }
+        ]
+        default_stage_cfg[0]["engine_args"]["model_stage"] = "diffusion"
+        return default_stage_cfg
+
+    def _process_stage_ready(self, stage: OmniStage, stage_id: int, result: dict[str, Any]) -> None:
+        """Apply optional fields of a stage's ready ``result`` to ``stage``, then call the base handler.
+
+        ``vllm_config``, ``tokenizer`` and ``is_tracing_enabled`` are each applied through the
+        matching ``stage.set_*`` method only when present and not ``None``.
+        """
+        # vllm_config may be None for diffusion stages, hence the per-field None check.
+        for field in ("vllm_config", "tokenizer", "is_tracing_enabled"):
+            value = result.get(field)
+            if value is not None:
+                getattr(stage, f"set_{field}")(value)
+        super()._process_stage_ready(stage, stage_id, result)
+
+    def _wait_for_stages_ready(self, timeout: int = 120) -> None:
+        """Wait for all stages to report readiness, then initialize request processors.
+
+        Assigns ``input_processor``, ``model_config`` and ``io_processor`` as a unit (all ``None`` if unavailable).
+        """
+        super()._wait_for_stages_ready(timeout)
+        for stage in self.stage_list:
+            if stage.vllm_config is not None and stage.tokenizer is not None:
+                try:
+                    vllm_config = stage.vllm_config
+                    # Build into locals so a mid-way failure cannot leave only some attributes assigned.
+                    # NOTE(review): replaced code passed a renderer as 2nd get_io_processor arg - confirm signature.
+                    # OMNI: OmniInputProcessor creates tokenizer internally from vllm_config
+                    input_processor = OmniInputProcessor(vllm_config=vllm_config)
+                    model_config = vllm_config.model_config
+                    io_processor = get_io_processor(vllm_config, model_config.io_processor_plugin)
+                    self.input_processor = input_processor
+                    self.model_config = model_config
+                    self.io_processor = io_processor
+
+                    logger.info(
+                        f"[{self._name}] Initialized input_processor, "
+                        f"io_processor, and model_config from stage-{stage.stage_id}",
+                    )
+                    break
+                except Exception as e:
+                    logger.warning(f"[{self._name}] Failed to initialize processors from stage-{stage.stage_id}: {e}")
+        # If no LLM stage found via ZMQ payload, fall back to creating from stage.engine_args
+        if not hasattr(self, "input_processor") or self.input_processor is None:
+            for stage in self.stage_list:
+                if stage.stage_type == "llm" and hasattr(stage, "engine_args") and stage.engine_args is not None:
+                    try:
+                        logger.info(
+                            f"[{self._name}] stage-{stage.stage_id} vllm_config not received via ZMQ, "
+                            "falling back to create_engine_config from stage.engine_args"
+                        )
+                        from vllm.usage.usage_lib import UsageContext
+
+                        from vllm_omni.engine.arg_utils import AsyncOmniEngineArgs
+                        from vllm_omni.entrypoints.omni_stage import filter_dataclass_kwargs
+
+                        try:
+                            from omegaconf import OmegaConf
+
+                            _ea = OmegaConf.to_container(stage.engine_args, resolve=True)
+                        except Exception:
+                            _ea = dict(stage.engine_args)
+                        _ea = filter_dataclass_kwargs(AsyncOmniEngineArgs, _ea)
+                        _ea.pop("model", None)
+                        _model = getattr(self, "_model", None)
+                        if _model is None:
+                            raise RuntimeError("Cannot determine model path for fallback")
+                        _omni_ea = AsyncOmniEngineArgs(model=_model, **_ea)
+                        vllm_config = _omni_ea.create_engine_config(usage_context=UsageContext.API_SERVER)
+                        stage.set_vllm_config(vllm_config)
+                        input_processor = OmniInputProcessor(vllm_config=vllm_config)
+                        model_config = vllm_config.model_config
+                        io_processor = get_io_processor(vllm_config, model_config.io_processor_plugin)
+                        self.input_processor = input_processor
+                        self.model_config = model_config
+                        self.io_processor = io_processor
+                        logger.info(
+                            "[%s] Initialized input_processor from stage-%s engine_args fallback",
+                            self._name,
+                            stage.stage_id,
+                        )
+                        break
+                    except Exception as e:
+                        logger.warning(f"[{self._name}] Fallback init failed for stage-{stage.stage_id}: {e}")
+        if not hasattr(self, "input_processor") or self.input_processor is None:
+            logger.warning(
+                f"[{self._name}] No LLM stage found, processors will not be available. "
+                "This may cause issues with OpenAIServingModels."
+            )
+            self.input_processor = self.io_processor = self.model_config = None
 
-        return None
+    def shutdown(self) -> None:
+        """Shut down by running the cleanup finalizer registered in ``__init__``.
 
-    @property
-    def model_config(self):
-        """Return the model config for the comprehension stage when present."""
-        vllm_config = self.vllm_config
-        if vllm_config is None:
-            return None
-        return vllm_config.model_config
-
-    # ==================== Generate Method ====================
+        The finalizer stops the stage workers and closes the stage queues.
+        Does nothing when the instance has no ``_weak_finalizer`` attribute.
+        """
+        if hasattr(self, "_weak_finalizer"):
+            self._weak_finalizer()
 
     async def generate(
         self,
-        prompt: OmniPromptType | AsyncGenerator[StreamingInput, None] | list[OmniPromptType],
-        sampling_params: Any = None,
-        request_id: str = "",
-        *,
-        prompt_text: str | None = None,
-        lora_request: Any = None,
-        tokenization_kwargs: dict[str, Any] | None = None,
+        prompt: OmniPromptType,
+        request_id: str,
         sampling_params_list: Sequence[OmniSamplingParams] | None = None,
+        *,
         output_modalities: list[str] | None = None,
-        trace_headers: Mapping[str, str] | None = None,
-        priority: int = 0,
-        data_parallel_rank: int | None = None,
-        reasoning_ended: bool | None = None,
     ) -> AsyncGenerator[OmniRequestOutput, None]:
-        """Generate outputs for the given prompt(s) asynchronously.
-
-        Coordinates multi-stage pipeline execution. Processes the prompt
-        through all stages in the pipeline and yields outputs as they become
-        available.
+        """Generate outputs for the given prompt asynchronously.
 
-        **Batch mode (diffusion only):**
-        When *prompt* is a ``list``, all prompts are dispatched in a single
-        ``DiffusionEngine.step()`` call at the diffusion stage.  The combined
-        result is yielded as one ``OmniRequestOutput`` with all generated
-        images.  Only a single *request_id* is used for the whole batch.
+        Coordinates multi-stage pipeline through YAML configuration.
+        Each stage will use AsyncOmniLLM or AsyncOmniDiffusion based on stage_type.
+        Processes the prompt through all stages in the pipeline and yields
+        outputs as they become available. Each stage uses its corresponding
+        sampling parameters from the sampling_params_list.
 
         Args:
-            prompt: A single prompt **or** a list of prompts.  A list
-                triggers batch mode when the diffusion stage is reached.
-            request_id: Unique identifier for this request.
-            sampling_params_list: List of SamplingParams, one per stage.
+            prompt: Prompt to process. Can be a text string, token IDs,
+                or multimodal prompt.
+            request_id: Unique identifier for this request
+            sampling_params_list: List of SamplingParams, one for each stage.
                 Must have the same length as the number of stages.
-                If *None*, uses default sampling params for each stage.
+                If None, uses default sampling params for each stage.
             output_modalities: Optional list of output modalities.
 
         Yields:
             OmniRequestOutput objects as they are produced by each stage.
-            In batch mode the diffusion stage yields one output containing
-            all generated images.
+            Each output contains the stage_id, final_output_type, and
+            the request_output from that stage.
 
         Raises:
             ValueError: If sampling_params_list has incorrect length.
         """
-        # Wait until generation is resumed if the engine is paused
+        # Wait until generation is resumed if the engine is paused.
         async with self._pause_cond:
             await self._pause_cond.wait_for(lambda: not self._paused)
 
-        logger.debug(f"[AsyncOmni] generate() called for request {request_id}")
-
-        input_stream_task: asyncio.Task | None = None
+        logger.debug(f"[{self._name}] generate() called")
         try:
-            # Start final output dispatcher on the first call to generate()
-            self._final_output_handler()
-
-            sampling_params_list = self.resolve_sampling_params_list(sampling_params_list)
-
-            # Track per-request metrics
-            wall_start_ts = time.time()
-            req_start_ts: dict[str, float] = {}
-
-            # Determine the final stage for E2E stats
-            final_stage_id_for_e2e = self._compute_final_stage_id(output_modalities)
+            # Start output handler on the first call to generate()
+            self._run_output_handler()
+
+            # TODO: lora_request, trace_headers, priority are not supported yet
+            if sampling_params_list is None:
+                sampling_params_list = self.default_sampling_params_list
+
+            if len(sampling_params_list) != len(self.stage_list):
+                raise ValueError(f"Expected {len(self.stage_list)} sampling params, got {len(sampling_params_list)}")
+
+            # Orchestrator keeps stage objects for input derivation
+            num_stages = len(self.stage_list)
+            # Track per-request start time (keyed by request_id) for end-to-end timing
+            _req_start_ts: dict[str, float] = {}
+            _wall_start_ts: float = time.time()
+            # _last_finish_ts: float = _wall_start_ts
+
+            # Determine the final stage for E2E stats (highest stage_id with
+            # final_output=True; fallback to last stage)
+            final_stage_id_for_e2e = get_final_stage_id_for_e2e(
+                output_modalities, self.output_modalities, self.stage_list
+            )
 
-            metrics = OrchestratorMetrics(
-                self.num_stages,
-                self.log_stats,
-                wall_start_ts,
-                final_stage_id_for_e2e,
+            # Metrics/aggregation helper
+            metrics = OrchestratorAggregator(
+                num_stages=num_stages,
+                log_stats=self.log_stats,
+                wall_start_ts=_wall_start_ts,
+                final_stage_id_for_e2e=final_stage_id_for_e2e,
             )
             req_state = ClientRequestState(request_id)
             req_state.metrics = metrics
             self.request_states[request_id] = req_state
-
-            # Add request(s) to stage 0. For streaming inputs, submit
-            # chunks incrementally through streaming_update.
-            if isinstance(prompt, AsyncGenerator):
-                input_stream_task = await self._add_streaming_input_request(
-                    request_id=request_id,
-                    input_stream=prompt,
-                    sampling_params_list=sampling_params_list,
-                    final_stage_id=final_stage_id_for_e2e,
-                )
+            sp0: SamplingParams = sampling_params_list[0]  # type: ignore[index]
+            task = {
+                "request_id": request_id,
+                "engine_inputs": prompt,
+                "sampling_params": sp0,
+            }
+            self.stage_list[0].submit(task)
+            metrics.stage_first_ts[0] = metrics.stage_first_ts[0] or time.time()
+            _req_start_ts[request_id] = time.time()
+            logger.info(
+                f"[{self._name}] Entering scheduling loop: stages={num_stages}, final_stage={final_stage_id_for_e2e}"
+            )
+            if self.async_chunk:
+                stage_queues = {stage_id: asyncio.Queue() for stage_id in range(num_stages)}
+                req_state.stage_queues = stage_queues
+                async for output in self._process_async_results(
+                    request_id,
+                    prompt,
+                    sampling_params_list,
+                    req_state,
+                    metrics,
+                    final_stage_id_for_e2e,
+                ):
+                    yield output
             else:
-                await self.engine.add_request_async(
-                    request_id=request_id,
-                    prompt=prompt,
-                    sampling_params_list=sampling_params_list,
-                    final_stage_id=final_stage_id_for_e2e,
+                async for output in self._process_sequential_results(
+                    request_id,
+                    req_state,
+                    metrics,
+                    final_stage_id_for_e2e,
+                    sampling_params_list,
+                    prompt,
+                ):
+                    yield output
+
+            logger.debug(f"[{self._name}] Request {request_id} finalized at stage-{final_stage_id_for_e2e}")
+            try:
+                # Finalize E2E metrics if not already done
+                metrics.on_finalize_request(
+                    final_stage_id_for_e2e,
+                    request_id,
+                    _req_start_ts.get(request_id, _wall_start_ts),
                 )
-            submit_ts = time.time()
-            req_state.metrics.stage_first_ts[0] = submit_ts
-            req_start_ts[request_id] = submit_ts
-
-            # Process results based on mode
-            # Both sequential and async_chunk modes read the same message stream
-            # from Orchestrator; stage-transfer behavior differs inside
-            # Orchestrator._route_output().
-            async for output in self._process_orchestrator_results(
-                request_id,
-                metrics,
-                final_stage_id_for_e2e,
-                req_start_ts,
-                wall_start_ts,
-            ):
-                yield output
-
-            logger.debug(f"[AsyncOmni] Request {request_id} completed")
-
-            self._log_summary_and_cleanup(request_id)
 
+                logger.debug(f"[{self._name}] All requests completed")
+                # Summarize and print stats
+                metrics.build_and_log_summary()
+            except Exception as e:
+                logger.exception(f"[{self._name}] Request {request_id} Failed to finalized/build/log summary: {e}")
+            finally:
+                self.request_states.pop(request_id, None)
         except (asyncio.CancelledError, GeneratorExit):
-            if input_stream_task is not None and not input_stream_task.done():
-                input_stream_task.cancel()
-            await self.abort(request_id)
-            logger.info(f"[AsyncOmni] Request {request_id} aborted.")
-            raise
-        except Exception as e:
             await self.abort(request_id)
-            logger.info(f"[AsyncOmni] Request {request_id} failed (input error): {e}")
+            logger.info("[AsyncOrchestrator] Request %s aborted.", request_id)  # NOTE(review): other errors skip abort
             raise
 
-    async def _add_streaming_input_request(
+    async def _process_async_results(
         self,
-        *,
         request_id: str,
-        input_stream: AsyncGenerator[StreamingInput, None],
-        sampling_params_list: Sequence[OmniSamplingParams],
-        final_stage_id: int,
-    ) -> asyncio.Task:
-        """Submit a streaming input generator as incremental stage-0 updates."""
-        if not sampling_params_list:
-            raise ValueError("sampling_params_list cannot be empty for streaming input")
-        # only check thinker's sampling params now
-        stage0_params = sampling_params_list[0]
-        self._validate_streaming_input_sampling_params(stage0_params)
-
-        req_state = self.request_states[request_id]
-
-        if not stage0_params.skip_clone:
-            stage0_params = stage0_params.clone()
-            stage0_params.skip_clone = True
-        stage0_params.output_kind = RequestOutputKind.DELTA
-
-        has_submitted_first_chunk = False
-
-        async def handle_inputs() -> None:
-            nonlocal has_submitted_first_chunk
-            cancelled = False
-            try:
-                async for chunk in input_stream:
-                    chunk_params = getattr(chunk, "sampling_params", None) or stage0_params
-                    self._validate_streaming_input_sampling_params(chunk_params)
-                    chunk_sampling_params_list = list(sampling_params_list)
-                    chunk_sampling_params_list[0] = chunk_params
-                    chunk_prompt = chunk.prompt
-                    prompt_text, _, _ = extract_prompt_components(self.model_config, chunk_prompt)
-
-                    if not has_submitted_first_chunk:
-                        await self.engine.add_request_async(
-                            request_id=request_id,
-                            prompt=chunk_prompt,
-                            prompt_text=prompt_text,
-                            sampling_params_list=chunk_sampling_params_list,
-                            final_stage_id=final_stage_id,
-                            resumable=True,
-                        )
-                        has_submitted_first_chunk = True
-                    else:
-                        await self.engine.add_streaming_update_async(
-                            request_id=request_id,
-                            prompt=chunk_prompt,
-                            prompt_text=prompt_text,
-                            sampling_params_list=chunk_sampling_params_list,
-                            final_stage_id=final_stage_id,
-                            resumable=True,
-                        )
-            except (asyncio.CancelledError, GeneratorExit):
-                cancelled = True
-            except Exception as error:
-                await req_state.queue.put({"request_id": request_id, "error": error})
-            finally:
-                if not cancelled:
-                    # Send empty final request to indicate that inputs have
-                    # finished. Don't send if canceled (session was aborted).
-                    final_sampling_params_list = list(sampling_params_list)
-                    final_sampling_params_list[0] = stage0_params
-                    final_prompt = TokensPrompt(prompt_token_ids=[0])
-
-                    if has_submitted_first_chunk:
-                        await self.engine.add_streaming_update_async(
-                            request_id=request_id,
-                            prompt=final_prompt,
-                            prompt_text=None,
-                            sampling_params_list=final_sampling_params_list,
-                            final_stage_id=final_stage_id,
-                            resumable=False,
-                        )
-                    else:
-                        await self.engine.add_request_async(
-                            request_id=request_id,
-                            prompt=final_prompt,
-                            prompt_text=None,
-                            sampling_params_list=final_sampling_params_list,
-                            final_stage_id=final_stage_id,
-                            resumable=False,
-                        )
-
-        input_stream_task = asyncio.create_task(handle_inputs())
-        req_state.input_stream_task = input_stream_task
-        return input_stream_task
-
-    @staticmethod
-    def _validate_streaming_input_sampling_params(params: OmniSamplingParams) -> None:
-        if (
-            not isinstance(params, SamplingParams)
-            or params.n > 1
-            or params.output_kind == RequestOutputKind.FINAL_ONLY
-            or params.stop
-        ):
-            raise ValueError(
-                "Input streaming is currently supported only for SamplingParams "
-                "with n == 1, output_kind != FINAL_ONLY, and without stop strings."
-            )
-
-    async def encode(
-        self,
         prompt: Any,
-        pooling_params: PoolingParams,
-        request_id: str,
-        lora_request: LoRARequest | None = None,
-        trace_headers: dict[str, str] | None = None,
-        priority: int = 0,
-        tokenization_kwargs: dict[str, Any] | None = None,
-        reasoning_ended: bool | None = None,
-    ) -> AsyncGenerator[PoolingRequestOutput, None]:
-        """EngineClient.encode() stub.
-
-        Omni pipeline currently exposes only generate() API at orchestrator level.
-        """
-        raise NotImplementedError("AsyncOmni.encode is not implemented.")
-
-    # ==================== Processing Methods ====================
-
-    async def _process_orchestrator_results(
+        sampling_params_list: list[SamplingParams],
+        req_state: ClientRequestState,
+        metrics: OrchestratorAggregator,
+        final_stage_id_for_e2e: int,
+    ) -> AsyncGenerator[OmniRequestOutput, None]:
+        all_stages_finished = {stage_id: False for stage_id in range(final_stage_id_for_e2e + 1)}
+        submit_flag = True
+        while not all(all_stages_finished.values()):
+            for stage_id, stage in enumerate(self.stage_list[: final_stage_id_for_e2e + 1]):
+                if all_stages_finished[stage_id]:
+                    continue
+                try:
+                    result = req_state.stage_queues[stage_id].get_nowait()
+                except asyncio.QueueEmpty:
+                    await asyncio.sleep(0.001)
+                    continue
+                engine_outputs, finished, output_to_yield = self._process_single_result(
+                    result,
+                    stage,
+                    stage_id,
+                    metrics,
+                )
+                if submit_flag and stage_id == 0:
+                    submit_flag = False
+                    prompt_token_ids = engine_outputs.prompt_token_ids
+                    engine_input = copy.deepcopy(prompt)
+                    next_prompt_len = max(1, compute_talker_prompt_ids_length(prompt_token_ids))
+                    engine_input["prompt_token_ids"] = [0] * next_prompt_len
+                    engine_input["multi_modal_data"] = engine_input["mm_processor_kwargs"] = None
+                    for i in range(1, len(self.stage_list)):
+                        task = {
+                            "request_id": request_id,
+                            "engine_inputs": engine_input,
+                            "sampling_params": sampling_params_list[i],
+                        }
+                        self.stage_list[i].submit(task)
+                        metrics.stage_first_ts[i] = time.time()
+                all_stages_finished[stage_id] = finished
+
+                if output_to_yield:
+                    yield output_to_yield
+
+    async def _process_sequential_results(
         self,
         request_id: str,
-        metrics: OrchestratorMetrics,
+        req_state: ClientRequestState,
+        metrics: OrchestratorAggregator,
         final_stage_id_for_e2e: int,
-        req_start_ts: dict[str, float],
-        wall_start_ts: float,
+        sampling_params_list: list[SamplingParams],
+        prompt: Any,
     ) -> AsyncGenerator[OmniRequestOutput, None]:
-        """Read results from the Orchestrator (via the request's asyncio.Queue)
-        and yield OmniRequestOutput objects.
-
-        The Orchestrator handles all stage-to-stage transfers. This method
-        only processes final outputs that arrive on the per-request queue.
+        # Track stages that were never submitted (no inputs); skip waiting for them.
+        # This handles the fan-out topology where Stage-0 forwards to BOTH Stage-1
+        # (vision decoder) and Stage-2 (audio decoder) independently, based on
+        # which token types appeared in Stage-0 output.
+        skipped_stages: set[int] = set()
+        for stage_id, stage in enumerate(self.stage_list[: final_stage_id_for_e2e + 1]):
+            if stage_id in skipped_stages:
+                continue
+            finished = False
+            while not finished:
+                result = await req_state.queue.get()
+                assert stage_id == req_state.stage_id
+                engine_outputs, finished, output_to_yield = self._process_single_result(
+                    result,
+                    stage,
+                    stage_id,
+                    metrics,
+                )
+                if output_to_yield:
+                    yield output_to_yield
+            if not isinstance(engine_outputs, list):
+                engine_outputs = [engine_outputs]
+            stage.set_engine_outputs(engine_outputs)
+            # Forward to all subsequent stages whose engine_input_source includes
+            # this stage. Both Stage-1 (vision) and Stage-2 (audio) source from
+            # Stage-0 independently, so we must try both after Stage-0 completes.
+            any_forwarded = False
+            for next_stage_id in range(stage_id + 1, final_stage_id_for_e2e + 1):
+                next_stage: OmniStage = self.stage_list[next_stage_id]
+                if stage_id not in getattr(next_stage, "engine_input_source", []):
+                    continue
+                # Derive inputs for the next stage, record postprocess time
+                with metrics.stage_postprocess_timer(stage_id, request_id):
+                    next_inputs = next_stage.process_engine_inputs(self.stage_list, prompt)
+                sp_next: SamplingParams = sampling_params_list[next_stage_id]
+                if not next_inputs:
+                    logger.warning(
+                        "[%s] No inputs for stage-%s (request %s), skipping forward",
+                        self._name,
+                        next_stage_id,
+                        request_id,
+                    )
+                    skipped_stages.add(next_stage_id)
+                    continue
+                # Check if we have a connector for this edge
+                connector_key = (str(stage_id), str(next_stage_id))
+                connector = self.connectors.get(connector_key)
+                sent_via_connector = False
+                if connector:
+                    sent_via_connector = try_send_via_connector(
+                        connector=connector,
+                        stage_id=stage_id,
+                        next_stage_id=next_stage_id,
+                        req_id=request_id,
+                        next_inputs=next_inputs,
+                        sampling_params=sp_next,
+                        original_prompt=prompt,
+                        next_stage_queue_submit_fn=self.stage_list[next_stage_id].submit,
+                        metrics=metrics,
+                    )
+                if not sent_via_connector:
+                    # Fallback logic removed as we now enforce connector usage.
+                    # If no connector is found or send fails, we log an error and raise,
+                    # because continuing would cause the request to be silently dropped
+                    # and the orchestrator to hang waiting for completion.
+                    error_msg = (
+                        f"[{self._name}] Failed to send request {request_id} to stage-{next_stage_id} via connector. "
+                        "Configure a connector for this edge or inspect connector logs for details."
+                    )
+                    logger.error(error_msg)
+                    raise RuntimeError(error_msg)
+                logger.debug(f"[{self._name}] Forwarded request {request_id} to stage-{next_stage_id}")
+                any_forwarded = True
+            if not any_forwarded:
+                logger.debug(f"[{self._name}] Request {request_id} fully completed at stage-{stage_id}")
+
+    def _process_single_result(
+        self,
+        result: dict[str, Any],
+        stage: OmniStage,
+        stage_id: int,
+        metrics: OrchestratorAggregator,
+    ) -> tuple[Any, bool, OmniRequestOutput | None]:
         """
-        req_state = self.request_states.get(request_id)
-        if req_state is None:
-            return
+        Process a single result dictionary from a stage.
+        Returns:
+            engine_outputs: The decoded outputs.
+            finished: Whether the stage processing is finished for this request.
+            output_to_yield: An OmniRequestOutput to yield, or None.
+        """
+        req_id = result.get("request_id")
 
-        while True:
-            result = await req_state.queue.get()
+        if result.get("skipped"):
+            logger.info(f"[{self._name}] Stage {stage_id} skipped request {req_id} (no engine inputs)")
 
-            stage_id = result.get("stage_id", 0)
+            class _SkippedOutput:
+                finished = True
+                prompt_token_ids = []
 
-            # Check for errors
-            if "error" in result:
-                logger.error(
-                    "[AsyncOmni] Orchestrator error for req=%s stage-%s: %s",
-                    request_id,
-                    stage_id,
-                    result["error"],
-                )
-                raise RuntimeError(result)
-
-            # Process the result (constructs OmniRequestOutput)
-            output_to_yield = self._process_single_result(
-                result,
-                stage_id,
-                metrics,
-                req_start_ts,
-                wall_start_ts,
-                final_stage_id_for_e2e,
-            )
+            return _SkippedOutput(), True, None
 
-            if output_to_yield:
-                logger.debug(
-                    "[AsyncOmni] req=%s stage-%s yielding final_output_type=%s",
-                    request_id,
-                    stage_id,
-                    getattr(output_to_yield, "final_output_type", None),
+        if "error" in result:
+            logger.error(
+                f"[{self._name}] Stage {stage_id} error on request {req_id}: {result['error']}",
+            )
+            raise RuntimeError(result)
+
+        engine_outputs = _load(result, obj_key="engine_outputs", shm_key="engine_outputs_shm")
+        if isinstance(engine_outputs, list):
+            engine_outputs = engine_outputs[0]
+
+        finished = engine_outputs.finished
+
+        output_to_yield = None
+
+        if getattr(stage, "final_output", False):
+            # Construct output to yield
+            images = []
+            if stage.final_output_type == "image":
+                if isinstance(engine_outputs, OmniRequestOutput) and engine_outputs.images:
+                    images = engine_outputs.images
+                elif hasattr(engine_outputs, "images") and engine_outputs.images:
+                    images = engine_outputs.images
+
+            if stage.final_output_type == "image":
+                output_to_yield = OmniRequestOutput(
+                    stage_id=stage_id,
+                    final_output_type=stage.final_output_type,
+                    request_output=engine_outputs,
+                    images=images,
+                    finished=finished,
                 )
-                yield output_to_yield
-
-            # The Orchestrator sets "finished" when the final stage is done
-            if result.get("finished"):
-                break
+            else:
+                output_to_yield = OmniRequestOutput(
+                    stage_id=stage_id,
+                    final_output_type=stage.final_output_type,
+                    request_output=engine_outputs,
+                    finished=finished,
+                )
+        # Mark last output time
+        metrics.stage_last_ts[stage_id] = max(metrics.stage_last_ts[stage_id] or 0.0, time.time())
+
+        metrics.process_stage_metrics(
+            result=result,
+            stage_type=stage.stage_type,
+            stage_id=stage_id,
+            req_id=req_id,
+            engine_outputs=engine_outputs,
+            finished=finished,
+            final_output_type=stage.final_output_type,
+            output_to_yield=output_to_yield,
+        )
 
-    # ==================== Output Handler ====================
+        logger.debug(
+            f"[{self._name}] Stage-{stage_id} completed request {req_id}; forwarding or finalizing",
+        )
 
-    def _final_output_handler(self) -> None:
-        """Start the final output handler if not already running.
+        return engine_outputs, finished, output_to_yield
 
-        This handler reads messages from the Orchestrator output queue and
-        routes them to per-request asyncio.Queues.
-        """
-        if self.final_output_task is not None:
+    def _run_output_handler(self) -> None:
+        """Start the task routing stage outputs to per-request queues (no-op if already started)."""
+        if self.output_handler is not None:
             return
 
-        engine = self.engine
+        stage_list = self.stage_list
+        request_states = self.request_states
 
-        async def _final_output_loop():
-            """Background coroutine that dispatches final outputs to request queues."""
+        async def output_handler():
             try:
                 while True:
-                    msg = await engine.try_get_output_async()
-                    if msg is None:
-                        await asyncio.sleep(_FINAL_OUTPUT_IDLE_SLEEP_S)
-                        continue
-
-                    should_continue, _, stage_id, req_state = self._handle_output_message(msg)
-                    if should_continue:
-                        continue
-
-                    req_state.stage_id = stage_id
-
-                    # Route to the per-request queue
-                    await req_state.queue.put(msg)
-
-            except asyncio.CancelledError:
-                raise
+                    idle = True
+                    for stage_id, stage in enumerate(stage_list):
+                        result = stage.try_collect()
+                        if result is None:
+                            continue
+                        idle = False
+                        if result.get("type") == "stage_ready":
+                            # Stage initialized slower than expected; wait briefly and move on.
+                            await asyncio.sleep(0.05)
+                            continue
+                        req_id = result.get("request_id")
+                        req_state = request_states.get(req_id)
+                        if req_state is None:
+                            logger.debug(
+                                f"[{self._name}] Request may have been aborted; "
+                                f"dropping output for req {req_id} at stage-{stage_id}"
+                            )
+                            continue
+                        if hasattr(req_state, "stage_queues") and stage_id in req_state.stage_queues:
+                            await req_state.stage_queues[stage_id].put(result)
+                        else:
+                            # Fallback to old behavior for compatibility. Record stage_id before
+                            # enqueueing so the consumer never observes a stale value.
+                            req_state.stage_id = stage_id
+                            await req_state.queue.put(result)
+                    # Back off briefly when idle to avoid CPU overload; otherwise just yield.
+                    await asyncio.sleep(0.001 if idle else 0)
             except Exception as e:
-                logger.exception("[AsyncOmni] final_output_loop failed.")
-                for req_state in list(self.request_states.values()):
+                logger.exception("AsyncOmni output_handler failed.")
+                # Iterate over a snapshot in case states are added/removed while we await.
+                for req_state in list(request_states.values()):
                     error_msg = {"request_id": req_state.request_id, "error": str(e)}
-                    await req_state.queue.put(error_msg)
-                self.final_output_task = None
-
-        self.final_output_task = asyncio.create_task(_final_output_loop())
-        logger.debug("[AsyncOmni] Final output handler started")
+                    # Prefer per-stage queues; fall back to the shared queue when there are none.
+                    stage_queues = getattr(req_state, "stage_queues", None)
+                    if stage_queues:
+                        for queue in stage_queues.values():
+                            await queue.put(error_msg)
+                    else:
+                        await req_state.queue.put(error_msg)
+                self.output_handler = None  # Make possible for restart
 
-    # ==================== Control Methods ====================
+        self.output_handler = asyncio.create_task(output_handler())
 
-    async def collective_rpc(
-        self,
-        method: str,
-        timeout: float | None = None,
-        args: tuple[Any, ...] = (),
-        kwargs: dict[str, Any] | None = None,
-        stage_ids: list[int] | None = None,
-    ) -> list[Any]:
-        """Execute a best-effort control RPC on selected stages.
-
-        Unsupported stages currently return a TODO-style result dict instead of
-        failing the entire call. This keeps AsyncOmni usable while the orchestrator
-        control plane is still being filled out.
-        """
-        results = await self.engine.collective_rpc_async(
-            method=method,
-            timeout=timeout,
-            args=args,
-            kwargs=kwargs,
-            stage_ids=stage_ids,
-        )
+    @property
+    def is_running(self) -> bool:
+        # True when `_stage_in_queues` is non-empty.
+        return len(self._stage_in_queues) > 0
 
-        unsupported_stage_ids: list[int] = []
-        effective_stage_ids = stage_ids or list(range(len(results)))
-        for index, result in enumerate(results):
-            if isinstance(result, dict) and result.get("todo"):
-                unsupported_stage_ids.append(effective_stage_ids[index])
+    @property
+    def is_stopped(self) -> bool:
+        return self.errored
 
-        if unsupported_stage_ids:
-            logger.warning(
-                "[AsyncOmni] collective_rpc(%s) has TODO support on stage(s): %s",
-                method,
-                unsupported_stage_ids,
-            )
+    @property
+    def errored(self) -> bool:
+        return not self.is_running
 
-        return results
+    @property
+    def _name(self) -> str:
+        return "AsyncOrchestrator"
 
-    @staticmethod
-    def _coerce_stage_bool(result: Any) -> bool:
-        """Reduce a stage RPC result to a boolean.
+    @property
+    def is_async(self) -> bool:
+        return True
 
-        Some stage RPCs may return worker-level lists like ``[True]``;
-        diffusion wrappers usually return a plain bool.
-        """
-        if isinstance(result, list):
-            return all(bool(item) for item in result)
-        return bool(result)
+    @property
+    def dead_error(self) -> BaseException:
+        return EngineDeadError()
 
     async def abort(self, request_id: str | Iterable[str]) -> None:
-        """Abort request(s) via the Orchestrator."""
-        request_ids = [request_id] if isinstance(request_id, str) else list(request_id)
-        await self.engine.abort_async(request_ids)
-        for req_id in request_ids:
-            self.request_states.pop(req_id, None)
-        if self.log_stats:
-            logger.info("[AsyncOmni] Aborted request(s) %s", ",".join(request_ids))
-
-    async def pause_generation(
-        self,
-        *,
-        mode: PauseMode = "abort",
-        wait_for_inflight_requests: bool = False,
-        clear_cache: bool = True,
-    ) -> None:
-        """Pause generation."""
-        async with self._pause_cond:
-            if self._paused:
-                return
-            self._paused = True
+        abort_task = {"type": OmniStageTaskType.ABORT, "request_id": request_id}
+        for stage in self.stage_list:
+            stage.submit(abort_task)
+        return None
 
-        # TODO: Implement request draining if wait_for_inflight_requests
+    async def get_vllm_config(self) -> VllmConfig:
+        for stage in self.stage_list:
+            if stage.is_comprehension:
+                # Use the vllm_config received from worker process
+                if stage.vllm_config is not None:
+                    return stage.vllm_config
+        return None
 
-        if clear_cache:
-            # Clear caches for all stages.
-            await self.reset_prefix_cache(
-                reset_running_requests=not wait_for_inflight_requests,
-                reset_connector=True,
-            )
-            await self.reset_mm_cache()
-            await self.reset_encoder_cache()
+    async def get_model_config(self) -> OmniModelConfig:
+        for stage in self.stage_list:
+            if stage.is_comprehension:
+                # Use the vllm_config received from worker process
+                if stage.vllm_config is not None:
+                    return stage.vllm_config.model_config
+        return None
 
-    async def resume_generation(self) -> None:
-        """Resume generation."""
-        async with self._pause_cond:
-            self._paused = False
-            self._pause_cond.notify_all()
+    async def get_input_preprocessor(self) -> InputPreprocessor:
+        return None
 
-    async def is_paused(self) -> bool:
-        """Check if paused."""
-        async with self._pause_cond:
-            return self._paused
+    async def get_tokenizer(self) -> TokenizerLike:
+        for stage in self.stage_list:
+            if stage.is_comprehension:
+                return stage.tokenizer
+        return None
 
-    async def start_profile(
-        self,
-        profile_prefix: str | None = None,
-        stages: list[int] | None = None,
-    ) -> list[Any]:
-        """Start profiling specified stages.
+    async def is_tracing_enabled(self) -> bool:
+        for stage in self.stage_list:
+            if stage.is_comprehension:
+                return stage.is_tracing_enabled
+        return False
 
-        Uses vLLM-compatible profile(is_start=True, profile_prefix) interface.
+    @property
+    def renderer(self):
+        """Return the renderer from input_processor if available.
 
-        Args:
-            profile_prefix: Optional prefix for the trace file names.
-            stages: List of stage IDs to profile. If None, profiles all stages.
+        OMNI: Required by upstream OpenAIServingModels.__init__ which
+        accesses engine_client.renderer.
         """
-        return await self.collective_rpc(method="profile", args=(True, profile_prefix), stage_ids=stages)
-
-    async def stop_profile(self, stages: list[int] | None = None) -> list[Any]:
-        """Stop profiling specified stages.
+        return self.input_processor.renderer
 
-        Uses vLLM-compatible profile(is_start=False) interface.
+    async def do_log_stats(self) -> None:
+        pass
 
-        Args:
-            stages: List of stage IDs to profile. If None, stops all stages.
-        """
-        return await self.collective_rpc(method="profile", args=(False, None), stage_ids=stages)
+    async def check_health(self) -> None:
+        pass
 
     async def reset_mm_cache(self) -> None:
-        """Reset the multi-modal cache for all stages.
-
-        TODO: Forward to Orchestrator process via message.
-        """
-        logger.warning("[AsyncOmni] reset_mm_cache not yet supported with Orchestrator process")
-
-    async def reset_encoder_cache(self) -> None:
-        """Reset the encoder cache for all stages.
-
-        TODO: Forward to Orchestrator process via message.
-        """
-        logger.warning("[AsyncOmni] reset_encoder_cache not yet supported with Orchestrator process")
-
-    async def reset_prefix_cache(
-        self,
-        reset_running_requests: bool = False,
-        reset_connector: bool = False,
-    ) -> bool:
-        """Reset the prefix cache for all stages.
-
-        TODO: Forward to Orchestrator process via message.
-        """
-        logger.warning("[AsyncOmni] reset_prefix_cache not yet supported with Orchestrator process")
-        return True
+        pass
 
-    async def sleep(self, level: int = 1, mode: PauseMode = "abort") -> None:
-        """Sleep all stages.
+    async def reset_prefix_cache(self, reset_running_requests: bool = False, reset_connector: bool = False) -> bool:
+        return True  # No-op; return a bool as annotated rather than an implicit None.
 
-        Best-effort: unsupported stages will emit a TODO result.
-        """
-        self._is_sleeping = True
-        await self.collective_rpc(method="sleep", args=(level,))
+    async def sleep(self, level: int = 1) -> None:
+        pass
 
     async def wake_up(self, tags: list[str] | None = None) -> None:
-        """Wake up all stages.
-
-        Best-effort: unsupported stages will emit a TODO result.
-        """
-        self._is_sleeping = False
-        await self.collective_rpc(method="wake_up", args=(tags,))
+        pass
 
     async def is_sleeping(self) -> bool:
-        """Return whether all stages are sleeping.
-
-        TODO(AsyncOmni): query the orchestrator once all stage backends expose
-        a real sleeping-state RPC. For now we track the requested state locally.
-        """
-        return self._is_sleeping
+        """Check whether the engine is sleeping"""
+        return False
 
     async def add_lora(self, lora_request: LoRARequest) -> bool:
-        """Load a new LoRA adapter into all stages.
-
-        Returns True only if all concretely-implemented stages report success.
-        """
-        results = await self.collective_rpc(method="add_lora", args=(lora_request,))
-        concrete_results = [r for r in results if not (isinstance(r, dict) and r.get("todo"))]
-        return all(self._coerce_stage_bool(r) for r in concrete_results) if concrete_results else False
-
-    async def remove_lora(self, adapter_id: int) -> bool:
-        """Remove a LoRA adapter from all stages.
+        """Load a new LoRA adapter into the engine for future requests."""
+        return False
 
-        TODO(AsyncOmni): add richer per-stage error reporting to the public API.
-        """
-        results = await self.collective_rpc(method="remove_lora", args=(adapter_id,))
-        concrete_results = [r for r in results if not (isinstance(r, dict) and r.get("todo"))]
-        return all(self._coerce_stage_bool(r) for r in concrete_results) if concrete_results else False
-
-    async def list_loras(self) -> list[int]:
-        """List all loaded LoRA adapter IDs across stages."""
-        results = await self.collective_rpc(method="list_loras")
-        merged: set[int] = set()
-        for result in results:
-            if isinstance(result, dict) and result.get("todo"):
-                continue
-            if isinstance(result, list):
-                merged.update(result)
-        return sorted(merged)
+    async def encode(
+        self,
+        *args,
+        **kwargs,
+    ):
+        """Generate outputs for a request from a pooling model."""
+        raise NotImplementedError("encode() is not implemented for AsyncOmni")
 
-    async def pin_lora(self, adapter_id: int) -> bool:
-        """Pin a LoRA adapter across stages."""
-        results = await self.collective_rpc(method="pin_lora", args=(adapter_id,))
-        concrete_results = [r for r in results if not (isinstance(r, dict) and r.get("todo"))]
-        return all(self._coerce_stage_bool(r) for r in concrete_results) if concrete_results else False
+    async def start_profile(self, stages: list[int] | None = None) -> None:
+        """Start profiling for specified stages.
 
-    # ==================== Properties ====================
+        Async wrapper around the base implementation for API consistency.
 
-    @property
-    def is_running(self) -> bool:
-        """Check if the engine is running."""
-        return self.final_output_task is not None and not self.final_output_task.done()
+        Args:
+            stages: List of stage IDs to start profiling. If None, starts
+                profiling for all stages that have profiling enabled.
+
+        Example:
+            >>> await async_omni.start_profile()
+            >>> async for output in async_omni.generate(...):
+            ...     pass
+            >>> await async_omni.stop_profile()
+        """
+        super().start_profile(stages)
 
-    @property
-    def errored(self) -> bool:
-        """Whether orchestrator thread has stopped unexpectedly."""
-        return not self.engine.is_alive()
+    async def stop_profile(self, stages: list[int] | None = None) -> None:
+        """Stop profiling for specified stages.
 
-    @property
-    def is_stopped(self) -> bool:
-        """EngineClient abstract property implementation."""
-        return self.errored
+        Async wrapper around the base implementation for API consistency.
 
-    @property
-    def dead_error(self) -> BaseException:
-        """EngineClient abstract property implementation."""
-        return EngineDeadError()
+        Args:
+            stages: List of stage IDs to stop profiling. If None, stops
+                profiling for all stages.
+
+        Example:
+            >>> await async_omni.start_profile()
+            >>> async for output in async_omni.generate(...):
+            ...     pass
+            >>> await async_omni.stop_profile()
+        """
+        super().stop_profile(stages)
 
-    # ==================== EngineClient Interface ====================
+    async def pause_generation(
+        self,
+        *,
+        wait_for_inflight_requests: bool = False,
+        clear_cache: bool = True,
+    ) -> None:
+        """
+        Pause generation to allow model weight updates.
 
-    async def get_input_preprocessor(self) -> InputPreprocessor:
-        """Get input preprocessor."""
-        return self.input_processor
+        New generation/encoding requests are blocked until resume.
 
-    async def get_tokenizer(self) -> TokenizerLike:
-        """Get tokenizer for the comprehension stage."""
-        stage_index = self._get_comprehension_stage_index()
-        if stage_index is not None:
-            tokenizer = self.engine.output_processors[stage_index].tokenizer
-            if tokenizer is not None:
-                return tokenizer
-        return self.input_processor.tokenizer  # type: ignore[return-value]
+        Args:
+            wait_for_inflight_requests: Currently has no effect. In-flight
+                requests are never aborted and always complete naturally;
+                only new requests are blocked (abortion is still a TODO).
+            clear_cache: Whether to reset the prefix cache and the multimodal
+                cache once the pause flag is set. Set to ``False`` to preserve
+                them for faster resume. Default is ``True`` (reset caches).
+        """
 
-    async def is_tracing_enabled(self) -> bool:
-        """Check if tracing is enabled."""
-        return False
+        async with self._pause_cond:
+            if self._paused:
+                return
+            self._paused = True
 
-    async def do_log_stats(self) -> None:
-        """Log statistics.
+        # Note: AsyncOmni uses a stage-based architecture without a central
+        # output_processor. For now, we simply set the pause flag and let
+        # new requests wait. In-flight requests will complete naturally.
+        # TODO: Implement request abortion for stages if needed.
 
-        TODO: Forward to Orchestrator process via message.
-        """
-        pass
+        # Clear cache if requested
+        if clear_cache:
+            await self.reset_prefix_cache()
+            await self.reset_mm_cache()
 
-    async def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
-        """Return the task set exposed by the orchestrator-backed engine."""
-        return tuple(self.engine.supported_tasks)
+    async def resume_generation(self) -> None:
+        """Resume generation after :meth:`pause_generation`."""
 
-    async def check_health(self) -> None:
-        """Check engine health by verifying the Orchestrator process is alive."""
-        OmniBase.check_health(self)
+        async with self._pause_cond:
+            self._paused = False
+            self._pause_cond.notify_all()  # Wake up all waiting requests
 
-    # ==================== Shutdown ====================
+    async def is_paused(self) -> bool:
+        """Return whether the engine is currently paused."""
 
-    def shutdown(self, timeout: float | None = None) -> None:
-        """Shutdown the engine."""
-        if self.final_output_task is not None:
-            self.final_output_task.cancel()
-            self.final_output_task = None
-        OmniBase.shutdown(self)
+        async with self._pause_cond:
+            return self._paused
diff --git a/vllm_omni/entrypoints/async_omni_diffusion.py b/vllm_omni/entrypoints/async_omni_diffusion.py
new file mode 100644
index 00000000000..9c5860405a1
--- /dev/null
+++ b/vllm_omni/entrypoints/async_omni_diffusion.py
@@ -0,0 +1,491 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""
+Async entrypoint for vLLM-Omni diffusion model inference.
+
+Provides an asynchronous interface for running diffusion models,
+enabling concurrent request handling and streaming generation.
+"""
+
+import asyncio
+import uuid
+import weakref
+from collections.abc import AsyncGenerator, Iterable
+from concurrent.futures import ThreadPoolExecutor
+from typing import Any
+
+from vllm.logger import init_logger
+from vllm.transformers_utils.config import get_hf_file_to_dict
+
+try:
+    from huggingface_hub.errors import HFValidationError as _HFValidationError
+except ImportError:
+    _HFValidationError = ValueError
+
+from vllm_omni.diffusion.data import (
+    DiffusionRequestAbortedError,
+    OmniDiffusionConfig,
+    TransformerConfig,
+)
+from vllm_omni.diffusion.diffusion_engine import DiffusionEngine
+from vllm_omni.diffusion.request import OmniDiffusionRequest
+from vllm_omni.inputs.data import OmniDiffusionSamplingParams, OmniPromptType
+from vllm_omni.lora.request import LoRARequest
+from vllm_omni.outputs import OmniRequestOutput
+
+logger = init_logger(__name__)
+
+
+def _weak_close_async_omni_diffusion(engine: DiffusionEngine, executor: ThreadPoolExecutor) -> None:
+    """Best-effort diffusion cleanup for GC finalization."""
+    try:
+        engine.close()
+    except Exception:
+        pass
+    try:
+        executor.shutdown(wait=False)
+    except Exception:
+        pass
+
+
+class AsyncOmniDiffusion:
+    """Async entry point for vLLM-Omni diffusion model inference.
+
+    This class provides an asynchronous interface for running diffusion models,
+    enabling concurrent request handling. It wraps the DiffusionEngine and
+    provides async methods for image generation.
+
+    Args:
+        model: Model name or path to load
+        od_config: Optional OmniDiffusionConfig. If not provided, it will be
+            created from kwargs
+        **kwargs: Additional keyword arguments passed to OmniDiffusionConfig
+
+    Example:
+        >>> async_diffusion = AsyncOmniDiffusion(model="Qwen/Qwen-Image")
+        >>> result = await async_diffusion.generate(
+        ...     prompt="A beautiful sunset over the ocean",
+        ...     request_id="req-1",
+        ... )
+        >>> print(result.images)
+    """
+
+    def __init__(
+        self,
+        model: str,
+        od_config: OmniDiffusionConfig | None = None,
+        batch_size: int = 1,
+        **kwargs: Any,
+    ):
+        self.model = model
+
+        # Set batch size (default 1 for backward compatibility)
+        self._batch_size = max(1, batch_size)
+
+        # Capture stage info from kwargs before they might be filtered out
+        stage_id = kwargs.get("stage_id")
+        engine_input_source = kwargs.get("engine_input_source")
+        cfg_kv_collect_func = kwargs.pop("cfg_kv_collect_func", None)
+
+        # Build config
+        if od_config is None:
+            od_config = OmniDiffusionConfig.from_kwargs(model=model, **kwargs)
+        elif isinstance(od_config, dict):
+            # If config is dict, check it too (priority to kwargs if both exist)
+            if stage_id is None:
+                stage_id = od_config.get("stage_id")
+            if engine_input_source is None:
+                engine_input_source = od_config.get("engine_input_source")
+            od_config = OmniDiffusionConfig.from_kwargs(**od_config)
+
+        self.od_config = od_config
+
+        # Inject stage info into omni_kv_config if present
+        if stage_id is not None:
+            self.od_config.omni_kv_config.setdefault("stage_id", stage_id)
+        if engine_input_source is not None:
+            self.od_config.omni_kv_config.setdefault("engine_input_source", engine_input_source)
+
+        # Diffusers-style models expose `model_index.json` with `_class_name`.
+        # Non-diffusers models (e.g. Bagel, NextStep) only have `config.json`,
+        # so we fall back to reading that and mapping model_type manually.
+        try:
+            config_dict = get_hf_file_to_dict("model_index.json", od_config.model)
+            if config_dict is not None:
+                if od_config.model_class_name is None:
+                    od_config.model_class_name = config_dict.get("_class_name", None)
+                od_config.update_multimodal_support()
+
+                tf_config_dict = get_hf_file_to_dict("transformer/config.json", od_config.model)
+                od_config.tf_model_config = TransformerConfig.from_dict(tf_config_dict)
+            else:
+                raise FileNotFoundError("model_index.json not found")
+        except (AttributeError, KeyError, OSError, ValueError, FileNotFoundError, _HFValidationError):
+            cfg = get_hf_file_to_dict("config.json", od_config.model)
+            if cfg is None:
+                if od_config.model_class_name is not None:
+                    cfg = {}  # skip - use explicit model_class_name
+                else:
+                    raise ValueError(f"Could not find config.json or model_index.json for model {od_config.model}")
+
+            od_config.tf_model_config = TransformerConfig.from_dict(cfg)
+            model_type = cfg.get("model_type")
+            architectures = cfg.get("architectures") or []
+            # Bagel/NextStep models don't have a model_index.json, so we set the pipeline class name manually
+            if model_type == "bagel" or "BagelForConditionalGeneration" in architectures:
+                od_config.model_class_name = "BagelPipeline"
+                od_config.tf_model_config = TransformerConfig()
+                od_config.update_multimodal_support()
+            elif model_type == "nextstep":
+                if od_config.model_class_name is None:
+                    od_config.model_class_name = "NextStep11Pipeline"
+                od_config.tf_model_config = TransformerConfig()
+                od_config.update_multimodal_support()
+            elif architectures and len(architectures) == 1:
+                if od_config.model_class_name is None:
+                    od_config.model_class_name = architectures[0]
+            elif od_config.model_class_name is None:
+                raise
+
+        if cfg_kv_collect_func is not None:
+            od_config.cfg_kv_collect_func = cfg_kv_collect_func
+
+        # Initialize engine
+        self.engine: DiffusionEngine = DiffusionEngine.make_engine(od_config)
+
+        # Thread pool for running sync engine in async context
+        self._executor = ThreadPoolExecutor(max_workers=1)
+        self._closed = False
+        self._weak_finalizer = weakref.finalize(
+            self,
+            _weak_close_async_omni_diffusion,
+            self.engine,
+            self._executor,
+        )
+
+        logger.info("AsyncOmniDiffusion initialized with model: %s, batch_size: %d", model, self._batch_size)
+
+    # ------------------------------------------------------------------
+    # batch_size property
+    # ------------------------------------------------------------------
+
+    @property
+    def batch_size(self) -> int:
+        """Return the configured batch size for request batching."""
+        return self._batch_size
+
+    @batch_size.setter
+    def batch_size(self, value: int) -> None:
+        if not isinstance(value, int) or value < 1:
+            raise ValueError("batch_size must be a positive integer")
+        self._batch_size = value
+
+    # ------------------------------------------------------------------
+    # Public batch generation API
+    # ------------------------------------------------------------------
+
+    async def generate_batch(
+        self,
+        prompts: list[OmniPromptType],
+        sampling_params: OmniDiffusionSamplingParams,
+        request_id: str | None = None,
+        lora_request: LoRARequest | None = None,
+    ) -> OmniRequestOutput:
+        """Generate images from multiple prompts in a single engine call.
+
+        Batches the given prompts into **one** ``DiffusionEngine.step()``
+        call and returns a single ``OmniRequestOutput`` containing all
+        generated images.  Called by ``StageDiffusionClient._run_batch``
+        when the orchestrator receives a list-prompt request.
+
+        Args:
+            prompts: List of text prompts describing the desired images.
+            sampling_params: Shared sampling parameters for all prompts.
+            request_id: Optional unique identifier. Auto-generated when *None*.
+            lora_request: Optional LoRA adapter to apply.
+
+        Returns:
+            A single ``OmniRequestOutput`` with all images combined.
+        """
+        if request_id is None:
+            request_id = f"diff-batch-{uuid.uuid4().hex[:8]}"
+        return await self._generate_batch(prompts, sampling_params, request_id, lora_request)
+
+    # ------------------------------------------------------------------
+    # Internal batch generation
+    # ------------------------------------------------------------------
+
+    async def _generate_batch(
+        self,
+        prompts: list[OmniPromptType],
+        sampling_params: OmniDiffusionSamplingParams,
+        request_id: str,
+        lora_request: LoRARequest | None = None,
+    ) -> OmniRequestOutput:
+        """Generate images from multiple prompts in a single engine call."""
+        if not prompts:
+            return OmniRequestOutput(request_id=request_id, images=[], final_output_type="image")
+
+        if sampling_params.guidance_scale:
+            sampling_params.guidance_scale_provided = True
+
+        if lora_request is not None:
+            sampling_params.lora_request = lora_request
+
+        request = OmniDiffusionRequest(
+            prompts=prompts,
+            sampling_params=sampling_params,
+            request_ids=[f"{request_id}-{i}" for i in range(len(prompts))],
+        )
+
+        logger.debug("Starting batch generation for %d prompts, request_id=%s", len(prompts), request_id)
+
+        loop = asyncio.get_event_loop()
+        try:
+            results = await loop.run_in_executor(
+                self._executor,
+                self.engine.step,
+                request,
+            )
+        except Exception as e:
+            logger.error("Batch generation failed for request %s: %s", request_id, e)
+            raise RuntimeError(f"Diffusion batch generation failed: {e}") from e
+
+        # Combine all per-prompt results into a single OmniRequestOutput
+        all_images = []
+        for result in results:
+            all_images.extend(result.images)
+
+        return OmniRequestOutput(
+            request_id=request_id,
+            images=all_images,
+            final_output_type="image",
+            finished=True,
+        )
+
+    def get_diffusion_od_config(self) -> OmniDiffusionConfig:
+        """Return the diffusion config used by this engine."""
+        return self.od_config
+
+    # ------------------------------------------------------------------
+    # Public generate API
+    # ------------------------------------------------------------------
+
+    async def generate(
+        self,
+        prompt: OmniPromptType,
+        sampling_params: OmniDiffusionSamplingParams,
+        request_id: str | None = None,
+        lora_request: LoRARequest | None = None,
+    ) -> OmniRequestOutput:
+        """Generate images asynchronously from a single text prompt.
+
+        For batched generation (multiple prompts in one engine call), use
+        :meth:`generate_batch` instead.  This method always processes
+        exactly one prompt per call.
+
+        Args:
+            prompt: Text prompt describing the desired image
+            sampling_params: Sampling parameters
+            request_id: Optional unique identifier for tracking the request
+            lora_request: Optional LoRA adapter to apply
+
+        Returns:
+            OmniRequestOutput containing generated images
+
+        Raises:
+            RuntimeError: If generation fails (aborts and cancellation propagate unchanged)
+        """
+        if request_id is None:
+            request_id = f"diff-{uuid.uuid4().hex[:16]}"
+        if sampling_params.guidance_scale:
+            sampling_params.guidance_scale_provided = True
+
+        if lora_request is not None:
+            sampling_params.lora_request = lora_request
+
+        # Extract additional_information from OmniTokensPrompt into extra dict
+        # (carries audio_tokens, vision_tokens, etc. from thinker2*_decoder processors)
+        extra: dict = {}
+        if isinstance(prompt, dict) and prompt.get("additional_information"):
+            extra.update(prompt["additional_information"])
+        elif hasattr(prompt, "additional_information") and prompt.additional_information:
+            extra.update(prompt.additional_information)
+
+        request = OmniDiffusionRequest(
+            prompts=[prompt],
+            sampling_params=sampling_params,
+            request_ids=[request_id],
+            extra=extra if extra else {},
+        )
+
+        logger.debug("Starting generation for request %s", request_id)
+
+        loop = asyncio.get_event_loop()
+        try:
+            result = await loop.run_in_executor(
+                self._executor,
+                self.engine.step,
+                request,
+            )
+            result = result[0]
+        except asyncio.CancelledError:
+            self.engine.abort(request_id)
+            raise
+        except DiffusionRequestAbortedError:
+            raise
+        except Exception as e:
+            logger.error("Generation failed for request %s: %s", request_id, e)
+            raise RuntimeError(f"Diffusion generation failed: {e}") from e
+
+        if not result.request_id:
+            result.request_id = request_id
+        return result
+
+    async def generate_stream(
+        self,
+        prompt: str,
+        request_id: str | None = None,
+        **kwargs: Any,
+    ) -> AsyncGenerator[OmniRequestOutput, None]:
+        """Generate images with streaming progress updates.
+
+        Currently, diffusion models don't support true streaming, so this
+        yields a single result after generation completes. Future implementations
+        may support step-by-step progress updates.
+
+        Args:
+            prompt: Text prompt describing the desired image
+            request_id: Optional unique identifier for tracking the request
+            **kwargs: Forwarded to :meth:`generate`; must include ``sampling_params``
+
+        Yields:
+            OmniRequestOutput with generation progress/results
+        """
+        result = await self.generate(prompt=prompt, request_id=request_id, **kwargs)
+        yield result
+
+    def close(self) -> None:
+        """Close the engine and release resources.
+
+        Should be called when done using the AsyncOmniDiffusion instance.
+        """
+        if self._closed:
+            return
+        self._closed = True
+
+        finalizer = getattr(self, "_weak_finalizer", None)
+        if finalizer is not None and finalizer.alive:
+            finalizer.detach()
+
+        try:
+            self.engine.close()
+        except Exception as e:
+            logger.warning("Error closing diffusion engine: %s", e)
+
+        try:
+            self._executor.shutdown(wait=False)
+        except Exception as e:
+            logger.warning("Error shutting down executor: %s", e)
+
+        logger.info("AsyncOmniDiffusion closed")
+
+    def shutdown(self) -> None:
+        """Alias for close() method."""
+        self.close()
+
+    async def abort(self, request_id: str | Iterable[str]) -> None:
+        """Abort the request(s) identified by ``request_id`` (one ID or an iterable of IDs)."""
+        self.engine.abort(request_id)
+
+    @property
+    def is_running(self) -> bool:
+        """Check if the engine is running."""
+        return not self._closed
+
+    @property
+    def is_stopped(self) -> bool:
+        """Check if the engine is stopped."""
+        return self._closed
+
+    async def remove_lora(self, adapter_id: int) -> bool:
+        """Remove a LoRA adapter."""
+        loop = asyncio.get_event_loop()
+        results = await loop.run_in_executor(
+            self._executor,
+            self.engine.collective_rpc,
+            "remove_lora",
+            None,
+            (adapter_id,),
+            {},
+            None,
+        )
+        return all(results) if isinstance(results, list) else results
+
+    async def add_lora(self, lora_request: LoRARequest) -> bool:
+        """Add a LoRA adapter"""
+        loop = asyncio.get_event_loop()
+        results = await loop.run_in_executor(
+            self._executor,
+            self.engine.collective_rpc,
+            "add_lora",
+            None,
+            (),
+            {"lora_request": lora_request},
+            None,
+        )
+        return all(results) if isinstance(results, list) else results
+
+    async def list_loras(self) -> list[int]:
+        """List all registered LoRA adapter IDs."""
+        loop = asyncio.get_event_loop()
+        results = await loop.run_in_executor(
+            self._executor,
+            self.engine.collective_rpc,
+            "list_loras",
+            None,
+            (),
+            {},
+            None,
+        )
+        # collective_rpc returns list from workers; flatten unique ids
+        if not isinstance(results, list):
+            return results or []
+        merged: set[int] = set()
+        for part in results:
+            merged.update(part or [])
+        return sorted(merged)
+
+    async def pin_lora(self, lora_id: int) -> bool:
+        """Prevent an adapter from being evicted."""
+        loop = asyncio.get_event_loop()
+        results = await loop.run_in_executor(
+            self._executor,
+            self.engine.collective_rpc,
+            "pin_lora",
+            None,
+            (),
+            {"adapter_id": lora_id},
+            None,
+        )
+        return all(results) if isinstance(results, list) else results
+
+    async def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> None:
+        """Start or stop profiling for the diffusion model.
+
+        Args:
+            is_start: True to start profiling, False to stop.
+            profile_prefix: Optional prefix for trace filename (vLLM compat).
+
+        Note:
+            Matches vLLM's worker.profile() signature for consistency.
+            Traces are saved automatically via on_trace_ready callback.
+        """
+        loop = asyncio.get_event_loop()
+        await loop.run_in_executor(
+            self._executor,
+            self.engine.profile,
+            is_start,
+            profile_prefix,
+        )
diff --git a/vllm_omni/entrypoints/async_omni_llm.py b/vllm_omni/entrypoints/async_omni_llm.py
new file mode 100644
index 00000000000..14a02f1819b
--- /dev/null
+++ b/vllm_omni/entrypoints/async_omni_llm.py
@@ -0,0 +1,225 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import asyncio
+import os
+import socket
+from typing import TYPE_CHECKING
+
+import torch
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
+from vllm.tokenizers import cached_tokenizer_from_config
+from vllm.tracing import init_tracer
+from vllm.transformers_utils.config import maybe_register_config_serialize_by_value
+from vllm.usage.usage_lib import UsageContext
+from vllm.utils.func_utils import deprecate_kwargs
+from vllm.v1.engine.async_llm import AsyncLLM
+from vllm.v1.engine.core_client import EngineCoreClient
+from vllm.v1.executor.abstract import Executor
+from vllm.v1.metrics.loggers import StatLoggerFactory, StatLoggerManager
+
+from vllm_omni.engine.arg_utils import AsyncOmniEngineArgs
+from vllm_omni.engine.input_processor import OmniInputProcessor
+from vllm_omni.engine.output_processor import MultimodalOutputProcessor
+
+if TYPE_CHECKING:
+    pass
+
+logger = init_logger(__name__)
+
+
+class AsyncOmniLLM(AsyncLLM):
+    """Async single-stage LLM engine for use within a stage worker process.
+
+    This class extends the base vLLM AsyncLLM class with omni-specific
+    processors for handling multimodal inputs and outputs. It is used
+    internally by AsyncOmniStage workers and should not be instantiated
+    directly by users.
+
+    Args:
+        engine_args: AsyncOmniEngineArgs containing engine configuration
+        vllm_config: Global vLLM configuration
+        executor_class: Executor implementation class, e.g. MultiprocExecutor
+        log_stats: Whether to log statistics
+        usage_context: Usage context of the LLM (default: ENGINE_CONTEXT)
+        mm_registry: Multi-modal registry for processing multimodal inputs
+        use_cached_outputs: Whether to use cached outputs
+        log_requests: Whether to log requests
+        start_engine_loop: Whether to start the engine loop automatically
+        stat_loggers: Customized stat loggers for the engine.
+            If not provided, default stat loggers will be used.
+            Note: Stat logger interface may change in V1.
+        client_addresses: Optional dictionary mapping client names to addresses
+        client_count: Total number of clients (default: 1)
+        client_index: Index of this client (default: 0)
+    """
+
+    def __init__(
+        self,
+        engine_args: AsyncOmniEngineArgs,
+        vllm_config: VllmConfig,
+        executor_class: type[Executor],
+        log_stats: bool,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+        mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
+        use_cached_outputs: bool = False,
+        log_requests: bool = True,
+        start_engine_loop: bool = True,
+        stat_loggers: list[StatLoggerFactory] | None = None,
+        client_addresses: dict[str, str] | None = None,
+        client_count: int = 1,
+        client_index: int = 0,
+    ) -> None:
+        """
+        Create an AsyncOmniLLM.
+
+        Args:
+            vllm_config: global configuration.
+            executor_class: an Executor impl, e.g. MultiprocExecutor.
+            log_stats: Whether to log stats.
+            usage_context: Usage context of the LLM.
+            mm_registry: Multi-modal registry.
+            use_cached_outputs: Whether to use cached outputs.
+            log_requests: Whether to log requests.
+            start_engine_loop: Whether to start the engine loop.
+            stat_loggers: customized stat loggers for the engine.
+                If not provided, default stat loggers will be used.
+                PLEASE BE AWARE THAT STAT LOGGER IS NOT STABLE
+                IN V1, AND ITS BASE CLASS INTERFACE MIGHT CHANGE.
+
+        Returns:
+            None
+        """
+        # Ensure we can serialize custom transformer configs
+        maybe_register_config_serialize_by_value()
+
+        self.model_config = vllm_config.model_config
+        self.vllm_config = vllm_config
+        self.observability_config = vllm_config.observability_config
+        self.log_requests = log_requests
+
+        self.log_stats = log_stats or (stat_loggers is not None)
+        if not log_stats and stat_loggers is not None:
+            logger.info(
+                "AsyncLLM created with log_stats=False and non-empty custom logger list; "
+                "enabling logging without default stat loggers"
+            )
+
+        if self.model_config.skip_tokenizer_init:
+            tokenizer = None
+        else:
+            # Tokenizer (+ ensure liveness if running in another process).
+            tokenizer = cached_tokenizer_from_config(model_config=vllm_config.model_config)
+
+        # InputProcessor (converts Inputs --> EngineCoreRequests).
+        self.input_processor = OmniInputProcessor(
+            vllm_config=vllm_config,
+            mm_registry=mm_registry,
+        )
+
+        # OutputProcessor (converts EngineCoreOutputs --> RequestOutput).
+        self.output_processor = MultimodalOutputProcessor(
+            tokenizer=tokenizer,
+            log_stats=self.log_stats,
+            engine_core_output_type=engine_args.engine_output_type,
+        )
+
+        if self.observability_config.otlp_traces_endpoint is not None:
+            tracer = init_tracer("vllm.llm_engine", self.observability_config.otlp_traces_endpoint)
+            self.output_processor.tracer = tracer
+
+        # Pause / resume state for async RL workflows.
+        self._pause_cond = asyncio.Condition()
+        self._paused = False
+
+        # Set renderer for output handler compatibility with AsyncLLM
+        from vllm.renderers import renderer_from_config as _renderer_from_config
+
+        self.renderer = _renderer_from_config(self.vllm_config)
+
+        # EngineCore (starts the engine in background process).
+        self.engine_core = EngineCoreClient.make_async_mp_client(
+            vllm_config=vllm_config,
+            executor_class=executor_class,
+            log_stats=self.log_stats,
+            client_addresses=client_addresses,
+            client_count=client_count,
+            client_index=client_index,
+        )
+
+        # Loggers.
+        self.logger_manager: StatLoggerManager | None = None
+        if self.log_stats:
+            self.logger_manager = StatLoggerManager(
+                vllm_config=vllm_config,
+                engine_idxs=self.engine_core.engine_ranks_managed,
+                custom_stat_loggers=stat_loggers,
+                enable_default_loggers=log_stats,
+                client_count=client_count,
+            )
+            self.logger_manager.log_engine_initialized()
+
+        self.output_handler: asyncio.Task | None = None
+        try:
+            # Start output handler eagerly if we are in the asyncio eventloop.
+            asyncio.get_running_loop()
+            self._run_output_handler()
+        except RuntimeError:
+            pass
+
+        # Use profiler_config from vllm_config (new way, aligned with vllm v1)
+        if vllm_config.profiler_config.profiler == "torch" and not vllm_config.profiler_config.ignore_frontend:
+            profiler_dir = vllm_config.profiler_config.torch_profiler_dir
+            logger.info(
+                "Torch profiler enabled. AsyncOmniLLM CPU traces will be collected under %s",
+                profiler_dir,
+            )
+            worker_name = f"{socket.gethostname()}_{os.getpid()}.async_omni_llm"
+            self.profiler = torch.profiler.profile(
+                activities=[
+                    torch.profiler.ProfilerActivity.CPU,
+                ],
+                with_stack=vllm_config.profiler_config.torch_profiler_with_stack,
+                on_trace_ready=torch.profiler.tensorboard_trace_handler(
+                    profiler_dir,
+                    worker_name=worker_name,
+                    use_gzip=vllm_config.profiler_config.torch_profiler_use_gzip,
+                ),
+            )
+        else:
+            self.profiler = None
+
+    @classmethod
+    @deprecate_kwargs(
+        "disable_log_requests",
+        additional_message=("This argument will have no effect. Use `enable_log_requests` instead."),
+    )
+    def from_vllm_config(
+        cls,
+        vllm_config: VllmConfig,
+        engine_args: AsyncOmniEngineArgs,
+        start_engine_loop: bool = True,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+        stat_loggers: list[StatLoggerFactory] | None = None,
+        enable_log_requests: bool = False,
+        disable_log_stats: bool = False,
+        client_addresses: dict[str, str] | None = None,
+        client_count: int = 1,
+        client_index: int = 0,
+        disable_log_requests: bool = True,  # Deprecated, will be removed
+    ) -> "AsyncLLM":
+        # Create the async engine instance via ``cls``.
+        return cls(
+            vllm_config=vllm_config,
+            executor_class=Executor.get_class(vllm_config),
+            start_engine_loop=start_engine_loop,
+            stat_loggers=stat_loggers,
+            log_requests=enable_log_requests,
+            log_stats=not disable_log_stats,
+            usage_context=usage_context,
+            client_addresses=client_addresses,
+            client_count=client_count,
+            client_index=client_index,
+            engine_args=engine_args,
+        )
diff --git a/vllm_omni/entrypoints/cli/main.py b/vllm_omni/entrypoints/cli/main.py
index affa6c83349..b3ec90a6edd 100644
--- a/vllm_omni/entrypoints/cli/main.py
+++ b/vllm_omni/entrypoints/cli/main.py
@@ -18,20 +18,14 @@ def main():
         from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG, cli_env_setup
         from vllm.utils.argparse_utils import FlexibleArgumentParser
 
-        import vllm_omni.entrypoints.cli.benchmark.main
         import vllm_omni.entrypoints.cli.serve
 
         CMD_MODULES = [
             vllm_omni.entrypoints.cli.serve,
-            vllm_omni.entrypoints.cli.benchmark.main,
         ]
 
         cli_env_setup()
 
-        from vllm_omni.entrypoints.cli.serve import _ensure_vllm_platform
-
-        _ensure_vllm_platform()
-
         parser = FlexibleArgumentParser(
             description="vLLM OMNI CLI",
             epilog=VLLM_SUBCMD_PARSER_EPILOG.format(subcmd="[subcommand]"),
@@ -49,6 +43,7 @@ def main():
             for cmd in new_cmds:
                 cmd.subparser_init(subparsers).set_defaults(dispatch_function=cmd.cmd)
                 cmds[cmd.name] = cmd
+        sys.argv = [a for a in sys.argv if a != "--omni"]
         args = parser.parse_args()
         if args.subparser in cmds:
             cmds[args.subparser].validate(args)
diff --git a/vllm_omni/entrypoints/omni.py b/vllm_omni/entrypoints/omni.py
index a3bfe98ce2c..c2482ae5f15 100644
--- a/vllm_omni/entrypoints/omni.py
+++ b/vllm_omni/entrypoints/omni.py
@@ -1,42 +1,813 @@
-from __future__ import annotations
-
-import copy
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import json
+import multiprocessing as mp
+import os
+import threading
 import time
 import uuid
-from collections.abc import Callable, Generator, Iterable, Sequence
-from typing import TYPE_CHECKING, Literal, overload
+import weakref
+from collections.abc import Callable, Generator, Sequence
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Any, Literal, overload
 
+import huggingface_hub
+import msgspec.msgpack
+import zmq
+from omegaconf import OmegaConf
 from tqdm.auto import tqdm
+from vllm import SamplingParams
 from vllm.logger import init_logger
-from vllm.sampling_params import RequestOutputKind
+from vllm.utils.network_utils import make_zmq_socket
+from vllm.v1.utils import get_engine_client_zmq_addr
 
-from vllm_omni.entrypoints.client_request_state import ClientRequestState
-from vllm_omni.entrypoints.omni_base import OmniBase
-from vllm_omni.metrics.stats import OrchestratorAggregator as OrchestratorMetrics
+from vllm_omni.distributed.omni_connectors import (
+    get_stage_connector_config,
+    initialize_orchestrator_connectors,
+)
+from vllm_omni.distributed.omni_connectors.adapter import try_send_via_connector
+from vllm_omni.distributed.omni_connectors.utils.initialization import (
+    resolve_omni_kv_config_for_stage,
+)
+from vllm_omni.distributed.ray_utils.utils import (
+    create_placement_group,
+    get_ray_queue_class,
+    try_close_ray,
+)
+from vllm_omni.entrypoints.omni_stage import OmniStage
+from vllm_omni.entrypoints.stage_utils import SHUTDOWN_TASK, OmniStageTaskType
+from vllm_omni.entrypoints.stage_utils import maybe_load_from_ipc as _load
+from vllm_omni.entrypoints.utils import (
+    get_final_stage_id_for_e2e,
+    inject_omni_kv_config,
+    load_and_resolve_stage_configs,
+)
+from vllm_omni.entrypoints.zmq_utils import ZmqQueue
+from vllm_omni.inputs.data import OmniDiffusionSamplingParams, OmniPromptType, OmniSamplingParams
+from vllm_omni.metrics import OrchestratorAggregator, StageRequestStats
+from vllm_omni.model_executor.model_loader.weight_utils import (
+    download_weights_from_hf_specific,
+)
 from vllm_omni.outputs import OmniRequestOutput
 
-if TYPE_CHECKING:
-    from vllm_omni.inputs.data import OmniPromptType, OmniSamplingParams
-
 logger = init_logger(__name__)
 
 
+def _weak_close_cleanup(
+    stage_list,
+    stage_in_queues,
+    stage_out_queues,
+    ray_pg,
+    zmq_ctx=None,
+    handshake_stop: threading.Event | None = None,
+    zmq_handshake_socket: zmq.Socket | None = None,
+    handshake_thread: threading.Thread | None = None,
+):
+    """Weak-reference cleanup for OmniBase: stop stages, then release Ray, handshake and ZMQ resources."""
+    if stage_list:
+        for q in stage_in_queues:
+            try:
+                q.put_nowait(SHUTDOWN_TASK)
+            except Exception as e:
+                logger.warning(f"Failed to send shutdown signal to stage input queue: {e}")
+        for q in (*stage_in_queues, *stage_out_queues):
+            try:  # best effort: one failing close() must not abort the rest of the teardown
+                close_fn = getattr(q, "close", None)
+                if callable(close_fn):
+                    close_fn()
+            except Exception as e:
+                logger.warning(f"Failed to close stage queue: {e}")
+        for stage in stage_list:
+            try:
+                stage.stop_stage_worker()
+            except Exception as e:
+                logger.warning(f"Failed to stop stage worker: {e}")
+    try_close_ray(ray_pg)
+
+    # Gracefully shutdown handshake server thread
+    if handshake_stop is not None:
+        handshake_stop.set()
+    if handshake_thread is not None:
+        handshake_thread.join(timeout=2.0)
+        if handshake_thread.is_alive():
+            logger.warning("Handshake server thread did not terminate gracefully within timeout")
+
+    # Close ZMQ resources after thread has exited
+    if zmq_handshake_socket is not None:
+        zmq_handshake_socket.close(0)
+    if zmq_ctx is not None:
+        zmq_ctx.term()
+
+
+def _dummy_snapshot_download(model_id):
+    return model_id
+
+
+def omni_snapshot_download(model_id) -> str:
+    """Ensure model files are available locally (local paths are returned as-is) and return the model id/path."""
+    if os.path.exists(model_id):
+        return model_id
+    # TODO: workaround for quick modelscope use; support it in weight loading instead of `snapshot_download`.
+    # Parse the flag explicitly: a bare truthiness test would also treat "0" or "false" as enabled.
+    if os.environ.get("VLLM_USE_MODELSCOPE", "").strip().lower() not in ("", "0", "false", "no", "off"):
+        from modelscope.hub.snapshot_download import snapshot_download
+
+        return snapshot_download(model_id)
+    # For other cases (Hugging Face), perform a real download to ensure all
+    # necessary files (including *.pt for audio/diffusion) are available locally
+    # before stage workers are spawned. This prevents initialization timeouts.
+    # Return the original model_id so that model_config.model preserves
+    # HuggingFace semantics (e.g. "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice")
+    # instead of the resolved cache path.
+    try:
+        download_weights_from_hf_specific(
+            model_name_or_path=model_id,
+            cache_dir=None,
+            allow_patterns=["*"],
+            require_all=True,
+        )
+    except huggingface_hub.errors.RepositoryNotFoundError:
+        logger.warning(f"Repository not found for '{model_id}'.")
+    return model_id
+
+
+class OmniBase:
+    """Base class for serving Omni models.
+
+    Args:
+        model: Model name or path to load.
+        **kwargs: Arbitrary keyword arguments.
+            - stage_configs_path: Optional path to YAML file containing stage
+              configurations. If None, configurations are loaded from the model.
+            - log_stats: Whether to enable statistics logging (coerced to bool;
+              defaults to False).
+            - stage_init_timeout: Per-stage init watchdog (seconds). Measured from
+              when the previous stage finished (possibly a prior Omni run with GPU
+              reuse/overlap) to when the current stage starts to initialize.
+            - shm_threshold_bytes: Threshold in bytes for using shared memory
+              for IPC. Objects larger than this threshold will use shared memory.
+            - worker_backend: Backend for worker processes. Default is "multi_process".
+            - ray_address: Address of Ray cluster for Ray backend, if using Ray backend.
+            - batch_timeout: Timeout in seconds for batching requests within a stage
+            - init_timeout: Timeout in seconds for waiting for all stages to initialize
+            - Additional keyword arguments passed to stage engines.
+    """
+
+    def __init__(self, model: str, **kwargs: Any) -> None:
+        model = omni_snapshot_download(model)
+        kwargs["model"] = model
+        self._model = model  # store for use in fallback processors init
+
+        # Stage management attributes
+        self.stage_list: list[OmniStage] = []
+        self._stage_in_queues: list[Any] = []
+        self._stage_out_queues: list[Any] = []
+        self._stages_ready: set[int] = set()
+        self._ray_pg = None
+        self._queue_cls = None
+        self._ctx = None
+        self._zmq_ctx: zmq.Context | None = None
+        self._zmq_master_address: str | None = None
+        self._zmq_master_port: int | None = None
+        self._zmq_handshake_socket: zmq.Socket | None = None
+        self._handshake_thread: threading.Thread | None = None
+        self._handshake_stop: threading.Event | None = None
+        self._handshake_endpoints: dict[int, tuple[str, str]] = {}
+        self._handshake_seen: set[int] = set()  # Track which stage IDs have completed ZMQ handshake
+        self._single_stage_id: int | None = None  # Optional: deploy only a specific stage ID
+
+        # Initialize stages - each stage will create appropriate instance based on stage_type
+        # Stage workers will automatically create OmniLLM or OmniDiffusion instances
+        # based on stage_type in YAML config (handled in omni_stage.py)
+        logger.info(f"Initializing stages for model: {model}")
+        self._initialize_stages(model, kwargs)
+
+    def _get_default_cache_config(self, cache_backend: str | None) -> dict[str, Any] | None:
+        """Return the built-in default settings for ``cache_backend``, or None if it has none."""
+        builtin_defaults: dict[str, dict[str, Any]] = {
+            "cache_dit": {
+                "Fn_compute_blocks": 1,
+                "Bn_compute_blocks": 0,
+                "max_warmup_steps": 4,
+                "residual_diff_threshold": 0.24,
+                "max_continuous_cached_steps": 3,
+                "enable_taylorseer": False,
+                "taylorseer_order": 1,
+                "scm_steps_mask_policy": None,
+                "scm_steps_policy": "dynamic",
+            },
+            "tea_cache": {"rel_l1_thresh": 0.2},
+        }
+        # The table is rebuilt on every call, so callers may mutate the returned dict freely.
+        return builtin_defaults.get(cache_backend)
+
+    def _normalize_cache_config(self, cache_backend: str | None, cache_config: Any | None) -> Any | None:
+        if isinstance(cache_config, str):
+            try:
+                cache_config = json.loads(cache_config)
+            except json.JSONDecodeError:
+                logger.warning("Invalid cache_config JSON, using defaults.")
+                cache_config = None
+        if cache_config is None and cache_backend not in (None, "", "none"):
+            cache_config = self._get_default_cache_config(cache_backend)
+        return cache_config
+
+    def _create_default_diffusion_stage_cfg(self, kwargs: dict[str, Any]) -> list[dict[str, Any]]:
+        """Build the default stage config list: a single diffusion stage (stage_id 0) wrapping ``kwargs``."""
+        # We temporarily create a default config for the diffusion stage.
+        # In the future, we should merge the default config with the user-provided config.
+        # TODO: hack, convert dtype to string (in the caller's kwargs) to avoid a non-primitive omegaconf create error.
+        if "dtype" in kwargs:
+            kwargs["dtype"] = str(kwargs["dtype"])
+        cache_backend = kwargs.get("cache_backend", "none")
+        cache_config = self._normalize_cache_config(cache_backend, kwargs.get("cache_config", None))
+        # TODO: hack, derive the device list "0,1,...,world_size-1" from the parallel config.
+        devices = "0"
+        if "parallel_config" in kwargs:
+            num_devices = kwargs["parallel_config"].world_size
+            for i in range(1, num_devices):
+                devices += f",{i}"
+        default_stage_cfg = [
+            {
+                "stage_id": 0,
+                "stage_type": "diffusion",
+                "runtime": {
+                    "process": True,
+                    "devices": devices,
+                    "max_batch_size": 1,
+                },
+                "engine_args": OmegaConf.create(
+                    {
+                        **kwargs,
+                        "cache_backend": cache_backend,
+                        "cache_config": cache_config,
+                    }
+                ),
+                "final_output": True,
+                "final_output_type": "image",
+            }
+        ]
+        default_stage_cfg[0]["engine_args"]["model_stage"] = "diffusion"
+        return default_stage_cfg
+
+    def _resolve_stage_configs(self, model: str, kwargs: dict[str, Any]) -> tuple[str, list[Any]]:
+        """Resolve stage configs and inject defaults shared by orchestrator/headless."""
+        # TODO(wuhang):
+        # Remove kwargs as parameters in the future.
+        # Use dataclass directly for engine args.
+
+        stage_configs_path = kwargs.get("stage_configs_path", None)
+
+        # TTS-specific CLI overrides
+        self.tts_max_instructions_length: int | None = kwargs.get("tts_max_instructions_length", None)
+
+        # Load stage configurations from YAML
+        config_path, stage_configs = load_and_resolve_stage_configs(
+            model,
+            stage_configs_path,
+            kwargs,
+            default_stage_cfg_factory=lambda: self._create_default_diffusion_stage_cfg(kwargs),
+        )
+
+        # Inject diffusion LoRA-related knobs from kwargs if not present in the stage config.
+        for cfg in stage_configs:
+            try:
+                if getattr(cfg, "stage_type", None) != "diffusion":
+                    continue
+                if not hasattr(cfg, "engine_args") or cfg.engine_args is None:
+                    cfg.engine_args = OmegaConf.create({})
+                if kwargs.get("lora_path") is not None:
+                    if not hasattr(cfg.engine_args, "lora_path") or cfg.engine_args.lora_path is None:
+                        cfg.engine_args.lora_path = kwargs["lora_path"]
+                lora_scale = kwargs.get("lora_scale")
+                if lora_scale is None:
+                    # Backwards compatibility for older callers.
+                    lora_scale = kwargs.get("static_lora_scale")
+                if lora_scale is not None:
+                    if not hasattr(cfg.engine_args, "lora_scale") or cfg.engine_args.lora_scale is None:
+                        cfg.engine_args.lora_scale = lora_scale
+                quantization_config = kwargs.get("quantization_config")
+                if quantization_config is not None:
+                    if (
+                        not hasattr(cfg.engine_args, "quantization_config")
+                        or cfg.engine_args.quantization_config is None
+                    ):
+                        cfg.engine_args.quantization_config = quantization_config
+            except Exception as e:
+                logger.warning("Failed to inject LoRA config for stage: %s", e)
+
+        return config_path, stage_configs
+
+    def _initialize_stages(self, model: str, kwargs: dict[str, Any]) -> None:
+        """Initialize stage list management."""
+        stage_init_timeout = kwargs.get("stage_init_timeout", 20)
+        shm_threshold_bytes = kwargs.get("shm_threshold_bytes", 65536)
+        init_timeout = kwargs.get("init_timeout", 300)
+        worker_backend = kwargs.get("worker_backend", "multi_process")
+        ray_address = kwargs.get("ray_address", None)
+        batch_timeout = kwargs.get("batch_timeout", 10)
+        log_stats = kwargs.get("log_stats", False)
+        self._single_stage_id = kwargs.get("stage_id", None)
+        self._zmq_master_address = kwargs.get("omni_master_address", None)
+        if self._zmq_master_address is None:
+            self._zmq_master_address = "127.0.0.1"
+            logger.info("No omni_master_address provided, defaulting to localhost (127.0.0.1)")
+        self._zmq_master_port = kwargs.get("omni_master_port", None)
+
+        # Resolve stage configs shared by orchestrator/headless paths.
+        self.config_path, self.stage_configs = self._resolve_stage_configs(model, kwargs)
+
+        # Initialize connectors
+        self.omni_transfer_config, self.connectors = initialize_orchestrator_connectors(
+            self.config_path, worker_backend=worker_backend, shm_threshold_bytes=shm_threshold_bytes
+        )
+
+        # Initialize stats paths
+        self.log_stats: bool = bool(log_stats)
+
+        self.worker_backend = worker_backend
+        self.ray_address = ray_address
+        self.batch_timeout = batch_timeout
+        # async chunk remains the same for each stage
+        self.async_chunk = self._is_async_chunk_enable(self.stage_configs)
+
+        # Build OmniStage instances in parallel, preserve original order
+        def _build_stage(idx_cfg: tuple[int, Any]) -> tuple[int, OmniStage]:
+            idx, cfg = idx_cfg
+            return idx, OmniStage(cfg, stage_init_timeout=stage_init_timeout)
+
+        with ThreadPoolExecutor(max_workers=min(len(self.stage_configs), max(1, os.cpu_count() or 1))) as executor:
+            futures = [executor.submit(_build_stage, (idx, cfg)) for idx, cfg in enumerate(self.stage_configs)]
+            results: list[tuple[int, OmniStage]] = []
+            for fut in as_completed(futures):
+                results.append(fut.result())
+        results.sort(key=lambda x: x[0])
+        self.stage_list = [st for _, st in results]
+        self.default_sampling_params_list = [st.default_sampling_params for st in self.stage_list]
+        self.output_modalities = [st.final_output_type for st in self.stage_list]
+        logger.info(f"[{self._name}] Loaded {len(self.stage_list)} stages")
+
+        if self.worker_backend == "ray":
+            self._queue_cls = get_ray_queue_class()
+        else:
+            self._ctx = mp.get_context("spawn")
+            self._queue_cls = lambda: self._ctx.Queue(maxsize=0)
+
+        self._stage_init_timeout = max(0, int(stage_init_timeout))
+        self._shm_threshold_bytes = max(0, int(shm_threshold_bytes))
+        self._start_stages(model)
+        # Wait for all stages to report readiness before seeding
+        self._wait_for_stages_ready(timeout=init_timeout)
+
+    def _is_async_chunk_enable(self, stage_args: list) -> bool:
+        """Return the first stage's ``async_chunk`` engine flag (False if unset or no stages)."""
+        engine_args = getattr(stage_args[0], "engine_args", None) if stage_args else None
+        return bool(getattr(engine_args, "async_chunk", False))
+
+    def _start_stages(self, model: str) -> None:
+        """Create each stage's in/out queues, attach them, and launch the stage workers."""
+        if self.worker_backend == "ray":
+            # Initialize Ray Cluster
+            self._ray_pg = create_placement_group(
+                number_of_stages=len(self.stage_list), address=self.ray_address, strategy="PACK"
+            )
+        else:
+            # Initialize ZMQ context
+            if self._zmq_ctx is None:
+                self._zmq_ctx = zmq.Context()
+
+            # Allocate endpoints for each stage
+            total_stages = len(self.stage_configs)
+            self._handshake_endpoints = {}
+
+            # If --stage-id is not set, use local_only mode
+            local_only = self._single_stage_id is None
+
+            for sid in range(total_stages):
+                in_endpoint = get_engine_client_zmq_addr(local_only=local_only, host=self._zmq_master_address)
+                out_endpoint = get_engine_client_zmq_addr(local_only=local_only, host=self._zmq_master_address)
+                self._handshake_endpoints[sid] = (in_endpoint, out_endpoint)
+                logger.debug(
+                    f"[{self._name}] Allocated endpoints for stage-{sid}: in={in_endpoint}, out={out_endpoint}"
+                )
+
+            # Start handshake server
+            self.start_handshake_server()
+
+        for stage_id, stage in enumerate(self.stage_list):
+            if self.worker_backend == "ray":
+                in_q = self._queue_cls()
+                out_q = self._queue_cls()
+            else:
+                in_endpoint, out_endpoint = self._handshake_endpoints[stage_id]
+                in_q = ZmqQueue(self._zmq_ctx, zmq.PUSH, bind=in_endpoint)
+                out_q = ZmqQueue(self._zmq_ctx, zmq.PULL, bind=out_endpoint)
+
+            self._stage_in_queues.append(in_q)
+            self._stage_out_queues.append(out_q)
+            stage.attach_queues(in_q, out_q)
+
+            stage_connectors_config = get_stage_connector_config(
+                self.omni_transfer_config,
+                stage_id,
+            )
+
+            # Inject YAML-resolved connector config into omni_kv_config for
+            # in-engine usage (GPU model runner reads model_config.omni_kv_config).
+            try:
+                omni_conn_cfg, omni_from, omni_to = resolve_omni_kv_config_for_stage(
+                    self.omni_transfer_config, stage_id
+                )
+                if omni_conn_cfg:
+                    inject_omni_kv_config(stage, omni_conn_cfg, omni_from, omni_to)  # type: ignore
+
+            except Exception as e:
+                logger.debug("[Omni] Failed to inject omni connector config into stage-%s: %s", stage_id, e)
+
+            if self._single_stage_id is not None and stage_id != int(self._single_stage_id):
+                logger.info(
+                    f"[{self._name}] Skipping initialization of stage-{stage_id} worker due to single_stage_id setting"
+                )
+                continue
+
+            stage.init_stage_worker(
+                model,
+                is_async=self.is_async,
+                shm_threshold_bytes=self._shm_threshold_bytes,
+                ctx=self._ctx if self.worker_backend != "ray" else None,
+                batch_timeout=self.batch_timeout,
+                connectors_config=stage_connectors_config,
+                worker_backend=self.worker_backend,
+                ray_placement_group=self._ray_pg,
+                ignore_runtime_config=self._single_stage_id is not None,
+            )
+
+            logger.debug(f"[{self._name}] Stage-{stage_id} process started")
+
+    def _process_stage_ready(self, stage: OmniStage, stage_id: int, result: dict[str, Any]) -> None:
+        self._stages_ready.add(stage_id)
+        logger.info(f"[{self._name}] Stage-{stage_id} reported ready")
+
+    def _wait_for_stages_ready(self, timeout: int = 120) -> None:
+        """Poll every stage until all report ``stage_ready`` or ``timeout`` seconds elapse; a timeout only warns."""
+        if self._single_stage_id is not None and self.worker_backend != "ray":
+            timeout = self._wait_for_handshakes(timeout)
+
+        num_stages = len(self.stage_list)
+        # Monotonic clock: a wall-clock adjustment must not shorten or extend the wait.
+        deadline = time.monotonic() + max(0, int(timeout))
+        logger.info(f"[{self._name}] Waiting for {num_stages} stages to initialize (timeout: {timeout}s)")
+
+        while len(self._stages_ready) < num_stages and time.monotonic() < deadline:
+            progressed = False
+            for stage_id, stage in enumerate(self.stage_list):
+                if stage_id in self._stages_ready:
+                    continue
+
+                # Check if the stage has reported status
+                if result := stage.try_collect():
+                    progressed = True
+                    if result.get("type") == "stage_ready":
+                        self._process_stage_ready(stage, stage_id, result)
+
+            if not progressed:
+                time.sleep(0.05)
+
+        # Handle Final State
+        if len(self._stages_ready) == num_stages:
+            logger.info(f"[{self._name}] All stages initialized successfully")
+            return
+
+        # Handle Timeout/Failure
+        not_ready = sorted(set(range(num_stages)) - set(self._stages_ready))
+        logger.warning(
+            f"[{self._name}] Initialization timeout: {len(self._stages_ready)}/{num_stages} "
+            f"stages ready. Missing stages: {not_ready}"
+        )
+
+        suggestions = [
+            f"Ignore this warning if the model weight download / load from disk time is longer than {timeout}s.",
+            "Verify GPU/device assignment in config (runtime.devices) is correct.",
+            "Check GPU/host memory availability; reduce model or batch size if needed.",
+            "Check model weights path and network reachability (if loading remotely).",
+            "Increase initialization wait time (stage_init_timeout or call-site timeout).",
+        ]
+
+        formatted_suggestions = "\n".join(f"  {i + 1}) {msg}" for i, msg in enumerate(suggestions))
+
+        logger.warning(f"[{self._name}] Stage initialization timeout. Troubleshooting Steps:\n{formatted_suggestions}")
+
+    def _is_profiler_enabled(self, stage_id: int) -> bool:
+        """Check if profiler config is set for a given stage."""
+        stage = self.stage_list[stage_id]
+        # For diffusion stages, profiling is controlled by VLLM_TORCH_PROFILER_DIR env var
+        if stage.stage_type == "diffusion":
+            return True
+        # For LLM stages, check if profiler_config is set in engine_args
+        engine_args = getattr(stage.stage_config, "engine_args", None)
+        if engine_args is None:
+            return False
+        profiler_config = getattr(engine_args, "profiler_config", None)
+        if profiler_config is None:
+            return False
+        profiler = getattr(profiler_config, "profiler", None)
+        return profiler is not None
+
+    def start_profile(self, stages: list[int] | None = None) -> None:
+        """Start profiling for specified stages.
+
+        Sends start_profile command to stage workers. Profiling must be enabled
+        via VLLM_TORCH_PROFILER_DIR environment variable.
+
+        Args:
+            stages: List of stage IDs to start profiling. If None, starts
+                profiling for all stages that have profiling enabled.
+
+        Example:
+            >>> # Profile all stages
+            >>> omni.start_profile()
+            >>> outputs = omni.generate(prompts, sampling_params)
+            >>> omni.stop_profile()
+
+            >>> # Profile only stage 0 and 2
+            >>> omni.start_profile(stages=[0, 2])
+        """
+        if stages is None:
+            stages = list(range(len(self.stage_list)))
+
+        for stage_id in stages:
+            if stage_id < len(self.stage_list):
+                if not self._is_profiler_enabled(stage_id):
+                    logger.info(
+                        "[%s] Skipping start_profile for stage-%s: profiler config not set",
+                        self._name,
+                        stage_id,
+                    )
+                    continue
+                try:
+                    self.stage_list[stage_id].submit({"type": OmniStageTaskType.PROFILER_START})
+                    logger.info("[%s] Sent start_profile to stage-%s", self._name, stage_id)
+                except Exception as e:
+                    logger.warning(
+                        "[%s] Failed to send start_profile to stage-%s: %s",
+                        self._name,
+                        stage_id,
+                        e,
+                    )
+
+    def stop_profile(self, stages: list[int] | None = None) -> dict[str, list]:
+        """
+        Stop profiling on the given stages (all stages if None) and return the collected trace/table
+        file paths as ``{"traces": [...], "tables": [...]}``; out-of-range stage IDs are skipped.
+        """
+        if stages is None:
+            stages = list(range(len(self.stage_list)))
+
+        all_results = {"traces": [], "tables": []}
+
+        for stage_id in stages:
+            if stage_id < len(self.stage_list):
+                if not self._is_profiler_enabled(stage_id):
+                    logger.info(
+                        "[%s] Skipping stop_profile for stage-%s: profiler config not set",
+                        self._name,
+                        stage_id,
+                    )
+                    continue
+                stage = self.stage_list[stage_id]
+
+                # Stages exposing stop_profile() are stopped synchronously and hand back their results
+                if hasattr(stage, "stop_profile"):
+                    logger.info("[%s] Requesting profile data collection from stage-%s", self._name, stage_id)
+
+                    # Blocking call; its return value carries this stage's profile results
+                    stage_data = stage.stop_profile()
+
+                    if isinstance(stage_data, dict):
+                        # Accept both singular ("trace"/"table") and plural ("traces"/"tables") keys
+                        traces = stage_data.get("trace") or stage_data.get("traces")
+                        tables = stage_data.get("table") or stage_data.get("tables")
+
+                        # Debug logging
+                        logger.debug(f"[{self._name}] Stage-{stage_id} returned: {stage_data.keys()}")
+                        if traces:
+                            logger.debug(f"[{self._name}] Stage-{stage_id} traces type: {type(traces)}")
+                        if tables:
+                            logger.debug(f"[{self._name}] Stage-{stage_id} tables type: {type(tables)}")
+
+                        # Traces may be a single string or a list of strings; other types are ignored
+                        if traces:
+                            if isinstance(traces, str):
+                                all_results["traces"].append(traces)
+                            elif isinstance(traces, list):
+                                all_results["traces"].extend(traces)
+
+                        # Tables follow the same rule; a missing/empty value is reported with a warning
+                        if tables:
+                            if isinstance(tables, str):
+                                all_results["tables"].append(tables)
+                            elif isinstance(tables, list):
+                                all_results["tables"].extend(tables)
+                        else:
+                            logger.warning(f"[{self._name}] Stage-{stage_id} returned no table data")
+                    else:
+                        logger.warning(f"[{self._name}] Stage-{stage_id} returned non-dict data: {type(stage_data)}")
+                else:
+                    # Fallback for stages without stop_profile(): submit a stop task and collect nothing
+                    logger.warning(
+                        "[%s] Stage-%s does not support synchronous stop_profile. Falling back to async.",
+                        self._name,
+                        stage_id,
+                    )
+                    stage.submit({"type": OmniStageTaskType.PROFILER_STOP})
+
+        # Summarize what was collected across all stages
+        logger.info(
+            f"[{self._name}] Collected {len(all_results['traces'])} trace(s) and {len(all_results['tables'])} table(s)"
+        )
+
+        return all_results
+
+    def close(self) -> None:
+        """Close all stage processes and clean up resources."""
+        if hasattr(self, "_weak_finalizer"):
+            self._weak_finalizer()
+
+    def _process_handshake_message(self, msg: Any) -> dict[str, Any]:
+        """Validate a decoded handshake request and build the reply payload.
+
+        Args:
+            msg: Decoded message received on the handshake socket.
+
+        Returns:
+            ``{"ok": True, "in_endpoint": ..., "out_endpoint": ...}`` when the
+            requested stage is known, otherwise ``{"ok": False, "error": ...}``
+            describing why the request was rejected.
+        """
+        is_handshake = isinstance(msg, dict) and msg.get("type") == "handshake"
+        if not is_handshake:
+            return {"ok": False, "error": "invalid handshake payload"}
+
+        # A missing or non-numeric stage_id is reported back instead of raising.
+        try:
+            stage_id = int(msg.get("stage_id"))
+        except (TypeError, ValueError) as e:
+            return {"ok": False, "error": f"invalid stage_id: {e}"}
+
+        stage_endpoints = self._handshake_endpoints.get(stage_id)
+        if stage_endpoints is None:
+            return {"ok": False, "error": f"unknown stage_id: {stage_id}"}
+
+        # Record the stage as seen before building the success reply.
+        self._handshake_seen.add(stage_id)
+        in_endpoint, out_endpoint = stage_endpoints
+
+        logger.info("[%s] Handshake received from stage-%s", self._name, stage_id)
+
+        reply: dict[str, Any] = {"ok": True}
+        reply["in_endpoint"] = in_endpoint
+        reply["out_endpoint"] = out_endpoint
+        return reply
+
+    def _run_handshake_server_loop(self) -> None:
+        """Serve handshake requests until ``self._handshake_stop`` is set.
+
+        Each iteration polls the handshake socket, decodes one msgpack request,
+        and sends back the msgpack-encoded result of
+        ``_process_handshake_message``. A request that cannot be decoded or
+        processed is answered with an ``{"ok": False, "error": ...}`` reply
+        instead of terminating the server thread.
+        """
+        sock = self._zmq_handshake_socket
+        poller = zmq.Poller()
+        poller.register(sock, zmq.POLLIN)
+
+        try:
+            while not self._handshake_stop.is_set():
+                # Poll with a 1000 ms timeout so the stop event is re-checked regularly.
+                ready = dict(poller.poll(1000))
+                # Poll events are bit flags: test the POLLIN bit rather than equality.
+                if not ready.get(sock, 0) & zmq.POLLIN:
+                    continue
+
+                payload = sock.recv()
+                try:
+                    reply = self._process_handshake_message(msgspec.msgpack.decode(payload))
+                except Exception as e:
+                    # A REP socket has to answer each request before it can receive
+                    # the next one, so a bad payload gets an error reply rather than
+                    # killing the loop and blocking every later handshake.
+                    logger.exception("[%s] Failed to process handshake message: %s", self._name, e)
+                    reply = {"ok": False, "error": f"handshake processing failed: {e}"}
+                sock.send(msgspec.msgpack.encode(reply))
+        finally:
+            poller.unregister(sock)
+
+    def start_handshake_server(self) -> None:
+        """Launch the background ZMQ handshake server, if it can be started.
+
+        Distributed stages query this server with their ``stage_id`` to discover
+        their queue endpoints. The call is a no-op when a server thread already
+        exists, when no ZMQ context is available, or when the master
+        address/port has not been configured.
+        """
+        # Nothing to do: the server was already started, or ZMQ is not initialized.
+        if self._handshake_thread is not None or self._zmq_ctx is None:
+            return
+
+        # Nothing to bind to without a master address and port.
+        if not self._zmq_master_address or self._zmq_master_port is None:
+            return
+
+        bind_addr = get_engine_client_zmq_addr(
+            local_only=False,
+            host=self._zmq_master_address,
+            port=int(self._zmq_master_port),
+        )
+
+        # The stop event and socket are created before the thread, because the
+        # server loop reads both as soon as it runs.
+        self._handshake_stop = threading.Event()
+        self._zmq_handshake_socket = make_zmq_socket(
+            self._zmq_ctx,
+            bind_addr,
+            zmq.REP,
+            bind=True,
+            linger=5000,
+        )
+
+        server_thread = threading.Thread(
+            name="zmq-handshake-server",
+            target=self._run_handshake_server_loop,
+            daemon=True,
+        )
+        self._handshake_thread = server_thread
+        server_thread.start()
+
+    def _wait_for_handshakes(self, timeout: int = 120) -> int:
+        """Wait for handshakes from all expected stages.
+
+        The expected stages are every index of ``self.stage_configs`` except
+        ``self._single_stage_id``.
+
+        Args:
+            timeout: Timeout in seconds for waiting for handshakes. Default is 120s.
+
+        Returns:
+            Remaining timeout in whole seconds after waiting (0 once the
+            deadline has passed). When no other stage is expected, ``timeout``
+            is returned unchanged without waiting.
+        """
+        total_stages = len(self.stage_configs)
+        expected = set(range(total_stages)) - {int(self._single_stage_id)}
+        if not expected:
+            return timeout
+
+        deadline = time.time() + max(0, int(timeout))
+        logger.info(f"[{self._name}] Waiting for handshakes from stages: {expected} (timeout: {timeout}s)")
+
+        # NOTE: _handshake_seen may be updated from the handshake server thread.
+        # It is intentionally used here without additional locking because:
+        #   - _handshake_seen only ever grows (stages are added but never removed), and
+        #   - we only check membership and set inclusion relative to `expected`.
+        # Under these monotonic semantics and the CPython GIL, concurrent reads/writes
+        # are safe for this usage and cannot violate correctness: we may observe a
+        # slightly stale view, but the loop condition remains valid and eventually
+        # becomes true once all expected stages have handshaked or the timeout elapses.
+        while not expected.issubset(self._handshake_seen):
+            remaining = deadline - time.time()
+            if remaining <= 0:
+                break
+            # Cap the sleep at the time left so the wait never overshoots the deadline.
+            time.sleep(min(1.0, remaining))
+
+        remaining_timeout = max(0, int(deadline - time.time()))
+
+        missing = sorted(expected - self._handshake_seen)
+        if missing:
+            # Count only expected stages: _handshake_seen may also contain stage ids
+            # outside `expected`, which would otherwise inflate the reported progress.
+            completed = len(expected) - len(missing)
+            logger.warning(
+                f"[{self._name}] Handshake timeout: {completed}/{len(expected)} "
+                f"stages completed handshake. Missing stages: {missing}"
+            )
+
+        return remaining_timeout
+
+    @property
+    def _name(self) -> str:
+        """Label interpolated into the ``[...]`` prefix of this class's log messages."""
+        return "OmniBase"
+
+    @property
+    def is_async(self) -> bool:
+        """Always False for this class (presumably overridden by async variants; confirm)."""
+        return False
+
+
 class Omni(OmniBase):
-    """Synchronous entrypoint for offline generation."""
+    """Unified entrypoint for both LLM and Diffusion models for better usability.
 
-    def _set_final_only_for_llm_stages(
-        self,
-        sampling_params_list: Sequence[OmniSamplingParams],
-    ) -> list[OmniSamplingParams]:
-        """Return per-stage params with LLM stages forced to FINAL_ONLY."""
-        effective_params: list[OmniSamplingParams] = []
-        for stage_id, params in enumerate(sampling_params_list):
-            sp = copy.deepcopy(params)
-            stage_meta = self.engine.get_stage_metadata(stage_id)
-            if stage_meta.get("stage_type") != "diffusion" and hasattr(sp, "output_kind"):
-                sp.output_kind = RequestOutputKind.FINAL_ONLY
-            effective_params.append(sp)
-        return effective_params
+    Args:
+        model: Model name or path to load.
+        **kwargs: Arbitrary keyword arguments.
+            - stage_configs_path: Optional path to YAML file containing stage
+              configurations. If None, configurations are loaded from the model.
+            - log_stats: Whether to enable statistics logging
+              be written to files with stage-specific suffixes.
+            - stage_init_timeout: Per-stage init watchdog (seconds). Measured from
+              when the previous stage finished (possibly a prior Omni run with GPU
+              reuse/overlap) to when the current stage starts to initialize.
+            - shm_threshold_bytes: Threshold in bytes for using shared memory
+              for IPC. Objects larger than this threshold will use shared memory.
+            - worker_backend: Backend for worker processes. Default is "multi_process".
+            - ray_address: Address of Ray cluster for Ray backend, if using Ray backend.
+            - batch_timeout: Timeout in seconds for batching requests within a stage
+            - init_timeout: Timeout in seconds for waiting for all stages to initialize
+            - Additional keyword arguments passed to stage engines.
+
+    Example:
+        >>> omni = Omni(model="Qwen/Qwen2.5-Omni-7B")
+        >>> outputs = omni.generate(prompts="Hello, world!", sampling_params_list=[SamplingParams()])
+        >>> print(outputs)
+    """
+
+    def __init__(self, model: str, **kwargs: Any) -> None:
+        """Initialize the orchestrator and register garbage-collection cleanup.
+
+        Args:
+            model: Model name or path, forwarded to the parent initializer.
+            **kwargs: Additional options, forwarded to the parent initializer.
+        """
+        super().__init__(model, **kwargs)
+
+        # Resources handed to the cleanup callback; they are passed individually
+        # as finalizer arguments rather than through ``self``.
+        cleanup_args = (
+            self.stage_list,
+            self._stage_in_queues,
+            self._stage_out_queues,
+            self._ray_pg,
+            self._zmq_ctx,
+            self._handshake_stop,
+            self._zmq_handshake_socket,
+            self._handshake_thread,
+        )
+
+        # Register weak reference cleanup (called on garbage collection)
+        self._weak_finalizer = weakref.finalize(self, _weak_close_cleanup, *cleanup_args)
 
     @overload
     def generate(
@@ -45,7 +816,6 @@ def generate(
         sampling_params_list: OmniSamplingParams | Sequence[OmniSamplingParams] | None = None,
         *,
         py_generator: Literal[True],
-        use_tqdm: bool | Callable[..., tqdm] = True,
     ) -> Generator[OmniRequestOutput, None, None]: ...
 
     @overload
@@ -55,7 +825,6 @@ def generate(
         sampling_params_list: OmniSamplingParams | Sequence[OmniSamplingParams] | None = None,
         *,
         py_generator: Literal[False] = False,
-        use_tqdm: bool | Callable[..., tqdm] = True,
     ) -> list[OmniRequestOutput]: ...
 
     def generate(
@@ -66,26 +835,66 @@ def generate(
         py_generator: bool = False,
         use_tqdm: bool | Callable[..., tqdm] = True,
     ) -> Generator[OmniRequestOutput, None, None] | list[OmniRequestOutput]:
-        sampling_params_list = self.resolve_sampling_params_list(sampling_params_list)
+        """Generate outputs for the given prompts.
+
+        Orchestrates the multi-stage pipeline based on YAML configuration.
+        Each stage will use OmniLLM or OmniDiffusion based on stage_type.
+
+        Args:
+            prompts: Input prompt(s) for generation.
+            sampling_params_list: Optional list of per-stage parameters.
+            py_generator: Whether the returned result(s) are wrapped in a generator instead of a list.
+            use_tqdm: Whether to use tqdm progress bar
+
+        Returns:
+            List of OmniRequestOutput objects, one for each input prompt.
+            Each output contains the stage_id, final_output_type, and
+            the request_output from the final stage.
+
+        Raises:
+            ValueError: If sampling_params_list is None or has incorrect length.
+        """
+        if sampling_params_list is None:
+            sampling_params_list = self.default_sampling_params_list
+        elif not isinstance(sampling_params_list, Sequence):
+            # TODO: After the recent introduction of BAGEL model (one LLM and one Diffusion),
+            # expect the text_to_image example code to run when only passing one OmniDiffusionSamplingParams
+            # This behavior may be confusing, and future PR can improve it.
+            per_stage_params: list[OmniSamplingParams] = []
+            for default_stage_sp in self.default_sampling_params_list:
+                default_sp_type = default_stage_sp.__class__
+                if default_sp_type == sampling_params_list.__class__:
+                    per_stage_params.append(sampling_params_list)
+                else:
+                    per_stage_params.append(default_stage_sp)
+            sampling_params_list = per_stage_params
+
         try:
             if py_generator:
-                return self._run_generation_with_generator(prompts, sampling_params_list, use_tqdm)
-            return list(self._run_generation(prompts, sampling_params_list, use_tqdm))
+                return self._run_generation_with_generator(prompts, sampling_params_list)
+            else:
+                outputs = list(self._run_generation(prompts, sampling_params_list, use_tqdm))
+                return outputs
         except Exception as e:
-            logger.exception("[Omni] Failed to run generation: %s", e)
+            logger.exception("[Orchestrator] Failed to run generation: %s", e)
+            # Always close on exception to ensure cleanup
             self.close()
-            raise
+            raise e
 
     def _run_generation_with_generator(
         self,
         prompts: OmniPromptType | Sequence[OmniPromptType],
         sampling_params_list: Sequence[OmniSamplingParams],
-        use_tqdm: bool | Callable[..., tqdm] = True,
     ) -> Generator[OmniRequestOutput, None, None]:
-        gen = self._run_generation(prompts, sampling_params_list, use_tqdm)
+        """Run generation through all stages in the pipeline and return a generator."""
+        gen = self._run_generation(prompts, sampling_params_list)
         try:
             yield from gen
+        except Exception as e:
+            logger.exception("[Orchestrator] Failed to run generation: %s", e)
+            raise e
         finally:
+            # Cleanup when generator is exhausted or closed
             self.close()
 
     def _run_generation(
@@ -94,94 +903,272 @@ def _run_generation(
         sampling_params_list: Sequence[OmniSamplingParams],
         use_tqdm: bool | Callable[..., tqdm] = True,
     ) -> Generator[OmniRequestOutput, None, None]:
-        try:
-            sampling_params_list = self._set_final_only_for_llm_stages(sampling_params_list)
+        """Run generation through all stages in the pipeline."""
+        logger.debug(f"[{self._name}] generate() called")
+        if sampling_params_list is None:
+            raise ValueError("sampling_params_list is required for pipelined generation")
 
-            if isinstance(prompts, str) or not isinstance(prompts, Sequence):
-                request_prompts: list[OmniPromptType] = [prompts]
-            else:
-                request_prompts = list(prompts)
-
-            if not request_prompts:
-                return
-
-            request_ids = [f"{i}_{uuid.uuid4()}" for i in range(len(request_prompts))]
-            req_start_ts: dict[str, float] = {}
-            wall_start_ts = time.time()
-            req_final_stage_ids: dict[str, int] = {}
-
-            for req_id, prompt in zip(request_ids, request_prompts):
-                prompt_modalities = prompt.get("modalities", None) if isinstance(prompt, dict) else None
-                final_stage_id = self._compute_final_stage_id(prompt_modalities)
-                req_final_stage_ids[req_id] = final_stage_id
-
-                metrics = OrchestratorMetrics(
-                    self.num_stages,
-                    self.log_stats,
-                    wall_start_ts,
-                    final_stage_id,
-                )
-                req_state = ClientRequestState(req_id)
-                req_state.metrics = metrics
-                self.request_states[req_id] = req_state
-
-                self.engine.add_request(
-                    request_id=req_id,
-                    prompt=prompt,
-                    sampling_params_list=sampling_params_list,
-                    final_stage_id=final_stage_id,
+        if len(sampling_params_list) != len(self.stage_list):
+            raise ValueError(f"Expected {len(self.stage_list)} sampling params, got {len(sampling_params_list)}")
+
+        for i, (stage, sp) in enumerate(zip(self.stage_list, sampling_params_list)):
+            ExpectedSPType = OmniDiffusionSamplingParams if stage.stage_type == "diffusion" else SamplingParams
+            if not isinstance(sp, ExpectedSPType):
+                raise ValueError(
+                    f"Expected sampling parameters with type {ExpectedSPType} in stage {i}, got {sp.__class__}"
                 )
-                submit_ts = time.time()
-                req_state.metrics.stage_first_ts[0] = submit_ts
-                req_start_ts[req_id] = submit_ts
 
-            active_reqs = set(request_ids)
-            pbar = None
-            if use_tqdm:
-                tqdm_func = use_tqdm if callable(use_tqdm) else tqdm
-                pbar = tqdm_func(total=len(request_ids), desc="Processed prompts", dynamic_ncols=True)
+        # Normalize prompts to a list for per-request iteration
+        # str is also Sequence but only test list-like containers here
+        if isinstance(prompts, str) or not isinstance(prompts, Sequence):
+            request_prompts: list[OmniPromptType] = [prompts]
+        else:
+            request_prompts = list(prompts)
+
+        # Orchestrator keeps stage objects for input derivation
+        num_stages = len(self.stage_list)
+
+        # Generate globally unique request IDs and map them to original prompts
+        request_ids = [f"{i}_{uuid.uuid4()}" for i in range(len(request_prompts))]
+        request_id_to_prompt = {rid: p for rid, p in zip(request_ids, request_prompts)}
+
+        # Track per-request start time for end-to-end timing
+        _req_start_ts: dict[str, float] = {}
+        _wall_start_ts: float = time.time()
+
+        # Determine the final stage for E2E stats (highest stage_id with final_output=True; fallback to last stage)
+        final_stage_id_to_prompt: dict[str, int] = {}
+        for rid, prompt in request_id_to_prompt.items():
+            if isinstance(prompt, dict):
+                prompt_modalities = prompt.get("modalities", None)
+            else:
+                prompt_modalities = None
+            final_stage_id_for_e2e = get_final_stage_id_for_e2e(
+                prompt_modalities, self.output_modalities, self.stage_list
+            )
+            final_stage_id_to_prompt[rid] = final_stage_id_for_e2e
+
+        # Metrics/aggregation helper
+        metrics = OrchestratorAggregator(
+            num_stages,
+            self.log_stats,
+            _wall_start_ts,
+            final_stage_id_to_prompt,
+        )
+
+        it = request_id_to_prompt.items()
+        if use_tqdm:
+            tqdm_func = use_tqdm if callable(use_tqdm) else tqdm
+            it = tqdm_func(it, desc="Adding requests")
+
+        # Seed stage-0 queue with all requests
+        logger.debug(f"[{self._name}] Seeding {len(request_prompts)} requests into stage-0")
+        # Mark first input time for stage-0
+        metrics.stage_first_ts[0] = metrics.stage_first_ts[0] or time.time()
 
-            while active_reqs:
-                msg = self.engine.try_get_output()
+        for req_id, prompt in request_id_to_prompt.items():
+            sp0 = sampling_params_list[0]  # type: ignore[index]
+            task = {
+                "request_id": req_id,
+                "engine_inputs": prompt,
+                "sampling_params": sp0,
+            }
+            self.stage_list[0].submit(task)
+            _req_start_ts[req_id] = time.time()
+            logger.debug(f"[{self._name}] Enqueued request {req_id} to stage-0")
 
-                should_continue, req_id, stage_id, req_state = self._handle_output_message(msg)
-                if should_continue:
+        pbar = None
+        if use_tqdm:
+            tqdm_func = use_tqdm if callable(use_tqdm) else tqdm
+            pbar = tqdm_func(
+                total=len(request_prompts),
+                desc="Processed prompts",
+                dynamic_ncols=True,
+                postfix=(f"est. speed input: {0:.2f} unit/s, output: {0:.2f} unit/s"),
+            )
+        # For each stage, forward results to next stage; collect finals at the end
+        # We pipeline by continually polling output queues in stage order
+        remaining_by_stage: list[int] = [len(request_prompts)] + [0] * (num_stages - 1)
+        completed_requests = 0
+        total_requests = len(request_prompts)
+
+        logger.debug(
+            f"[{self._name}] Entering scheduling loop: total_requests={total_requests}, stages={num_stages}",
+        )
+        while completed_requests < total_requests:
+            made_progress = False
+            for stage_id, stage in enumerate(self.stage_list):
+                result = stage.try_collect()
+                if result is None:
                     continue
 
-                if req_id not in active_reqs:
-                    logger.warning("[Omni] Received output for unknown/finished request_id=%s", req_id)
+                made_progress = True
+                req_id = result.get("request_id")
+                if "error" in result:
+                    logger.error(
+                        f"[{self._name}] Stage {stage_id} error on request {req_id}: {result['error']}",
+                    )
                     continue
 
-                if req_state.metrics is None:
+                if result.get("type") == "stage_ready":
+                    # Only happens when stage is initialized slower than expected,
+                    # so we wait for a short time and try again
+                    time.sleep(0.05)
                     continue
-                output_to_yield = self._process_single_result(
-                    result=msg,
-                    stage_id=stage_id,
-                    metrics=req_state.metrics,
-                    req_start_ts=req_start_ts,
-                    wall_start_ts=wall_start_ts,
-                    final_stage_id_for_e2e=req_final_stage_ids[req_id],
+
+                engine_outputs = _load(result, obj_key="engine_outputs", shm_key="engine_outputs_shm")
+                # Mark last output time for this stage whenever we receive outputs
+                metrics.stage_last_ts[stage_id] = max(metrics.stage_last_ts[stage_id] or 0.0, time.time())
+                try:
+                    _m: StageRequestStats = result.get("metrics")
+                    if _m is not None:
+                        # Accumulate generation time
+                        metrics.accumulated_gen_time_ms[req_id][stage_id] += _m.stage_gen_time_ms
+
+                        # For diffusion stages, we also accumulate diffusion time
+                        metrics.accumulate_diffusion_metrics(stage.stage_type, req_id, engine_outputs)
+
+                        metrics.on_stage_metrics(stage_id, req_id, _m, stage.final_output_type)
+                        if pbar:
+                            elapsed = pbar.format_dict["elapsed"] or 1e-6
+                            # Aggregate total tokens/images across all stages
+                            total_out = sum(metrics.stage_total_tokens)
+                            out_spd = total_out / elapsed
+
+                            modality = self.output_modalities[stage_id]
+                            unit = "img" if modality == "image" else "tok"
+
+                            # Pre-calculate for cleaner string formatting
+                            if metrics.e2e_count > 0:
+                                avg_lat = metrics.e2e_total_ms / metrics.e2e_count
+                            else:
+                                avg_lat = 0
+
+                            # Align with vLLM's wording "est. speed" using multi-line parentheses
+                            pbar.postfix = (
+                                f"est. speed stage-{stage_id} {unit}/s: {out_spd:.2f}, avg e2e_lat: {avg_lat:.1f}ms"
+                            )
+                except Exception as e:
+                    logger.exception(
+                        f"[{self._name}] Failed to process metrics for stage {stage_id}, req {req_id}: {e}",
+                    )
+                logger.debug(
+                    f"[{self._name}] Stage-{stage_id} completed request {req_id}; forwarding or finalizing",
                 )
-                if output_to_yield is not None:
+                stage.set_engine_outputs(engine_outputs)
+
+                if getattr(stage, "final_output", False):
+                    logger.debug(
+                        f"[{self._name}] Request {req_id} finalized at stage-{stage_id}",
+                    )
+
+                    # End-to-end timing and time-per-token for final output
+                    # (only once per request at the designated final stage)
+                    try:
+                        if stage_id == final_stage_id_to_prompt[req_id]:
+                            metrics.on_finalize_request(
+                                stage_id,
+                                req_id,
+                                _req_start_ts.get(req_id, _wall_start_ts),
+                            )
+                    except Exception as e:
+                        logger.exception(
+                            f"[{self._name}] Finalize request handling error for req {req_id} at stage {stage_id}: {e}",
+                        )
+                    output_to_yield = OmniRequestOutput(
+                        stage_id=stage_id,
+                        final_output_type=stage.final_output_type,  # type: ignore[attr-defined]
+                        request_output=engine_outputs,
+                    )
+
+                    # Record audio generated frames (only when finished)
+                    try:
+                        finished = (
+                            engine_outputs.finished
+                            if hasattr(engine_outputs, "finished")
+                            else (
+                                engine_outputs[0].finished
+                                if isinstance(engine_outputs, list)
+                                and engine_outputs
+                                and hasattr(engine_outputs[0], "finished")
+                                else False
+                            )
+                        )
+                        if finished:
+                            metrics.record_audio_generated_frames(output_to_yield, stage_id, req_id)
+                    except Exception as e:
+                        logger.exception(
+                            f"[{self._name}] Failed to record audio metrics for req {req_id} at stage {stage_id}: {e}",
+                        )
+
                     yield output_to_yield
 
-                if msg.get("finished"):
-                    active_reqs.discard(req_id)
-                    if pbar is not None:
+                next_stage_id = stage_id + 1
+                if next_stage_id <= final_stage_id_to_prompt[req_id]:
+                    next_stage: OmniStage = self.stage_list[next_stage_id]
+                    try:
+                        # Derive inputs for the next stage, record preprocess time
+                        with metrics.stage_postprocess_timer(stage_id, req_id):
+                            next_inputs = next_stage.process_engine_inputs(
+                                self.stage_list, [request_id_to_prompt[req_id]]
+                            )
+                    except Exception as e:
+                        logger.exception(
+                            f"[{self._name}] Process engine inputs error for req {req_id}"
+                            f" at stage {next_stage_id}: {e}",
+                        )
+                        continue
+                    sp_next = sampling_params_list[next_stage_id]  # type: ignore[index]
+
+                    # Check if we have a connector for this edge
+                    connector_key = (str(stage_id), str(next_stage_id))
+                    connector = self.connectors.get(connector_key)
+                    sent_via_connector = False
+                    if connector:
+                        sent_via_connector = try_send_via_connector(
+                            connector=connector,
+                            stage_id=stage_id,
+                            next_stage_id=next_stage_id,
+                            req_id=req_id,
+                            next_inputs=next_inputs,
+                            sampling_params=sp_next,
+                            original_prompt=request_id_to_prompt[req_id],
+                            next_stage_queue_submit_fn=self.stage_list[next_stage_id].submit,
+                            metrics=metrics,
+                        )
+
+                    if not sent_via_connector:
+                        raise RuntimeError(
+                            f"[{self._name}] Failed to send request {req_id} to stage-{next_stage_id} via connector. "
+                            "Configure a connector for this edge or inspect connector logs for details."
+                        )
+                    logger.debug(
+                        f"[{self._name}] Forwarded request {req_id} to stage-{next_stage_id}",
+                    )
+                    remaining_by_stage[next_stage_id] += 1
+                else:
+                    completed_requests += 1
+                    if pbar:
+                        final_mod = self.output_modalities[final_stage_id_to_prompt[req_id]]
+                        pbar.unit = "img" if final_mod == "image" else "req"
                         pbar.update(1)
-                    self._log_summary_and_cleanup(req_id)
-        except Exception:
-            if "active_reqs" in locals() and active_reqs:
-                self.abort(list(active_reqs))
-            raise
-        finally:
-            if "pbar" in locals() and pbar is not None:
-                pbar.close()
-
-    def abort(self, request_id: str | Iterable[str]) -> None:
-        request_ids = [request_id] if isinstance(request_id, str) else list(request_id)
-        self.engine.abort(request_ids)
-        for req_id in request_ids:
-            self.request_states.pop(req_id, None)
-        if self.log_stats:
-            logger.info("[Omni] Aborted request(s) %s", ",".join(request_ids))
+                    logger.debug(
+                        f"[{self._name}] Request {req_id} fully completed ({completed_requests}/{total_requests})",
+                    )
+
+            if not made_progress:
+                time.sleep(0.005)
+        logger.debug(f"[{self._name}] All requests completed")
+
+        if pbar:
+            pbar.close()
+
+        # Summarize and print stats
+        try:
+            metrics.build_and_log_summary()
+        except Exception as e:
+            logger.exception(f"[{self._name}] Failed to build/log summary: {e}")
+
+    @property
+    def _name(self) -> str:
+        """Label interpolated into the ``[...]`` prefix of this class's log messages."""
+        return "Orchestrator"
diff --git a/vllm_omni/entrypoints/omni_diffusion.py b/vllm_omni/entrypoints/omni_diffusion.py
new file mode 100644
index 00000000000..0f8049f43f5
--- /dev/null
+++ b/vllm_omni/entrypoints/omni_diffusion.py
@@ -0,0 +1,150 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import uuid
+from collections.abc import Sequence
+
+from vllm.transformers_utils.config import get_hf_file_to_dict
+
+from vllm_omni.diffusion.data import OmniDiffusionConfig, TransformerConfig
+from vllm_omni.diffusion.diffusion_engine import DiffusionEngine
+from vllm_omni.diffusion.request import OmniDiffusionRequest
+from vllm_omni.inputs.data import OmniDiffusionSamplingParams, OmniPromptType
+from vllm_omni.outputs import OmniRequestOutput
+
+
+class OmniDiffusion:
+    """
+    It is the main class to interact with vLLM-Omni diffusion models.
+    It acts as a high-level interface that prepares requests and
+    delegates the actual diffusion process to the DiffusionEngine.
+
+    You can pass either an `OmniDiffusionConfig` via `od_config`, or
+    pass kwargs such as `model="Qwen/Qwen-Image"`,
+    which will be forwarded to `OmniDiffusionConfig.from_kwargs`.
+    """
+
+    def __init__(self, od_config: OmniDiffusionConfig | None = None, **kwargs):
+        """Build the diffusion config, resolve the pipeline class, and create the engine.
+
+        Args:
+            od_config: Ready-made config, a dict of config kwargs, or None to build from ``kwargs``.
+            **kwargs: Forwarded to ``OmniDiffusionConfig.from_kwargs`` when ``od_config`` is None.
+
+        Raises:
+            ValueError: If no config file is found or no pipeline class can be determined.
+        """
+        # Capture stage info from kwargs before they might be filtered out
+        stage_id = kwargs.get("stage_id")
+        engine_input_source = kwargs.get("engine_input_source")
+
+        if od_config is None:
+            od_config = OmniDiffusionConfig.from_kwargs(**kwargs)
+        elif isinstance(od_config, dict):
+            # If config is dict, check it too (priority to kwargs if both exist)
+            if stage_id is None:
+                stage_id = od_config.get("stage_id")
+            if engine_input_source is None:
+                engine_input_source = od_config.get("engine_input_source")
+            od_config = OmniDiffusionConfig.from_kwargs(**od_config)
+
+        self.od_config = od_config
+
+        # Inject stage info into omni_kv_config if present
+        if stage_id is not None:
+            self.od_config.omni_kv_config.setdefault("stage_id", stage_id)
+        if engine_input_source is not None:
+            self.od_config.omni_kv_config.setdefault("engine_input_source", engine_input_source)
+
+        # Detect model class and load config
+        # Diffusers-style models expose `model_index.json` with `_class_name`.
+        # Non-diffusers models (e.g. Bagel, NextStep, GLM-Image) only have `config.json`,
+        # so we fall back to reading that and mapping model_type manually.
+        try:
+            config_dict = get_hf_file_to_dict("model_index.json", od_config.model)
+            if config_dict is not None:
+                od_config.model_class_name = config_dict.get("_class_name", None)
+                od_config.update_multimodal_support()
+                tf_config_dict = get_hf_file_to_dict("transformer/config.json", od_config.model)
+                od_config.tf_model_config = TransformerConfig.from_dict(tf_config_dict)
+            else:
+                raise FileNotFoundError("model_index.json not found")
+        except (AttributeError, OSError, ValueError, FileNotFoundError):
+            cfg = get_hf_file_to_dict("config.json", od_config.model)
+            if cfg is None:
+                raise ValueError(f"Could not find config.json or model_index.json for model {od_config.model}")
+
+            # No usable model_index.json: pick the pipeline class from model_type / architectures.
+            model_type = cfg.get("model_type")
+            architectures = cfg.get("architectures") or []
+            pipeline_class = None
+            if model_type == "bagel" or "BagelForConditionalGeneration" in architectures:
+                pipeline_class = "BagelPipeline"
+            elif model_type == "nextstep":
+                # Keep a preset class; leaving None here used to trip the error below.
+                pipeline_class = od_config.model_class_name or "NextStep11Pipeline"
+            elif model_type == "glm-image" or "GlmImageForConditionalGeneration" in architectures:
+                pipeline_class = "GlmImagePipeline"
+            elif architectures and len(architectures) == 1:
+                pipeline_class = architectures[0]
+
+            if pipeline_class is None:
+                raise ValueError(f"Unknown model type: {model_type}, architectures: {architectures}")
+            od_config.model_class_name = pipeline_class
+            od_config.tf_model_config = TransformerConfig()
+            od_config.update_multimodal_support()
+
+        self.engine: DiffusionEngine = DiffusionEngine.make_engine(od_config)
+
+    def generate(
+        self,
+        prompts: OmniPromptType | Sequence[OmniPromptType],
+        sampling_params: OmniDiffusionSamplingParams,
+        request_ids: list[str] | None = None,
+    ) -> list[OmniRequestOutput]:
+        """Run one diffusion request; missing IDs are appended to ``request_ids`` as "<index>_<uuid4>"."""
+        prompts = [prompts] if isinstance(prompts, str | dict) else list(prompts)
+        # A fresh list per call: the old ``[]`` default was shared, so IDs generated by
+        # one call were silently reused by every later call that omitted request_ids.
+        if request_ids is None:
+            request_ids = []
+        # range() fixes the indices up front; re-reading len() while extend() grows
+        # the list made the prefixes skip values (0, 2, 4, ...).
+        request_ids.extend(f"{i}_{uuid.uuid4()}" for i in range(len(request_ids), len(prompts)))
+        request = OmniDiffusionRequest(prompts, sampling_params, request_ids)
+        return self._run_engine(request)
+
+    def _run_engine(self, request: OmniDiffusionRequest) -> list[OmniRequestOutput]:
+        return self.engine.step(request)  # a single engine.step() call handles the whole request
+
+    def close(self) -> None:
+        self.engine.close()  # raises AttributeError if __init__ failed before self.engine was set
+
+    def __del__(self):  # pragma: no cover - best effort cleanup
+        try:
+            self.close()
+        except Exception:
+            pass  # deliberately silent: any error from close() is ignored during finalization
+
+    def start_profile(self, trace_filename: str | None = None) -> None:
+        """Begin a profiling session on the underlying diffusion engine.
+
+        Args:
+            trace_filename: Optional base filename for trace files; passed to the
+                engine unchanged (None included).
+        """
+        # Guard first: a missing or falsy engine cannot be profiled.
+        if not getattr(self, "engine", None):
+            raise RuntimeError("Diffusion engine not initialized")
+        self.engine.start_profile(trace_filename)
+
+    def stop_profile(self) -> dict:
+        """End the profiling session and hand back whatever the engine reports.
+
+        Returns:
+            The result of ``engine.stop_profile()``; RuntimeError is raised if no engine is attached.
+        """
+        # Guard first: a missing or falsy engine has nothing to stop.
+        if not getattr(self, "engine", None):
+            raise RuntimeError("Diffusion engine not initialized")
+        return self.engine.stop_profile()
diff --git a/vllm_omni/entrypoints/omni_llm.py b/vllm_omni/entrypoints/omni_llm.py
new file mode 100644
index 00000000000..ec9248e3041
--- /dev/null
+++ b/vllm_omni/entrypoints/omni_llm.py
@@ -0,0 +1,242 @@
+from collections.abc import Callable
+from typing import Any
+
+import cloudpickle
+from pydantic import ValidationError
+from tqdm import tqdm
+
+# External library imports (vLLM)
+from vllm.config import CompilationConfig, StructuredOutputsConfig, is_init_field
+from vllm.entrypoints.llm import LLM
+from vllm.logger import init_logger
+from vllm.outputs import PoolingRequestOutput, RequestOutput
+from vllm.plugins.io_processors import get_io_processor
+from vllm.usage.usage_lib import UsageContext
+from vllm.utils.counter import Counter
+from vllm.v1.engine.llm_engine import LLMEngine
+
+from vllm_omni.distributed.omni_connectors import initialize_orchestrator_connectors
+
+# Internal imports (our code)
+from vllm_omni.engine.arg_utils import OmniEngineArgs
+from vllm_omni.engine.input_processor import OmniInputProcessor
+from vllm_omni.engine.output_processor import MultimodalOutputProcessor
+from vllm_omni.entrypoints.utils import (
+    filter_dataclass_kwargs,
+    load_stage_configs_from_model,
+    load_stage_configs_from_yaml,
+    resolve_model_config_path,
+)
+
+logger = init_logger(__name__)
+
+
+class OmniLLM(LLM):
+    """Main entry point for vLLM-Omni inference.
+
+    This class extends the base vLLM LLM class with omni-specific
+    processors for handling multimodal inputs and outputs. It provides
+    configuration loading for multi-stage pipelines, while stage management
+    is handled by the Omni class.
+
+    Args:
+        model: Model name or path to load
+        stage_configs_path: Optional path to YAML file containing stage
+            configurations. If None, configurations are loaded from the model.
+        log_stats: Whether to enable statistics logging
+        compilation_config: Optional compilation configuration. Can be an
+            integer (compilation level), dict, or CompilationConfig instance.
+        hf_overrides: Optional HuggingFace model configuration overrides
+        structured_outputs_config: Optional structured outputs configuration.
+            Can be a dict or StructuredOutputsConfig instance.
+        init_sleep_seconds: Number of seconds to sleep between starting
+            each stage process during initialization (used by Omni class)
+        shm_threshold_bytes: Threshold in bytes for using shared memory
+            for IPC. Objects larger than this threshold will use shared memory.
+        batch_timeout: Timeout in seconds for batching requests within a stage
+        init_timeout: Timeout in seconds for waiting for all stages to initialize
+        **kwargs: Additional keyword arguments passed to the base LLM class
+            and engine
+
+    Example:
+        >>> llm = OmniLLM(model="Qwen/Qwen2.5-Omni-7B")
+        >>> # Stage management is handled by Omni class
+    """
+
+    def __init__(
+        self,
+        model: str,
+        stage_configs_path: str | None = None,
+        log_stats: bool = False,
+        compilation_config: int | dict[str, Any] | CompilationConfig | None = None,
+        hf_overrides: dict[str, Any] | None = None,  # NOTE(review): never read below, so not forwarded to engine args
+        structured_outputs_config: dict[str, Any] | StructuredOutputsConfig | None = None,
+        init_sleep_seconds: int = 20,  # not read in this constructor body
+        shm_threshold_bytes: int = 65536,
+        batch_timeout: int = 10,
+        init_timeout: int = 300,  # not read in this constructor body
+        **kwargs: Any,
+    ):
+        """Load stage configs and connectors, then build the engine without calling ``LLM.__init__``."""
+        # Store stage management parameters (used by Omni class)
+        self.worker_backend = kwargs.get("worker_backend", "multi_process")
+        self.ray_address = kwargs.get("ray_address", None)
+        self.batch_timeout = batch_timeout
+        self.log_stats: bool = bool(log_stats)
+
+        # Load stage configurations: from the model itself unless an explicit YAML path is given
+        if stage_configs_path is None:
+            self.config_path = resolve_model_config_path(model)
+            self.stage_configs = load_stage_configs_from_model(model)
+        else:
+            self.config_path = stage_configs_path
+            self.stage_configs = load_stage_configs_from_yaml(stage_configs_path)
+
+        # Initialize connectors
+        self.omni_transfer_config, self.connectors = initialize_orchestrator_connectors(
+            self.config_path, worker_backend=self.worker_backend, shm_threshold_bytes=shm_threshold_bytes
+        )
+
+        # Engine stat logging is disabled unless the caller passed disable_log_stats explicitly
+        if "disable_log_stats" not in kwargs:
+            kwargs["disable_log_stats"] = True
+
+        if "worker_cls" in kwargs:
+            worker_cls = kwargs["worker_cls"]
+            # if the worker_cls is not qualified string name,
+            # we serialize it using cloudpickle to avoid pickling issues
+            if isinstance(worker_cls, type):
+                kwargs["worker_cls"] = cloudpickle.dumps(worker_cls)
+
+        # A dict-valued kv_transfer_config is converted to a KVTransferConfig; validation errors become ValueError
+        if "kv_transfer_config" in kwargs and isinstance(kwargs["kv_transfer_config"], dict):
+            from vllm.config.kv_transfer import KVTransferConfig
+
+            raw_config_dict = kwargs["kv_transfer_config"]
+            try:
+                kwargs["kv_transfer_config"] = KVTransferConfig(**raw_config_dict)
+            except ValidationError as e:
+                logger.error(
+                    "Failed to convert 'kv_transfer_config' dict to KVTransferConfig object. Dict: %s. Error: %s",
+                    raw_config_dict,
+                    e,
+                )
+                raise ValueError(f"Invalid 'kv_transfer_config' provided: {e}") from e
+
+        # Extract omni_kv_config from kwargs if present (injected by Omni)
+        omni_kv_config = kwargs.pop("omni_kv_config", None)
+
+        # Normalize compilation_config (int level, dict, instance, or None) into a CompilationConfig
+        if compilation_config is not None:
+            if isinstance(compilation_config, int):
+                compilation_config_instance = CompilationConfig(level=compilation_config)
+            elif isinstance(compilation_config, dict):
+                compilation_config_instance = CompilationConfig(
+                    **{k: v for k, v in compilation_config.items() if is_init_field(CompilationConfig, k)}
+                )
+            else:
+                compilation_config_instance = compilation_config
+        else:
+            compilation_config_instance = CompilationConfig()
+
+        # Same normalization for structured_outputs_config; dict keys that are not init fields are dropped
+        if structured_outputs_config is not None:
+            if isinstance(structured_outputs_config, dict):
+                structured_outputs_instance = StructuredOutputsConfig(
+                    **{k: v for k, v in structured_outputs_config.items() if is_init_field(StructuredOutputsConfig, k)}
+                )
+            else:
+                structured_outputs_instance = structured_outputs_config
+        else:
+            structured_outputs_instance = StructuredOutputsConfig()
+
+        engine_args = OmniEngineArgs(
+            model=model,
+            compilation_config=compilation_config_instance,
+            structured_outputs_config=structured_outputs_instance,
+            omni_kv_config=omni_kv_config,
+            **filter_dataclass_kwargs(OmniEngineArgs, kwargs),
+        )
+
+        # Create the engine (LLMEngine is imported from vllm.v1.engine.llm_engine in this file)
+        self.llm_engine = LLMEngine.from_engine_args(engine_args=engine_args, usage_context=UsageContext.LLM_CLASS)
+        self.llm_engine.output_processor = MultimodalOutputProcessor(
+            tokenizer=self.llm_engine.tokenizer,
+            log_stats=self.llm_engine.log_stats,
+            engine_core_output_type=engine_args.engine_output_type,
+        )
+        self.llm_engine.input_processor = OmniInputProcessor(vllm_config=self.llm_engine.vllm_config)
+        self.engine_class = type(self.llm_engine)
+
+        self.request_counter = Counter()
+        self.default_sampling_params: dict[str, Any] | None = None
+
+        supported_tasks = self.llm_engine.get_supported_tasks()  # type: ignore
+
+        logger.info("Supported_tasks: %s", supported_tasks)
+
+        self.supported_tasks = supported_tasks
+
+        # Load the Input/Output processor plugin if any
+        io_processor_plugin = self.llm_engine.model_config.io_processor_plugin
+        self.io_processor = get_io_processor(self.llm_engine.vllm_config, io_processor_plugin)
+        self.model_config = self.llm_engine.model_config
+        self.input_processor = self.llm_engine.input_processor
+
+    def close(self) -> None:
+        """Shut down the wrapped LLM engine, if one was created.
+
+        Only the engine is touched here; stages are not closed by this method.
+        An engine that exposes no ``shutdown`` attribute is left alone.
+        """
+        engine = getattr(self, "llm_engine", None)
+        if engine is None or not hasattr(engine, "shutdown"):
+            return
+        engine.shutdown()
+
+    def __del__(self) -> None:  # best-effort
+        try:
+            self.close()
+        except Exception as e:  # nothing propagates out of the finalizer; the error is only logged at debug level
+            logger.debug("[Orchestrator] __del__ close() raised: %s", e, exc_info=True)
+
+    def _run_engine(self, *, use_tqdm: bool | Callable[..., tqdm] = True) -> list[RequestOutput | PoolingRequestOutput]:
+        """Step the engine until nothing is unfinished; return finished outputs ordered by request ID prefix."""
+        show_progress = bool(use_tqdm)
+        if show_progress:
+            num_requests = self.llm_engine.get_num_unfinished_requests()
+            make_bar = use_tqdm if callable(use_tqdm) else tqdm
+            pbar = make_bar(
+                total=num_requests,
+                desc="Processed prompts",
+                dynamic_ncols=True,
+                postfix=(f"est. speed input: {0:.2f} toks/s, output: {0:.2f} toks/s"),
+            )
+
+        finished: list[RequestOutput | PoolingRequestOutput] = []
+        prompt_tokens = 0
+        generated_tokens = 0
+        while self.llm_engine.has_unfinished_requests():
+            for output in self.llm_engine.step():
+                if not output.finished:
+                    continue
+                finished.append(output)
+                if not show_progress:
+                    continue
+                if isinstance(output, RequestOutput):
+                    # Token throughput is only tracked for RequestOutput.
+                    n = len(output.outputs)
+                    assert output.prompt_token_ids is not None
+                    prompt_tokens += len(output.prompt_token_ids) * n
+                    in_spd = prompt_tokens / pbar.format_dict["elapsed"]
+                    generated_tokens += sum(len(stp.token_ids) for stp in output.outputs)
+                    out_spd = generated_tokens / pbar.format_dict["elapsed"]
+                    pbar.postfix = f"est. speed input: {in_spd:.2f} toks/s, output: {out_spd:.2f} toks/s"
+                    pbar.update(n)
+                else:
+                    pbar.update(1)
+                if pbar.n == num_requests:
+                    pbar.refresh()
+
+        if show_progress:
+            pbar.close()
+        # Request IDs have the form 'int-uuid'. A request may finish before an
+        # earlier one, so the results are put back in order of that int prefix.
+        return sorted(finished, key=lambda x: int(x.request_id.split("-")[0]))
diff --git a/vllm_omni/entrypoints/omni_stage.py b/vllm_omni/entrypoints/omni_stage.py
new file mode 100644
index 00000000000..ba10dbcc1c0
--- /dev/null
+++ b/vllm_omni/entrypoints/omni_stage.py
@@ -0,0 +1,1572 @@
+"""
+Stage manager for orchestrating multiple engines in vLLM-Omni.
+
+Enhanced to encapsulate per-stage process lifecycle and worker logic
+(device setup, LLM init, batching, shared-memory IPC), while preserving
+the original input processing utilities for cross-stage data wiring.
+"""
+
+import asyncio
+import fcntl
+import importlib
+import multiprocessing as mp
+import os
+import queue
+import sys
+import time
+import traceback
+from collections.abc import Sequence
+from contextlib import contextmanager
+from dataclasses import fields
+from typing import Any, Literal, cast
+
+from vllm import PromptType, RequestOutput
+from vllm.inputs import TextPrompt
+from vllm.inputs.preprocess import InputPreprocessor
+from vllm.logger import init_logger
+from vllm.sampling_params import SamplingParams
+from vllm.tokenizers import TokenizerLike
+from vllm.usage.usage_lib import UsageContext
+from vllm.v1.engine import EngineCoreOutput
+from vllm.v1.engine.async_llm import AsyncLLM
+from vllm.v1.engine.llm_engine import LLMEngine
+
+from vllm_omni.diffusion.data import OmniDiffusionConfig
+from vllm_omni.distributed.omni_connectors import build_stage_connectors
+from vllm_omni.distributed.omni_connectors.adapter import try_recv_via_connector
+from vllm_omni.distributed.omni_connectors.connectors.base import OmniConnectorBase
+from vllm_omni.distributed.ray_utils.utils import kill_ray_actor, start_ray_actor
+from vllm_omni.engine.arg_utils import AsyncOmniEngineArgs, OmniEngineArgs
+from vllm_omni.entrypoints.async_omni_diffusion import AsyncOmniDiffusion
+from vllm_omni.entrypoints.async_omni_llm import AsyncOmniLLM
+from vllm_omni.entrypoints.omni_diffusion import OmniDiffusion
+from vllm_omni.entrypoints.omni_llm import OmniLLM
+from vllm_omni.entrypoints.stage_utils import (
+    SHUTDOWN_TASK,
+    OmniStageTaskType,
+    _resolve_model_tokenizer_paths,
+    _to_dict,
+    is_profiler_task,
+    maybe_dump_to_shm,
+    set_stage_devices,
+)
+from vllm_omni.entrypoints.utils import detect_pid_host, filter_dataclass_kwargs
+from vllm_omni.entrypoints.zmq_utils import (
+    ZmqQueue,
+    create_zmq_queue,
+)
+from vllm_omni.inputs.data import OmniDiffusionSamplingParams, OmniPromptType, OmniSamplingParams, OmniTokensPrompt
+from vllm_omni.metrics import count_tokens_from_outputs
+from vllm_omni.outputs import OmniRequestOutput
+
+logger = init_logger(__name__)
+
+
+@contextmanager
+def _sequential_init_lock(engine_args: dict[str, Any], stage_init_timeout: int = 300):
+    """Serialize stage initialization with per-device file locks when needed.
+
+    Yields at once when NVML process-scoped memory tracking and a PID host are both
+    available. Otherwise flocks one /tmp lock file per device implied by the parallel
+    sizes in ``engine_args``, proceeding unlocked after ``stage_init_timeout`` seconds.
+    """
+    from vllm_omni.worker.gpu_memory_utils import is_process_scoped_memory_available
+
+    nvml_available = is_process_scoped_memory_available()
+    pid_host = detect_pid_host()
+
+    if nvml_available and pid_host:
+        logger.info(
+            "NVML process-scoped memory available and PID host is available — concurrent init is safe, skipping locks"
+        )
+        yield
+        return
+    else:
+        logger.info(
+            "Using sequential init locks (nvml_available=%s, pid_host=%s)",
+            nvml_available,
+            pid_host,
+        )
+
+    from vllm_omni.platforms import current_omni_platform
+
+    # Get all parallel sizes from engine_args or parallel_config (defaults to 1)
+    if "parallel_config" in engine_args:
+        parallel_config = engine_args["parallel_config"]
+        tensor_parallel_size = parallel_config.get("tensor_parallel_size", 1)
+        pipeline_parallel_size = parallel_config.get("pipeline_parallel_size", 1)
+        data_parallel_size = parallel_config.get("data_parallel_size", 1)
+        prefill_context_parallel_size = parallel_config.get("prefill_context_parallel_size", 1)
+        sequence_parallel_size = parallel_config.get("sequence_parallel_size", 1)
+        cfg_parallel_size = parallel_config.get("cfg_parallel_size", 1)
+    else:
+        tensor_parallel_size = engine_args.get("tensor_parallel_size", 1)
+        pipeline_parallel_size = engine_args.get("pipeline_parallel_size", 1)
+        data_parallel_size = engine_args.get("data_parallel_size", 1)
+        prefill_context_parallel_size = engine_args.get("prefill_context_parallel_size", 1)
+        sequence_parallel_size = 1
+        cfg_parallel_size = 1
+
+    num_devices_per_stage = (
+        tensor_parallel_size
+        * pipeline_parallel_size
+        * data_parallel_size
+        * prefill_context_parallel_size
+        * sequence_parallel_size
+        * cfg_parallel_size
+    )
+
+    # Get physical device IDs from device control env var
+    device_control_env = current_omni_platform.device_control_env_var
+    visible_devices_str = os.environ.get(device_control_env)
+    physical_devices = []
+
+    if visible_devices_str:
+        try:
+            physical_devices = [int(x.strip()) for x in visible_devices_str.split(",") if x.strip()]
+        except (ValueError, IndexError):
+            pass  # non-integer entries: fall back to enumerating devices below
+
+    if not physical_devices:
+        num_devices = current_omni_platform.get_device_count()
+        physical_devices = list(range(num_devices))
+
+    num_devices_to_lock = min(num_devices_per_stage, len(physical_devices))
+    devices_to_lock = sorted(physical_devices[:num_devices_to_lock])
+
+    logger.debug(
+        "Parallel config: TP=%d, PP=%d, DP=%d, PCP=%d, SP=%d, CFG=%d; will lock %d devices: %s",
+        tensor_parallel_size,
+        pipeline_parallel_size,
+        data_parallel_size,
+        prefill_context_parallel_size,
+        sequence_parallel_size,
+        cfg_parallel_size,
+        num_devices_to_lock,
+        devices_to_lock,
+    )
+
+    # Acquire exclusive locks for all devices using fcntl.flock
+    wait_start = time.time()
+    acquired_lock_fds = []
+
+    for device_id in devices_to_lock:
+        lock_file = f"/tmp/vllm_omni_device_{device_id}_init.lock"
+        lock_acquired = False
+        while not lock_acquired:
+            # Reset per attempt: closing a stale fd from an earlier device would drop its flock.
+            lock_fd = None
+            try:
+                lock_fd = os.open(lock_file, os.O_CREAT | os.O_RDWR, 0o644)
+                try:
+                    fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
+                    os.ftruncate(lock_fd, 0)
+                    os.write(lock_fd, f"{os.getpid()}\n".encode())
+                    os.fsync(lock_fd)
+                    lock_acquired = True
+                    acquired_lock_fds.append(lock_fd)
+                    logger.debug("Acquired exclusive lock for device %s", device_id)
+                except BlockingIOError:
+                    os.close(lock_fd)
+                    lock_fd = None
+                    if time.time() - wait_start > stage_init_timeout:
+                        logger.warning(
+                            "Timeout waiting for device %s initialization lock, proceeding anyway",
+                            device_id,
+                        )
+                        break
+                    time.sleep(0.1)
+            except OSError as e:
+                logger.debug(
+                    "Failed to acquire lock for device %s: %s, continuing anyway",
+                    device_id,
+                    e,
+                )
+                if lock_fd is not None:
+                    try:
+                        os.close(lock_fd)
+                    except OSError:
+                        pass
+                break
+
+    # Set FD_CLOEXEC to prevent child processes from inheriting locks
+    for lock_fd in acquired_lock_fds:
+        try:
+            flags = fcntl.fcntl(lock_fd, fcntl.F_GETFD)
+            fcntl.fcntl(lock_fd, fcntl.F_SETFD, flags | fcntl.FD_CLOEXEC)
+        except (OSError, ValueError):
+            pass
+
+    try:
+        yield
+    finally:
+        for lock_fd in acquired_lock_fds:
+            try:
+                fcntl.flock(lock_fd, fcntl.LOCK_UN)
+                os.close(lock_fd)
+                logger.debug("Released initialization lock (fd=%s)", lock_fd)
+            except (OSError, ValueError):
+                pass
+
+
+def _resolve_worker_cls(engine_args: dict[str, Any]) -> None:
+    """Fill in ``engine_args["worker_cls"]`` from ``worker_type`` when it is unset or "auto"."""
+    requested = engine_args.get("worker_type", None)
+    if not requested:
+        return
+    current = engine_args.get("worker_cls")
+    if not (current is None or current == "auto"):
+        return
+    from vllm_omni.platforms import current_omni_platform
+    kind = str(requested).lower()
+    if kind == "ar":
+        engine_args["worker_cls"] = current_omni_platform.get_omni_ar_worker_cls()
+    elif kind == "generation":
+        engine_args["worker_cls"] = current_omni_platform.get_omni_generation_worker_cls()
+    else:
+        raise ValueError(f"Unknown worker_type: {kind}")
+
+
+def _build_od_config(engine_args: dict[str, Any], model: str) -> dict[str, Any]:
+    """Return ``engine_args["od_config"]`` if non-empty, else kwargs gathered from OmniDiffusionConfig fields."""
+    explicit = engine_args.get("od_config", {})
+    if explicit:
+        return explicit
+    known_fields = {f.name for f in fields(OmniDiffusionConfig)}
+    # Copy only the engine args that OmniDiffusionConfig declares as fields.
+    collected = {"model": model, **{key: value for key, value in engine_args.items() if key in known_fields}}
+    # An engine_args "model" entry may have overwritten it above; put the resolved path back.
+    collected["model"] = model
+    return collected
+
+
+class OmniStage:
+    """Stage manager for orchestrating a single stage in the omni pipeline.
+
+    Encapsulates per-stage process lifecycle and worker logic, including
+    device setup, LLM initialization, batching, and shared-memory IPC.
+    Preserves input processing utilities for cross-stage data wiring.
+
+    Args:
+        stage_config: Stage configuration object containing engine arguments,
+            runtime settings, and stage-specific parameters
+    """
+
+    def __init__(self, stage_config: Any, stage_init_timeout: int = 300):
+        logger.debug(f"[OmniStage] stage_config: {stage_config}")
+        self.stage_config = stage_config
+        self.engine = None  # filled in later via set_engine()
+        self.async_engine = None  # filled in later via set_async_engine()
+        self.vllm_config = None
+        self.tokenizer = None
+        self.input_preprocessor = None
+        self.is_tracing_enabled = False
+        self.stage_id = stage_config.stage_id
+        self.engine_args = stage_config.engine_args
+        self.model_stage = stage_config.engine_args.model_stage
+        self.requires_multimodal_data = getattr(stage_config.runtime, "requires_multimodal_data", False)
+        self.engine_input_source = getattr(stage_config, "engine_input_source", [])
+        self.engine_output_type = getattr(stage_config.engine_args, "engine_output_type", None)
+        self.engine_outputs = None
+        self.is_comprehension = getattr(stage_config, "is_comprehension", False)
+        # Support for different stage types: "llm" (default) or "diffusion"
+        self.stage_type: Literal["llm", "diffusion"] = getattr(stage_config, "stage_type", "llm")
+        if (
+            "stage_id" in stage_config.engine_args
+            and stage_config.engine_args.stage_id != self.stage_id
+            and self.stage_id is not None
+        ):
+            stage_config.engine_args.stage_id = self.stage_id  # the stage's own id wins over engine_args
+        if hasattr(stage_config, "custom_process_input_func"):
+            # Import the module specified in the config (already a full module path)
+            module_path, func_name = stage_config.custom_process_input_func.rsplit(".", 1)
+            module = importlib.import_module(module_path)
+            self.custom_process_input_func = getattr(module, func_name)
+        else:
+            self.custom_process_input_func = None
+
+        self.final_output = getattr(stage_config, "final_output", False)
+        self.final_output_type = getattr(stage_config, "final_output_type", None)
+        self.tts_args = _to_dict(getattr(stage_config, "tts_args", {}))
+        default_sampling_params = getattr(stage_config, "default_sampling_params", {})
+        # For LLM stage, this can directly be a SamplingParams-compatible dict;
+        # For diffusion stage, this only serves as default values for diffusion kwargs.
+        default_sampling_params = _to_dict(default_sampling_params)
+        # Instantiate the matching params class now so unknown keys fail here with a TypeError
+        try:
+            self.default_sampling_params = (
+                SamplingParams if self.stage_type == "llm" else OmniDiffusionSamplingParams
+            )(**default_sampling_params)
+        except TypeError as error:
+            raise TypeError(f"Invalid default_sampling_params for stage {self.stage_id}: {error}") from error
+        # Runtime orchestration state: IPC queues (set by attach_queues), process handle, tuning values
+        self._in_q: mp.queues.Queue | ZmqQueue | str | None = None
+        self._out_q: mp.queues.Queue | ZmqQueue | str | None = None
+        self._proc: mp.Process | None = None
+        self._shm_threshold_bytes: int = 65536
+        self._stage_init_timeout: int = stage_init_timeout
+
+    def set_engine(self, engine: LLMEngine) -> None:
+        """Set the LLM engine for this stage.
+
+        Args:
+            engine: LLMEngine instance stored on ``self.engine`` (None until this is called)
+        """
+        self.engine = engine
+
+    def set_async_engine(self, async_engine: AsyncLLM) -> None:
+        """Set the async LLM engine for this stage.
+
+        Args:
+            async_engine: AsyncLLM instance stored on ``self.async_engine`` (None until this is called)
+        """
+        self.async_engine = async_engine
+
+    def set_vllm_config(self, vllm_config: Any) -> None:
+        """Set the vLLM configuration for this stage.
+
+        Args:
+            vllm_config: Config object stored as-is; presumably a VllmConfig sent by the worker (confirm at caller)
+        """
+        self.vllm_config = vllm_config
+
+    def set_tokenizer(self, tokenizer: TokenizerLike) -> None:
+        """Set the tokenizer for this stage.
+
+        Args:
+            tokenizer: Tokenizer stored as-is; presumably sent by the worker process (confirm at caller)
+        """
+        self.tokenizer = tokenizer
+
+    def set_input_preprocessor(self, input_preprocessor: InputPreprocessor) -> None:
+        """Set the input preprocessor for this stage.
+
+        The reference is stored on ``self.input_preprocessor`` unchanged.
+
+        Args:
+            input_preprocessor: InputPreprocessor instance received from worker process
+        """
+        self.input_preprocessor = input_preprocessor
+
+    def set_is_tracing_enabled(self, is_tracing_enabled: bool) -> None:
+        """Set whether tracing is enabled for this stage.
+
+        Only records the flag on ``self.is_tracing_enabled``; this method
+        does not itself start or stop any tracing.
+
+        Args:
+            is_tracing_enabled: Boolean indicating if tracing is enabled
+        """
+        self.is_tracing_enabled = is_tracing_enabled
+
+    def set_engine_outputs(self, engine_outputs: EngineCoreOutput) -> None:
+        """Set the engine outputs for this stage.
+
+        The value is stored on ``self.engine_outputs``, which
+        ``process_engine_inputs`` reads from the source stage when deriving
+        inputs for a downstream stage.
+
+        Args:
+            engine_outputs: EngineCoreOutput from this stage's processing
+        """
+        self.engine_outputs = engine_outputs
+
+    # ----------------- New Orchestration APIs -----------------
+    def attach_queues(
+        self,
+        in_q: mp.queues.Queue | ZmqQueue | str | None,
+        out_q: mp.queues.Queue | ZmqQueue | str | None,
+    ) -> None:
+        """Bind the IPC channels used to exchange messages with the stage worker.
+
+        Args:
+            in_q: Channel on which tasks are submitted to the worker (queue object or endpoint string)
+            out_q: Channel from which worker results are collected (queue object or endpoint string)
+        """
+        self._in_q, self._out_q = in_q, out_q
+
+    def stop_profile(self) -> dict:
+        """Ask the worker to stop profiling and return the data it reports.
+
+        Returns:
+            The ``data`` field of the worker's ``profiler_result`` reply, or an
+            empty dict when the queues are not attached, the worker replies
+            with an error, an unrelated message arrives first, or no reply is
+            received within 600 seconds.
+        """
+        queues_ready = self._in_q is not None and self._out_q is not None
+        if not queues_ready:
+            logger.warning(f"[Stage-{self.stage_id}] Queues not initialized, cannot stop profile.")
+            return {}
+
+        logger.info(f"[Stage-{self.stage_id}] Sending PROFILER_STOP to worker...")
+        self.submit({"type": OmniStageTaskType.PROFILER_STOP})
+
+        try:
+            # Flushing profiler files may be slow, hence the generous 600s wait.
+            reply = self._out_q.get(timeout=600)
+        except queue.Empty:
+            logger.error(f"[Stage-{self.stage_id}] Timeout waiting for profiler results.")
+            return {}
+
+        reply_is_dict = isinstance(reply, dict)
+        if reply_is_dict and reply.get("type") == "profiler_result":
+            return reply.get("data", {})
+        if reply_is_dict and "error" in reply:
+            logger.error(f"[Stage-{self.stage_id}] Profiler error: {reply['error']}")
+            return {}
+
+        # Anything else (e.g. a late generation result) has been consumed from
+        # the queue at this point and is dropped after being logged.
+        logger.warning(
+            f"[Stage-{self.stage_id}] Received unexpected message while waiting for profiler: {reply}"
+        )
+        return {}
+
+    def init_stage_worker(
+        self,
+        model: str,
+        *,
+        is_async: bool = False,
+        shm_threshold_bytes: int = 65536,
+        ctx: mp.context.BaseContext | None = None,
+        batch_timeout: int = 10,
+        connectors_config: dict | None = None,
+        worker_backend: str = "multi_process",
+        ignore_runtime_config: bool = False,
+        **kwargs: Any,
+    ) -> None:
+        """Launch the worker (process or Ray actor) that serves this stage.
+
+        A plain-dict payload describing the stage is assembled and handed to
+        the worker entry point: ``_stage_worker_async_entry`` when ``is_async``
+        is set, ``_stage_worker`` otherwise. While the worker is being
+        launched, ``VLLM_LOGGING_PREFIX`` is prefixed with ``[Stage-<id>]``;
+        the previous value (or absence) is restored afterwards.
+
+        Args:
+            model: Model name or path to load
+            is_async: Select the async worker entry point (default: False)
+            shm_threshold_bytes: Threshold for using shared memory for IPC;
+                replaced by ``sys.maxsize`` when ``worker_backend`` is "ray"
+            ctx: Multiprocessing context; a "spawn" context is created when omitted
+            batch_timeout: Timeout in seconds for batching requests
+            connectors_config: Configuration for stage connectors
+            worker_backend: "ray" starts a Ray actor; any other value starts a process
+            ignore_runtime_config: When True, an empty runtime config is sent to the worker
+            **kwargs: Additional arguments; ``ray_placement_group`` is required for "ray"
+
+        Raises:
+            AssertionError: If queues are not attached, or the Ray placement
+                group is missing for the "ray" backend
+        """
+        queues_attached = self._in_q is not None and self._out_q is not None
+        assert queues_attached, "Queues must be attached before start_process"
+
+        use_ray = worker_backend == "ray"
+        if use_ray:
+            ray_placement_group = kwargs.get("ray_placement_group", None)
+            assert ray_placement_group is not None, "Ray placement group must be provided"
+        self._shm_threshold_bytes = sys.maxsize if use_ray else shm_threshold_bytes
+
+        ctx = ctx or mp.get_context("spawn")
+        worker_entry = _stage_worker_async_entry if is_async else _stage_worker
+
+        # Only plain dicts are sent to the worker.
+        engine_args = _to_dict(self.engine_args)
+        runtime_cfg = {} if ignore_runtime_config else _to_dict(getattr(self.stage_config, "runtime", {}))
+        stage_payload: dict[str, Any] = {
+            "stage_id": self.stage_id,
+            "engine_args": engine_args,
+            "runtime": runtime_cfg,
+            "shm_threshold_bytes": self._shm_threshold_bytes,
+            "connectors_config": connectors_config or {},
+            "stage_type": self.stage_type,
+            "engine_input_source": self.engine_input_source,
+            "final_output": self.final_output,
+            "final_output_type": self.final_output_type,
+        }
+
+        old_env = os.environ.get("VLLM_LOGGING_PREFIX")
+        inherited_prefix = "" if old_env is None else old_env
+        try:
+            os.environ["VLLM_LOGGING_PREFIX"] = f"[Stage-{self.stage_id}] {inherited_prefix}"
+            if use_ray:
+                self._ray_actor = start_ray_actor(
+                    worker_entry,
+                    ray_placement_group,
+                    self.stage_id,
+                    model=model,
+                    stage_payload=stage_payload,
+                    in_q=self._in_q,
+                    out_q=self._out_q,
+                    batch_timeout=batch_timeout,
+                    stage_init_timeout=self._stage_init_timeout,
+                )
+            else:
+                # A ZmqQueue is handed to the child as its endpoint; any other
+                # queue value is passed through unchanged.
+                in_arg = self._in_q.endpoint if isinstance(self._in_q, ZmqQueue) else self._in_q
+                out_arg = self._out_q.endpoint if isinstance(self._out_q, ZmqQueue) else self._out_q
+                self._proc = ctx.Process(
+                    target=worker_entry,
+                    args=(
+                        model,
+                        stage_payload,
+                        in_arg,
+                        out_arg,
+                        batch_timeout,
+                        self._stage_init_timeout,
+                    ),
+                )
+                self._proc.start()
+        finally:
+            # Put the logging prefix back exactly as it was found.
+            if old_env is not None:
+                os.environ["VLLM_LOGGING_PREFIX"] = old_env
+            else:
+                os.environ.pop("VLLM_LOGGING_PREFIX", None)
+
+    def stop_stage_worker(self) -> None:
+        """Shut down this stage's worker and close the attached queues.
+
+        A shutdown task is pushed onto the input queue (a failure to do so is
+        only logged), then both queues are closed if they expose a callable
+        ``close``. A Ray actor, when present, is killed; otherwise the worker
+        process is joined for up to 5 seconds and terminated if still alive.
+        """
+
+        def _close_if_supported(q):
+            # Not every queue flavour has close(); call it only when present.
+            closer = getattr(q, "close", None)
+            if callable(closer):
+                closer()
+
+        in_q, out_q = self._in_q, self._out_q
+        if in_q is not None:
+            try:
+                in_q.put_nowait(SHUTDOWN_TASK)
+            except Exception as e:
+                logger.warning("Failed to send shutdown to in_q: %s", e)
+            _close_if_supported(in_q)
+        if out_q is not None:
+            _close_if_supported(out_q)
+
+        # The Ray actor attribute only exists once a Ray worker was started.
+        if getattr(self, "_ray_actor", None):
+            kill_ray_actor(self._ray_actor)
+            self._ray_actor = None
+            return
+
+        proc = self._proc
+        if proc is None:
+            return
+        try:
+            proc.join(timeout=5)
+        except Exception as e:
+            logger.debug("join() failed: %s", e)
+        if not proc.is_alive():
+            return
+        try:
+            proc.terminate()
+        except Exception as e:
+            logger.warning("terminate() failed: %s", e)
+
+    def submit(self, payload: dict[str, Any]) -> None:
+        """Queue a task for the stage worker.
+
+        When the payload carries both ``request_id`` and ``engine_inputs``,
+        every dict-shaped engine input gets
+        ``additional_information["global_request_id"] = [str(request_id)]``
+        before the payload is put on the input queue. Per the original note,
+        this keeps the global ID available to workers for side-channel
+        operations such as KV transfer even if they use internal IDs.
+
+        Args:
+            payload: Dictionary containing task data (request_id, engine_inputs,
+                sampling_params, etc.)
+        """
+        assert self._in_q is not None
+
+        if "request_id" in payload and "engine_inputs" in payload:
+            rid_text = str(payload["request_id"])
+            inputs = payload["engine_inputs"]
+            targets = inputs if isinstance(inputs, list) else [inputs]
+            for target in targets:
+                # OmniTokensPrompt is a TypedDict, i.e. a plain dict at runtime;
+                # anything that is not a dict is left untouched.
+                if not isinstance(target, dict):
+                    continue
+                # A missing or None entry is replaced by a fresh dict.
+                if target.get("additional_information") is None:
+                    target["additional_information"] = {}
+                info = target["additional_information"]
+                if isinstance(info, dict):
+                    # Wrapped in a list because OmniInputProcessor requires Tensor or list values
+                    info["global_request_id"] = [rid_text]
+
+        self._in_q.put(payload)
+
+    def try_collect(self) -> dict[str, Any] | None:
+        """Try to collect a result from the stage worker without blocking.
+
+        Before polling, the Hugging Face dynamic-module cache directory is put
+        on ``sys.path`` so that messages referencing ``trust_remote_code``
+        classes (``transformers_modules``) can be deserialized in this process.
+        The directory is ``HF_MODULES_CACHE`` when that variable is set,
+        otherwise ``<HF_HOME>/modules`` with ``HF_HOME`` defaulting to
+        ``~/.cache/huggingface``.
+
+        Returns:
+            The next message from the output queue (result messages contain
+            request_id, engine_outputs or engine_outputs_shm, and metrics),
+            ``None`` when the queue is empty, or a minimal ``stage_ready``
+            message when fetching the message raised any other exception.
+        """
+        assert self._out_q is not None
+        hf_home = os.environ.get("HF_HOME", os.path.join(os.path.expanduser("~"), ".cache", "huggingface"))
+        # An explicit HF_MODULES_CACHE takes precedence over the HF_HOME-derived
+        # location, so relocated caches are importable too.
+        hf_modules = os.environ.get("HF_MODULES_CACHE") or os.path.join(hf_home, "modules")
+        if hf_modules not in sys.path:
+            sys.path.insert(0, hf_modules)
+        try:
+            return self._out_q.get_nowait()
+        except queue.Empty:
+            return None
+        except Exception as e:
+            # Keep the traceback: the message has already been consumed from the
+            # queue, so this log entry is the only record of what went wrong.
+            logger.error("[Stage-%s] try_collect deser error: %s", self.stage_id, e, exc_info=True)
+            # Message was consumed but deserialization failed (e.g. transformers_modules not loaded).
+            # Return minimal stage_ready so the orchestrator marks this stage as ready
+            # and triggers the engine_args fallback in _wait_for_stages_ready.
+            # NOTE(review): this fallback fires for ANY failed message, not only the
+            # readiness handshake - confirm the orchestrator tolerates a late stage_ready.
+            return {"type": "stage_ready", "stage_id": self.stage_id, "vllm_config": None, "tokenizer": None}
+
+    def process_engine_inputs(
+        self,
+        stage_list: list[Any],
+        prompt: OmniTokensPrompt | TextPrompt | list[OmniTokensPrompt | TextPrompt] | None = None,
+    ) -> list[OmniTokensPrompt | TextPrompt]:
+        """Process engine inputs for this stage from upstream stage outputs.
+
+        When a custom processing function is configured, it is called with the
+        stage list, ``engine_input_source``, the prompt exactly as given, and
+        the ``requires_multimodal_data`` flag. Otherwise one ``OmniTokensPrompt``
+        is built per output of the first source stage, using the token ids of
+        that output's first completion.
+
+        Args:
+            stage_list: List of all stages in the pipeline
+            prompt: Optional original prompt or list of prompts (for multimodal
+                data preservation). Prompts are paired with source outputs by
+                position; entries that are not dicts, and outputs without a
+                matching prompt, contribute no multimodal data.
+
+        Returns:
+            List of processed engine inputs ready for this stage
+
+        Raises:
+            ValueError: If engine_input_source is empty
+        """
+        if self.custom_process_input_func is not None:
+            return self.custom_process_input_func(
+                stage_list, self.engine_input_source, prompt, self.requires_multimodal_data
+            )
+
+        if not self.engine_input_source:
+            raise ValueError("engine_input_source is empty")
+        source_outputs = stage_list[self.engine_input_source[0]].engine_outputs
+
+        # Collect multimodal data only when this stage needs it. The prompt is
+        # optional, so a missing or non-dict prompt must not raise here.
+        mm_by_request: dict[Any, Any] = {}
+        if self.requires_multimodal_data:
+            prompts = prompt if isinstance(prompt, list) else [prompt]
+            for source_output, p in zip(source_outputs, prompts):
+                if isinstance(p, dict):
+                    mm_by_request[source_output.request_id] = p.get("multi_modal_data")
+
+        return [
+            OmniTokensPrompt(
+                prompt_token_ids=source_output.outputs[0].token_ids,
+                # .get() yields None when there are fewer prompts than outputs.
+                multi_modal_data=mm_by_request.get(source_output.request_id),
+            )
+            for source_output in source_outputs
+        ]
+
+
+def _stage_worker(
+    model: str,
+    stage_payload: dict[str, Any],
+    in_q: mp.queues.Queue | ZmqQueue | str,
+    out_q: mp.queues.Queue | ZmqQueue | str,
+    batch_timeout: int = 10,
+    stage_init_timeout: int = 300,
+) -> None:
+    """Stage worker entry: device setup, LLM init, batching, SHM IPC."""
+    # Use local aliases to avoid conflicts with global imports in worker process
+    logger.info(f"Starting stage worker with model: {model}")
+    import multiprocessing as _mp
+    import os as _os
+    import time as _time
+
+    import zmq
+
+    from vllm_omni.plugins import load_omni_general_plugins
+
+    load_omni_general_plugins()
+    # IMPORTANT: Ensure vLLM's internal multiprocessing workers (e.g., GPUARWorker /
+    # GPUARModelRunner) are spawned with a fork-safe method.
+    # Mooncake / gRPC / RDMA and CUDA/NCCL can deadlock under fork-with-threads.
+    if _os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn":
+        _os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+        logger.info("[Stage] Set VLLM_WORKER_MULTIPROC_METHOD=spawn")
+    # Best-effort: also force python mp start method in this stage process.
+    # This may raise if already set; that's fine.
+    try:
+        _mp.set_start_method("spawn", force=True)
+    except RuntimeError:
+        pass
+
+    stage_id = stage_payload["stage_id"]
+    engine_args = stage_payload.get("engine_args", {})
+    runtime_cfg = stage_payload.get("runtime", {})
+    shm_threshold_bytes = int(stage_payload.get("shm_threshold_bytes", 65536))
+    connectors_config = stage_payload.get("connectors_config", {})
+    stage_type: Literal["llm", "diffusion"] = stage_payload.get("stage_type", "llm")
+
+    if stage_type != "diffusion":
+        _resolve_worker_cls(engine_args)
+
+    # Handle non-standard model directory structures (e.g., tokenizer in root, model in subdir)
+    model = _resolve_model_tokenizer_paths(model, engine_args)
+
+    # Resolve ZMQ queue endpoints if needed
+    zmq_ctx = None
+    if isinstance(in_q, str) or isinstance(out_q, str):
+        zmq_ctx = zmq.Context()
+        if isinstance(in_q, str):
+            in_q = create_zmq_queue(zmq_ctx, in_q, zmq.PULL)
+        if isinstance(out_q, str):
+            out_q = create_zmq_queue(zmq_ctx, out_q, zmq.PUSH)
+        # When using ZMQ (cross-node IPC), disable SHM so data is sent inline.
+        shm_threshold_bytes = sys.maxsize
+        logger.info(
+            "[Stage-%s] ZMQ transport detected; disabling SHM IPC (shm_threshold_bytes set to maxsize)",
+            stage_id,
+        )
+
+    # Aggregates for running average
+    _agg_total_tokens = 0
+    _agg_total_gen_time_ms = 0.0
+    # Monotonic batch id per stage process for orchestrator dedup on time aggregation
+    _batch_seq = 0
+
+    # Device mapping
+    device_type = None
+    try:
+        from vllm_omni.platforms import current_omni_platform
+
+        device_type = current_omni_platform.device_type
+        set_stage_devices(stage_id, runtime_cfg.get("devices"), device_type=device_type)
+    except Exception as e:
+        logger.warning("Device setup failed: %s", e)
+
+    # Use sequential init locks only when NVML is unavailable
+    with _sequential_init_lock(engine_args, stage_init_timeout):
+        # Init engine based on stage_type
+        logger.debug(
+            "[Stage-%s] Initializing %s engine with args keys=%s", stage_id, stage_type, list(engine_args.keys())
+        )
+        if engine_args.get("async_chunk", False):
+            logger.debug("[Stage-%s] Async chunk enabled, injecting connectors config", stage_id)
+            stage_connector_spec = {}
+            for v in connectors_config.values():
+                stage_connector_spec = dict(v.get("spec", {}))
+                break
+            engine_args["stage_connector_spec"] = stage_connector_spec
+            engine_args["stage_id"] = stage_id
+        if stage_type == "diffusion":
+            engine_args = filter_dataclass_kwargs(OmniDiffusionConfig, engine_args)
+            engine_args.pop("model_stage", None)
+            engine_args.pop("model", None)
+            stage_engine = OmniDiffusion(
+                model=model,
+                stage_id=stage_id,
+                engine_input_source=stage_payload.get("engine_input_source", []),
+                **engine_args,
+            )
+        else:
+            engine_args = filter_dataclass_kwargs(OmniEngineArgs, engine_args)
+            engine_args.pop("model", None)
+            # Default to LLM engine
+            stage_engine = OmniLLM(model=model, **engine_args)
+
+    logger.debug("Engine initialized")
+    # Initialize OmniConnectors if configured
+    connectors: dict[tuple[str, str], OmniConnectorBase] | None = {}
+    if connectors_config:
+        connectors = build_stage_connectors(
+            stage_id=stage_id,
+            connectors_config=connectors_config,
+        )
+        if connectors is None:
+            return
+
+    # Signal readiness to orchestrator
+    try:
+        out_q.put({"type": "stage_ready", "stage_id": stage_id})
+    except Exception:
+        pass
+
+    max_batch_size = int(runtime_cfg.get("max_batch_size", 1) or 1)
+    logger.info(f"Max batch size: {max_batch_size}")
+
+    def handle_profiler_task_local(task_type: OmniStageTaskType) -> dict:
+        """Handle profiler task locally in the worker process."""
+        if task_type == OmniStageTaskType.PROFILER_START:
+            if stage_type == "diffusion":
+                try:
+                    profile_dir = _os.environ.get("VLLM_TORCH_PROFILER_DIR", "./profiles")
+                    _os.makedirs(profile_dir, exist_ok=True)
+                    trace_filename = f"stage_{stage_id}_diffusion_{int(_time.time())}"
+                    stage_engine.start_profile(trace_filename=trace_filename)
+                    logger.info("[Stage-%s] Diffusion Torch profiler started", stage_id)
+                except Exception as e:
+                    logger.warning("[Stage-%s] Failed to start diffusion profiler: %s", stage_id, e)
+            else:
+                try:
+                    stage_engine.start_profile()
+                    logger.info("[Stage-%s] vLLM profiler started", stage_id)
+                except Exception as e:
+                    logger.warning("[Stage-%s] Failed to start vLLM profiler: %s", stage_id, e)
+            return {}
+
+        elif task_type == OmniStageTaskType.PROFILER_STOP:
+            if stage_type == "diffusion":
+                try:
+                    # CRITICAL: Capture return value
+                    result_data = stage_engine.stop_profile()
+                    logger.info("[Stage-%s] Diffusion Torch profiler stopped", stage_id)
+                    return result_data
+                except Exception as e:
+                    logger.warning("[Stage-%s] Failed to stop diffusion profiler: %s", stage_id, e)
+                    return {}
+            else:
+                try:
+                    stage_engine.stop_profile()
+                    logger.info("[Stage-%s] vLLM profiler stopped", stage_id)
+                except Exception as e:
+                    logger.warning("[Stage-%s] Failed to stop vLLM profiler: %s", stage_id, e)
+                return {}
+        return {}
+
+    # Batch processing loop
+    while True:
+        task = in_q.get()
+
+        _recv_dequeue_ts = _time.time()
+        task_type = task.get("type", OmniStageTaskType.GENERATE)
+        if task_type == OmniStageTaskType.SHUTDOWN:
+            logger.info("Received shutdown signal")
+            break
+
+        # Handle profiler control commands
+        if is_profiler_task(task_type):
+            profiler_data = handle_profiler_task_local(task_type)
+            # If it was a STOP command, we must reply to the Orchestrator
+            if task_type == OmniStageTaskType.PROFILER_STOP:
+                out_q.put({"type": "profiler_result", "data": profiler_data})
+            continue
+
+        batch_tasks: list[dict[str, Any]] = [task]
+        tasks_failed_to_add_to_batch: list[dict[str, Any]] = []
+        start_time = _time.time()
+        if max_batch_size > 1:
+            while len(batch_tasks) < max_batch_size:
+                if not in_q.empty():
+                    extra = in_q.get_nowait()
+                    if extra == SHUTDOWN_TASK:
+                        in_q.put(SHUTDOWN_TASK)
+                        break
+                    # Handle profiler commands that arrive during batching
+                    extra_type = extra.get("type") if isinstance(extra, dict) else None
+                    if is_profiler_task(extra_type):
+                        p_data = handle_profiler_task_local(extra_type)
+                        if extra_type == OmniStageTaskType.PROFILER_STOP:
+                            out_q.put({"type": "profiler_result", "data": p_data})
+                        continue
+                    # Ensure that all tasks have the same sampling params
+                    # If no, put them in a temporary container and add back to queue
+                    # This should be always true, because user only calls omni.generate() once and it blocks
+                    # User can only pass one sampling param object, but the list of prompts are separated.
+                    if task.get("sampling_params") != extra.get("sampling_params"):
+                        logger.warning(
+                            """In offline mode, expect all prompts in one `omni.generate()` call to share same sampling params"""  # noqa: E501 # line too long
+                            f"""However, prompt {task.get("engine_inputs")} has sampling params {task.get("sampling_params")}, """  # noqa: E501 # line too long
+                            f"""whereas the prompt {extra.get("engine_inputs")} has sampling params {extra.get("sampling_params")}."""  # noqa: E501 # line too long
+                            """The two tasks cannot be combined in one batch request."""
+                        )
+                        tasks_failed_to_add_to_batch.append(extra)
+                    else:
+                        batch_tasks.append(extra)
+                    end_time = _time.time()
+                    duration = end_time - start_time
+                    if duration > batch_timeout:
+                        break
+                    else:
+                        continue
+                else:
+                    end_time = _time.time()
+                    duration = end_time - start_time
+                    _time.sleep(0.05)
+                    if duration > batch_timeout:
+                        break
+                    else:
+                        continue
+        for task_to_readd in tasks_failed_to_add_to_batch:
+            in_q.put(task_to_readd)
+        # Ensure that the popped tasks are with identical sampling params. Take one of them.
+        batch_engine_sampling_params: OmniSamplingParams = batch_tasks[0]["sampling_params"]
+
+        batch_request_ids: list[Any] = []
+        batch_engine_inputs: list[OmniPromptType] = []
+        _rx_bytes_by_rid: dict[Any, int] = {}
+        _rx_decode_ms_by_rid: dict[Any, float] = {}
+        _in_flight_ms_by_rid: dict[Any, float] = {}
+        for t in batch_tasks:
+            rid = t["request_id"]
+            try:
+                sent_ts = float(t.get("sent_ts", None)) if isinstance(t, dict) else None
+                if sent_ts is not None:
+                    _in_flight_ms_by_rid[rid] = max(0.0, (_recv_dequeue_ts - sent_ts) * 1000.0)
+                else:
+                    _in_flight_ms_by_rid[rid] = 0.0
+            except Exception:
+                _in_flight_ms_by_rid[rid] = 0.0
+
+            # Resolve input data strictly via connectors if payload
+            # is larger than shm_threshold_bytes or using other connectors
+            ein, _rx_metrics = try_recv_via_connector(
+                task=t,
+                connectors=connectors,
+                stage_id=stage_id,
+            )
+            # TODO: hack type annotation for now.
+            # A better way is to refine type annotation of connection and task/payloads, maybe using template types.
+            ein = cast(OmniPromptType | Sequence[OmniPromptType] | None, ein)
+
+            if ein is None or _rx_metrics is None:
+                raise RuntimeError(
+                    f"[Stage-{stage_id}] Missing connector payload for request {rid}. "
+                    "Ensure connectors are configured for all incoming edges."
+                )
+
+            _rx_decode_ms_by_rid[rid] = float(_rx_metrics.get("rx_decode_time_ms", 0.0))
+            _rx_bytes_by_rid[rid] = int(_rx_metrics.get("rx_transfer_bytes", 0))
+
+            batch_request_ids.append(rid)
+
+            if isinstance(ein, dict | str):
+                # For diffusion stage-0, ein might be a string prompt directly
+                batch_engine_inputs.append(ein)
+            elif isinstance(ein, Sequence):
+                batch_engine_inputs.extend(ein)
+            else:
+                # Other unknown types, append as-is
+                batch_engine_inputs.append(ein)
+        logger.debug(
+            "Received batch size=%d, request_ids=%s",
+            len(batch_tasks),
+            batch_request_ids,
+        )
+        try:
+            _batch_seq += 1
+            gen_outputs: list[OmniRequestOutput | RequestOutput] = []
+            _gen_t0 = _time.time()
+            if stage_type == "diffusion":
+                stage_engine = cast(OmniDiffusion, stage_engine)
+                batch_engine_sampling_params = cast(OmniDiffusionSamplingParams, batch_engine_sampling_params)
+                # Diffusion generate returns results directly, not an iterator
+                diffusion_results = stage_engine.generate(
+                    batch_engine_inputs, batch_engine_sampling_params, batch_request_ids
+                )
+                gen_outputs.extend(diffusion_results)
+                # Assign request_ids if not present
+                for idx, result in enumerate(gen_outputs):
+                    if not hasattr(result, "request_id") or result.request_id is None:
+                        if idx < len(batch_request_ids):
+                            result.request_id = batch_request_ids[idx]
+            else:
+                stage_engine = cast(OmniLLM, stage_engine)
+                batch_engine_sampling_params = cast(SamplingParams, batch_engine_sampling_params)
+                results = stage_engine.generate(
+                    batch_engine_inputs,  # type: ignore # silent complaints about list of subclassed TypedDict
+                    batch_engine_sampling_params,
+                    use_tqdm=False,
+                )
+                gen_outputs.extend(results)
+            _gen_t1 = _time.time()
+            _gen_ms = (_gen_t1 - _gen_t0) * 1000.0
+            logger.debug(f"Generate done: batch={len(batch_tasks)}, req_ids={batch_request_ids}, gen_ms={_gen_ms:.1f}")
+
+            # Group outputs per request id with fallback
+            req_to_outputs: dict[Any, list[Any]] = {rid: [] for rid in batch_request_ids}
+            unmapped: list[Any] = []
+            for ro in gen_outputs:
+                rid = ro.request_id
+                if rid in req_to_outputs:
+                    req_to_outputs[rid].append(ro)
+                else:
+                    unmapped.append(ro)
+            if unmapped:
+                idx = 0
+                for ro in unmapped:
+                    target_rid = batch_request_ids[idx % len(batch_request_ids)]
+                    ro.request_id = target_rid
+                    req_to_outputs[target_rid].append(ro)
+                    idx += 1
+
+            _agg_total_gen_time_ms += _gen_ms
+
+            # Emit per-request results
+            for i, rid in enumerate(batch_request_ids):
+                r_outputs = req_to_outputs.get(rid, [])
+                _metrics = make_request_stats(
+                    r_outputs,
+                    _gen_ms,
+                    int(_batch_seq),
+                    int(len(batch_request_ids)),
+                    float(_rx_decode_ms_by_rid.get(rid, 0.0)),
+                    int(_rx_bytes_by_rid.get(rid, 0)),
+                    float(_in_flight_ms_by_rid.get(rid, 0.0)),
+                )
+                _agg_total_tokens += _metrics.num_tokens_out
+                if i == len(batch_request_ids) - 1:
+                    _metrics.stage_stats = make_stage_stats(_agg_total_tokens, _agg_total_gen_time_ms)
+                else:
+                    _metrics.stage_stats = None
+                try:
+                    use_shm, payload = maybe_dump_to_shm(r_outputs, shm_threshold_bytes)
+                    if use_shm:
+                        out_q.put(
+                            {
+                                "request_id": rid,
+                                "stage_id": stage_id,
+                                "engine_outputs_shm": payload,
+                                "metrics": _metrics,
+                            }
+                        )
+                    else:
+                        out_q.put(
+                            {
+                                "request_id": rid,
+                                "stage_id": stage_id,
+                                "engine_outputs": payload,
+                                "metrics": _metrics,
+                            }
+                        )
+                except Exception:
+                    out_q.put(
+                        {
+                            "request_id": rid,
+                            "stage_id": stage_id,
+                            "engine_outputs": r_outputs,
+                            "metrics": _metrics,
+                        }
+                    )
+                logger.debug(
+                    "Enqueued result for request %s to downstream",
+                    rid,
+                )
+        except Exception as e:
+            logger.exception("Failed on batch %s: %s", batch_request_ids, e)
+            _tb = traceback.format_exc()
+            for rid in batch_request_ids:
+                out_q.put(
+                    {
+                        "request_id": rid,
+                        "stage_id": stage_id,
+                        "error": str(e),
+                        "error_tb": _tb,
+                    }
+                )
+
+
+def _stage_worker_async_entry(
+    model: str,
+    stage_payload: dict[str, Any],
+    in_q: mp.queues.Queue | ZmqQueue | str,
+    out_q: mp.queues.Queue | ZmqQueue | str,
+    batch_timeout: int = 10,
+    stage_init_timeout: int = 300,
+) -> None:
+    """Synchronous entry point: run ``_stage_worker_async`` to completion via ``asyncio.run``."""
+    asyncio.run(_stage_worker_async(model, stage_payload, in_q, out_q, batch_timeout, stage_init_timeout))
+
+
+async def _stage_worker_async(
+    model: str,
+    stage_payload: dict[str, Any],
+    in_q: mp.queues.Queue | ZmqQueue | str,
+    out_q: mp.queues.Queue | ZmqQueue | str,
+    batch_timeout: int = 10,
+    stage_init_timeout: int = 300,
+) -> None:
+    """Run one stage on an asyncio loop: initialize the engine, then serve tasks until SHUTDOWN.
+
+    Builds ``AsyncOmniDiffusion`` when ``stage_type`` is ``"diffusion"`` and ``AsyncOmniLLM`` otherwise,
+    sends a ``stage_ready`` message on ``out_q``, and runs every generate task from ``in_q`` as its own
+    asyncio task, forwarding each output to ``out_q``. ``batch_timeout`` is not used by this function.
+    """
+    # Use local aliases to avoid conflicts with global imports in worker process
+    import multiprocessing as _mp
+    import os as _os
+    import time as _time
+
+    import zmq
+
+    from vllm_omni.plugins import load_omni_general_plugins
+
+    load_omni_general_plugins()
+    # IMPORTANT: Ensure vLLM's internal multiprocessing workers (e.g., GPUARWorker /
+    # GPUARModelRunner) are spawned with a fork-safe method.
+    if _os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn":
+        _os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+        logger.info("[Stage-async] Set VLLM_WORKER_MULTIPROC_METHOD=spawn")
+    try:
+        _mp.set_start_method("spawn", force=True)
+    except RuntimeError:
+        pass
+
+    stage_id = stage_payload["stage_id"]
+    engine_args = stage_payload.get("engine_args", {})
+    runtime_cfg = stage_payload.get("runtime", {})
+    shm_threshold_bytes = int(stage_payload.get("shm_threshold_bytes", 65536))
+    connectors_config = stage_payload.get("connectors_config", {})
+    stage_type = stage_payload.get("stage_type", "llm")
+    final_output = stage_payload.get("final_output", False)
+    final_output_type = stage_payload.get("final_output_type", None)
+
+    # Handle non-standard model directory structures (e.g., tokenizer in root, model in subdir)
+    model = _resolve_model_tokenizer_paths(model, engine_args)
+
+    if stage_type != "diffusion":
+        _resolve_worker_cls(engine_args)
+
+    # Resolve ZMQ queue endpoints if needed
+    zmq_ctx = None
+    if isinstance(in_q, str) or isinstance(out_q, str):
+        zmq_ctx = zmq.Context()
+        if isinstance(in_q, str):
+            in_q = create_zmq_queue(zmq_ctx, in_q, zmq.PULL)
+        if isinstance(out_q, str):
+            out_q = create_zmq_queue(zmq_ctx, out_q, zmq.PUSH)
+        # When using ZMQ (cross-node IPC), disable SHM so data is sent inline.
+        shm_threshold_bytes = sys.maxsize
+        logger.info(
+            "[Stage-%s] ZMQ transport detected; disabling SHM IPC (shm_threshold_bytes set to maxsize)",
+            stage_id,
+        )
+
+    # Aggregates for running average
+    _agg_total_tokens = 0
+    _agg_total_gen_time_ms = 0.0
+    # Monotonic batch id per stage process, used by the orchestrator to dedup time aggregation
+    _batch_seq = 0
+
+    # Device mapping
+    device_type = None
+    try:
+        from vllm_omni.platforms import current_omni_platform
+
+        device_type = current_omni_platform.device_type
+        set_stage_devices(stage_id, runtime_cfg.get("devices"), device_type=device_type)
+    except Exception as e:
+        logger.warning("Device setup failed: %s", e)
+
+    # Initialize OmniConnectors if configured to match sync worker behavior
+    connectors: dict[Any, Any] = {}
+    if connectors_config:
+        built_connectors = build_stage_connectors(
+            stage_id=stage_id,
+            connectors_config=connectors_config,
+        )
+        if built_connectors is None:
+            return
+        connectors = built_connectors
+
+    # Use sequential init locks only when NVML is unavailable
+    with _sequential_init_lock(engine_args, stage_init_timeout):
+        logger.debug(
+            "[Stage-%s] Initializing %s engine with args keys=%s",
+            stage_id,
+            stage_type,
+            list(engine_args.keys()),
+        )
+        if engine_args.get("async_chunk", False):
+            logger.debug("[Stage-%s] Async chunk enabled, injecting connectors config", stage_id)
+            stage_connector_spec = {}
+            for v in connectors_config.values():
+                stage_connector_spec = dict(v.get("spec", {}))
+                break
+            engine_args["stage_connector_spec"] = stage_connector_spec
+            engine_args["stage_id"] = stage_id
+        if stage_type == "diffusion":
+            # For diffusion, we need to extract diffusion-specific config
+            engine_args = filter_dataclass_kwargs(OmniDiffusionConfig, engine_args)
+            od_config = _build_od_config(engine_args, model)
+
+            # Inject omni config for worker to access stage info
+            if "omni_kv_config" not in od_config:
+                od_config["omni_kv_config"] = {}
+            od_config["omni_kv_config"]["stage_id"] = stage_id
+            od_config["omni_kv_config"]["engine_input_source"] = stage_payload.get("engine_input_source", [])
+
+            logger.debug("[Stage-%s] Initializing diffusion engine with config: %s", stage_id, od_config)
+            stage_engine = AsyncOmniDiffusion(
+                model=model,
+                od_config=od_config,
+                **{k: v for k, v in engine_args.items() if k not in {"od_config", "model"}},
+            )
+            vllm_config = None  # Diffusion doesn't use vllm_config
+        else:
+            engine_args = filter_dataclass_kwargs(AsyncOmniEngineArgs, engine_args)
+            engine_args.pop("model", None)
+            omni_engine_args = AsyncOmniEngineArgs(model=model, **engine_args)
+            usage_context = UsageContext.OPENAI_API_SERVER
+            vllm_config = omni_engine_args.create_engine_config(usage_context=usage_context)
+            stage_engine = AsyncOmniLLM.from_vllm_config(
+                vllm_config=vllm_config,
+                usage_context=usage_context,
+                engine_args=omni_engine_args,
+                disable_log_stats=bool(
+                    engine_args.get("disable_log_stats", True) or getattr(omni_engine_args, "disable_log_stats", True)
+                ),
+            )
+    if hasattr(stage_engine, "log_stats") and stage_engine.log_stats:
+
+        async def _force_log():
+            try:
+                while True:
+                    await asyncio.sleep(10.0)
+                    await stage_engine.do_log_stats()
+            except asyncio.CancelledError:
+                pass
+
+        log_stats_task = asyncio.create_task(_force_log())
+    else:
+        log_stats_task = None
+
+    # Don't keep the dummy data in memory (only for LLM engines)
+    if stage_type != "diffusion":
+        await stage_engine.reset_mm_cache()
+    logger.debug("[Stage-%s] Engine initialized", stage_id)
+
+    async def handle_profiler_task_async(task_type: OmniStageTaskType) -> dict:
+        """Handle profiler task asynchronously for both LLM and diffusion stages."""
+        if task_type == OmniStageTaskType.PROFILER_START:
+            if stage_type == "diffusion":
+                try:
+                    profile_dir = _os.environ.get("VLLM_TORCH_PROFILER_DIR", "./profiles")
+                    _os.makedirs(profile_dir, exist_ok=True)
+                    trace_filename = f"stage_{stage_id}_diffusion_{int(_time.time())}"
+                    await stage_engine.start_profile(trace_filename=trace_filename)
+                    logger.info("[Stage-%s] Diffusion Torch profiler started", stage_id)
+                except Exception as e:
+                    logger.warning("[Stage-%s] Failed to start diffusion profiler: %s", stage_id, e)
+            else:
+                try:
+                    await stage_engine.start_profile()
+                    logger.info("[Stage-%s] vLLM profiler started", stage_id)
+                except Exception as e:
+                    logger.warning("[Stage-%s] Failed to start vLLM profiler: %s", stage_id, e)
+            return {}
+
+        elif task_type == OmniStageTaskType.PROFILER_STOP:
+            result_data: dict = {}
+            if stage_type == "diffusion":
+                try:
+                    trace_files = await stage_engine.stop_profile()
+                    logger.info("[Stage-%s] Diffusion Torch profiler stopped", stage_id)
+                    if trace_files:
+                        logger.info("Diffusion trace files: %s", trace_files)
+                        result_data = trace_files
+                except Exception as e:
+                    logger.warning("[Stage-%s] Failed to stop diffusion profiler: %s", stage_id, e)
+            else:
+                try:
+                    await stage_engine.stop_profile()
+                    logger.info("[Stage-%s] vLLM profiler stopped", stage_id)
+                except Exception as e:
+                    logger.warning("[Stage-%s] Failed to stop vLLM profiler: %s", stage_id, e)
+            return result_data
+        return {}
+
+    # Signal readiness to orchestrator and send vllm_config back to main process
+    try:
+        # vllm_config is sent back so the main process can serve get_vllm_config();
+        # the async engine itself is only available in this worker process.
+        stage_ready_payload = {
+            "type": "stage_ready",
+            "stage_id": stage_id,
+            "vllm_config": vllm_config,
+            "tokenizer": getattr(stage_engine, "tokenizer", None),
+        }
+        # Only add is_tracing_enabled for LLM engines
+        if stage_type != "diffusion":
+            stage_ready_payload["is_tracing_enabled"] = await stage_engine.is_tracing_enabled()
+        try:
+            from vllm_omni.entrypoints.stage_utils import serialize_obj
+
+            serialize_obj(stage_ready_payload)
+        except Exception:
+            logger.warning("[Stage-%s] stage_ready_payload not picklable, dropping vllm_config/tokenizer", stage_id)
+            stage_ready_payload = {
+                "type": "stage_ready",
+                "stage_id": stage_id,
+                "is_tracing_enabled": stage_ready_payload.get("is_tracing_enabled"),
+                "vllm_config": None,
+                "tokenizer": None,
+            }
+        out_q.put(stage_ready_payload)
+    except Exception as e:
+        logger.warning("Failed to send stage ready signal: %s", e)
+    generation_out_q = asyncio.Queue()
+    _bg_tasks: set[asyncio.Task] = set()  # strong refs; the event loop only keeps weak ones
+
+    # Per-request receive/transit metrics, keyed by request id
+    _rx_bytes_by_rid: dict[Any, int] = {}
+    _rx_decode_ms_by_rid: dict[Any, float] = {}
+    _in_flight_ms_by_rid: dict[Any, float] = {}
+
+    async def generation_single_request(task: dict[str, Any]):
+        _recv_dequeue_ts = _time.time()
+        rid = task["request_id"]
+        # Best-effort queue transit time; falls back to 0.0 when ``sent_ts`` is absent or unusable.
+        try:
+            _in_flight_ms_by_rid[rid] = max(0.0, (_recv_dequeue_ts - float(task["sent_ts"])) * 1000.0)
+        except Exception:
+            _in_flight_ms_by_rid[rid] = 0.0
+        try:
+            ein, _rx_metrics = try_recv_via_connector(
+                task=task,
+                connectors=connectors,
+                stage_id=stage_id,
+            )
+            # TODO: hack type annotation for now.
+            # A better way is to refine type annotation of connection and task/payloads, maybe using template types.
+            ein = cast(OmniPromptType | Sequence[OmniPromptType] | None, ein)
+
+            if ein is None or _rx_metrics is None:
+                raise RuntimeError(
+                    f"[Stage-{stage_id}] Missing connector payload for request {rid}. "
+                    "Ensure connectors are configured for all incoming edges."
+                )
+            _rx_decode_ms_by_rid[rid] = float(_rx_metrics.get("rx_decode_time_ms", 0.0))
+            _rx_bytes_by_rid[rid] = int(_rx_metrics.get("rx_transfer_bytes", 0))
+
+            logger.debug("Received batch size=1, request_ids=%s", rid)
+            _gen_t0 = _time.time()
+            if isinstance(ein, Sequence) and not isinstance(ein, str):
+                if len(ein) == 0:
+                    logger.info("[Stage-%s] Skipping request %s: no engine inputs", stage_id, rid)
+                    out_q.put({"request_id": rid, "stage_id": stage_id, "skipped": True})
+                    return
+                ein = ein[0]
+
+            if stage_type == "diffusion":
+                diffusion_sampling_params = cast(OmniDiffusionSamplingParams, task["sampling_params"])
+                # AsyncOmniDiffusion.generate returns a single result, not an async generator
+                gen_output = await cast(AsyncOmniDiffusion, stage_engine).generate(ein, diffusion_sampling_params, rid)
+                _gen_t1 = _time.time()
+                _gen_ms = (_gen_t1 - _gen_t0) * 1000.0
+                await generation_out_q.put((rid, gen_output, _gen_ms))
+            else:
+                ein = cast(PromptType, ein)
+                llm_sampling_params: SamplingParams = task["sampling_params"]
+                gen_output = None
+                async for res in cast(AsyncLLM, stage_engine).generate(ein, llm_sampling_params, rid):
+                    gen_output = res
+                    _gen_t1 = _time.time()
+                    _gen_ms = (_gen_t1 - _gen_t0) * 1000.0
+                    _gen_t0 = _gen_t1
+                    await generation_out_q.put((rid, gen_output, _gen_ms))
+        except Exception as e:
+            logger.exception("Failed on request %s: %s", rid, e)
+            out_q.put(
+                {
+                    "request_id": rid,
+                    "stage_id": stage_id,
+                    "error": str(e),
+                }
+            )
+
+    _batch_gen_t0 = _time.time()
+    while True:
+        try:
+            task = in_q.get_nowait()
+            task_type = task.get("type", OmniStageTaskType.GENERATE)
+            if task_type == OmniStageTaskType.SHUTDOWN:
+                logger.debug("Received shutdown signal")
+                stage_engine.shutdown()
+                break
+            elif task_type == OmniStageTaskType.ABORT:
+                rid = task["request_id"]
+                _bg_tasks.add(_abort_task := asyncio.create_task(stage_engine.abort(rid)))
+                _abort_task.add_done_callback(_bg_tasks.discard)
+            elif is_profiler_task(task_type):
+                profiler_data = await handle_profiler_task_async(task_type)
+                # Send result back to orchestrator for STOP command
+                if task_type == OmniStageTaskType.PROFILER_STOP:
+                    out_q.put({"type": "profiler_result", "data": profiler_data})
+            else:
+                _bg_tasks.add(_gen_task := asyncio.create_task(generation_single_request(task)))
+                _gen_task.add_done_callback(_bg_tasks.discard)
+        except queue.Empty:
+            await asyncio.sleep(0.001)
+        batch_request_outputs: list[Any] = []
+        batch_request_ids: list[Any] = []
+        _gen_ms_list = []
+        batch_metrics: list[Any] = []
+        while True:
+            try:
+                rid, gen_output, _gen_ms = generation_out_q.get_nowait()
+                _metrics = make_request_stats(
+                    [gen_output],
+                    _gen_ms,
+                    int(_batch_seq),
+                    1,  # placeholder; overwritten with the real batch size below
+                    float(_rx_decode_ms_by_rid.get(rid, 0.0)),
+                    int(_rx_bytes_by_rid.get(rid, 0)),
+                    float(_in_flight_ms_by_rid.get(rid, 0.0)),
+                )
+                batch_metrics.append(_metrics)
+                batch_request_outputs.append(gen_output)
+                _gen_ms_list.append(_gen_ms)
+                batch_request_ids.append(rid)
+                _agg_total_tokens += _metrics.num_tokens_out
+            except asyncio.QueueEmpty:
+                await asyncio.sleep(0.001)
+                break
+
+        if not batch_request_outputs:
+            continue
+        _batch_seq += 1
+
+        _batch_gen_t1 = _time.time()
+        _agg_total_gen_time_ms += (_batch_gen_t1 - _batch_gen_t0) * 1000
+        _batch_gen_t0 = _batch_gen_t1
+        for idx, metrics in enumerate(batch_metrics):
+            metrics.batch_size = len(batch_metrics)
+            if idx == len(batch_metrics) - 1:
+                metrics.stage_stats = make_stage_stats(_agg_total_tokens, _agg_total_gen_time_ms)
+
+        logger.debug("Sending outputs to main process")
+        for rid, output, _gen_ms, _metrics in zip(
+            batch_request_ids, batch_request_outputs, _gen_ms_list, batch_metrics
+        ):
+            try:
+                r_outputs = [output_strip(output, final_output, final_output_type)]
+                use_shm, payload = maybe_dump_to_shm(r_outputs, shm_threshold_bytes)
+                if use_shm:
+                    out_q.put(
+                        {
+                            "request_id": rid,
+                            "stage_id": stage_id,
+                            "engine_outputs_shm": payload,
+                            "metrics": _metrics,
+                        }
+                    )
+                else:
+                    out_q.put(
+                        {
+                            "request_id": rid,
+                            "stage_id": stage_id,
+                            "engine_outputs": payload,
+                            "metrics": _metrics,
+                        }
+                    )
+                logger.debug("Enqueued req=%s, use_shm=%s, tokens_out=%s", rid, use_shm, _metrics.num_tokens_out)
+            except Exception as e:
+                logger.exception(
+                    "Failed to enqueue result for request %s: %s",
+                    rid,
+                    e,
+                )
+                out_q.put(
+                    {
+                        "request_id": rid,
+                        "stage_id": stage_id,
+                        "engine_outputs": [output],  # r_outputs may be unbound or stale if output_strip raised
+                        "metrics": _metrics,
+                    }
+                )
+            logger.debug("Enqueued result for request %s to downstream", rid)
+    if log_stats_task is not None:
+        log_stats_task.cancel()
+    logger.info("Stage worker exiting")
+
+
+def count_prompt_tokens_from_outputs(engine_outputs: list[Any]) -> int:
+    """Sum ``len(prompt_token_ids)`` over the outputs; entries without usable ids add nothing."""
+    n_prompt = 0
+    for item in engine_outputs:
+        try:
+            ids = getattr(item, "prompt_token_ids", None)
+            n_prompt += 0 if ids is None else len(ids)
+        except Exception:
+            # Best effort: an entry whose ids cannot be read or measured is ignored.
+            continue
+    return n_prompt
+
+
+def make_request_stats(
+    req_output: list[Any],
+    stage_gen_time_ms: float,
+    batch_id: int,
+    batch_size: int,
+    rx_decode_time_ms: float,
+    rx_transfer_bytes: int,
+    rx_in_flight_time_ms: float,
+):
+    """Build a ``StageRequestStats`` for one request; token counts are computed from ``req_output``."""
+    from vllm_omni.metrics import StageRequestStats
+
+    return StageRequestStats(
+        num_tokens_in=count_prompt_tokens_from_outputs(req_output),
+        num_tokens_out=count_tokens_from_outputs(req_output),
+        stage_gen_time_ms=stage_gen_time_ms,
+        batch_id=batch_id,
+        batch_size=batch_size,
+        rx_decode_time_ms=rx_decode_time_ms,
+        rx_transfer_bytes=rx_transfer_bytes,
+        rx_in_flight_time_ms=rx_in_flight_time_ms,
+        stage_stats=None,
+    )
+
+
+def make_stage_stats(_agg_total_tokens: int, _agg_total_gen_time_ms: float):
+    """Wrap the given token and generation-time totals in a ``StageStats``."""
+    from vllm_omni.metrics import StageStats
+
+    return StageStats(total_token=_agg_total_tokens, total_gen_time_ms=_agg_total_gen_time_ms)
+
+
+def output_strip(r_output: RequestOutput | OmniRequestOutput, final_output: bool, final_output_type: str | None):
+    """
+    Clear multimodal payloads on a stage result that does not need to carry them.
+
+    ``r_output`` is mutated in place and returned. It is returned unchanged when
+    the stage is a final output whose type is not ``"text"``, or when the result's
+    ``finished`` attribute is truthy. Otherwise the multimodal fields are replaced
+    by ``{}``, which cuts memory use and transfer / serialization overhead.
+    """
+    # Early exits: nothing is stripped in these two cases.
+    needs_multimodal = final_output and final_output_type != "text"
+    if needs_multimodal or getattr(r_output, "finished", False):
+        return r_output
+
+    # Top-level payload: cleared whenever the attribute exists and is not None.
+    if getattr(r_output, "multimodal_output", None) is not None:
+        r_output.multimodal_output = {}
+
+    # Per-completion payloads: only truthy (non-empty) ones are cleared.
+    completions = getattr(r_output, "outputs", None)
+    if completions is None:
+        return r_output
+    for completion in completions:
+        if getattr(completion, "multimodal_output", None):
+            completion.multimodal_output = {}
+
+    return r_output
diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index 39fcbc9a0aa..5e40d4c263b 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -6,13 +6,14 @@
 from collections.abc import AsyncGenerator, AsyncIterator, Callable
 from datetime import datetime, timedelta, timezone
 from io import BytesIO
-from typing import Any, Final, cast
+from typing import TYPE_CHECKING, Any, Final, Optional, cast
 
 import jinja2
 import torch
 from fastapi import Request
 from PIL import Image
 from pydantic import TypeAdapter
+from vllm.renderers import BaseRenderer
 
 from vllm_omni.entrypoints.async_omni import AsyncOmni
 from vllm_omni.entrypoints.openai.protocol.chat_completion import OmniChatCompletionResponse
@@ -62,11 +63,11 @@
 from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
 from vllm.entrypoints.openai.utils import maybe_filter_parallel_tool_calls
 from vllm.entrypoints.utils import should_include_usage
-from vllm.inputs import PromptType
+from vllm.inputs.data import PromptType
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
 from vllm.reasoning import ReasoningParser
-from vllm.renderers import BaseRenderer, merge_kwargs
+from vllm.renderers import merge_kwargs
 from vllm.renderers.inputs import TokPrompt
 from vllm.sampling_params import SamplingParams
 from vllm.tokenizers import TokenizerLike
@@ -82,7 +83,6 @@
 from vllm.utils.collection_utils import as_list
 
 from vllm_omni.entrypoints.openai.audio_utils_mixin import AudioMixin
-from vllm_omni.entrypoints.openai.image_api_utils import validate_layered_layers
 from vllm_omni.entrypoints.openai.protocol import OmniChatCompletionStreamResponse
 from vllm_omni.entrypoints.openai.protocol.audio import AudioResponse, CreateAudio
 from vllm_omni.entrypoints.openai.utils import (
@@ -91,8 +91,12 @@
     validate_requested_speaker,
 )
 from vllm_omni.lora.request import LoRARequest
+from vllm_omni.lora.utils import stable_lora_int_id
 from vllm_omni.outputs import OmniRequestOutput
 
+if TYPE_CHECKING:
+    from vllm_omni.entrypoints.async_omni_diffusion import AsyncOmniDiffusion
+
 logger = init_logger(__name__)
 
 
@@ -108,14 +112,14 @@ class OmniOpenAIServingChat(OpenAIServingChat, AudioMixin):
 
     # Diffusion mode attributes
     _diffusion_mode: bool = False
-    _diffusion_engine: AsyncOmni | None = None
+    _diffusion_engine: Optional["AsyncOmniDiffusion"] = None
     _diffusion_model_name: str = ""
     _supported_speakers: set[str] | None = None
 
     @classmethod
     def for_diffusion(
         cls,
-        diffusion_engine: AsyncOmni,
+        diffusion_engine: "AsyncOmniDiffusion",
         model_name: str,
     ) -> "OmniOpenAIServingChat":
         """Create a chat serving instance for diffusion models.
@@ -271,9 +275,7 @@ async def create_chat_completion(
                 )
             else:
                 should_include_tools = tool_dicts is not None
-                conversation, engine_prompts = self.openai_serving_render._make_request_with_harmony(
-                    request, should_include_tools
-                )
+                conversation, engine_prompts = self._make_request_with_harmony(request, should_include_tools)
 
         except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e:
             logger.exception("Error in preprocessing prompt inputs")
@@ -282,6 +284,26 @@ async def create_chat_completion(
                 message = f"{message} {e.__cause__}"
             return self.create_error_response(message)
 
+        # Zero-shot TTS: extract last input_audio bytes as speaker reference for S2S.
+        _ref_audio_b64 = None
+        for _msg in request.messages:
+            _content = getattr(_msg, "content", None) or (_msg.get("content") if isinstance(_msg, dict) else None)
+            if isinstance(_content, list):
+                for _part in _content:
+                    _ptype = _part.get("type") if isinstance(_part, dict) else getattr(_part, "type", None)
+                    if _ptype == "input_audio":
+                        _ia = (
+                            _part.get("input_audio") if isinstance(_part, dict) else getattr(_part, "input_audio", None)
+                        )
+                        if _ia is not None:
+                            _data = _ia.get("data") if isinstance(_ia, dict) else getattr(_ia, "data", None)
+                            if _data:
+                                _ref_audio_b64 = _data
+        if _ref_audio_b64 is not None:
+            for _ep in engine_prompts:
+                if isinstance(_ep, dict):
+                    _ep["ref_audio_b64"] = _ref_audio_b64
+
         request_id = f"chatcmpl-{self._base_request_id(raw_request, request.request_id)}"
 
         request_metadata = RequestResponseMetadata(request_id=request_id)
@@ -293,13 +315,23 @@ async def create_chat_completion(
             output_modalities if output_modalities is not None else self.engine_client.output_modalities
         )
 
-        num_inference_steps = None
         # Omni multistage image generation: Stage-0 (AR) should receive a clean
         # text prompt (and optional conditioning image/size) so the model's own
         # processor can construct the correct inputs.
         # If we pass pre-tokenized chat-template ids, GLM-Image can become
         # effectively unconditioned and produce nonsense images.
-        if request.modalities and ("image" in request.modalities):
+        # Skip if audio input is present (A2T/S2T needs full chat template prompt).
+        # Skip if Stage-0 is an LLM (e.g. HCX Omni thinker) - it needs the
+        # pre-tokenized chat-template prompt, not a bare text prompt.
+        _has_audio_input = any("audio" in (ep.get("multi_modal_data") or {}) for ep in engine_prompts)
+        _stage0_is_llm = (
+            hasattr(self.engine_client, "stage_list")
+            and self.engine_client.stage_list
+            and getattr(self.engine_client.stage_list[0], "stage_type", None) == "llm"
+        )
+        _wants_image = request.modalities and ("image" in request.modalities)
+        _no_continue = not getattr(request, "continue_final_message", False)
+        if _wants_image and _no_continue and not _has_audio_input and not _stage0_is_llm:
             try:
                 messages_as_dicts: list[dict[str, Any]] = []
                 for msg in request.messages:
@@ -318,21 +350,9 @@ async def create_chat_completion(
                 if not extracted_prompt:
                     return self.create_error_response("No text prompt found in messages")
 
-                # [NOTE] When sending request via openai client Python library,
-                #   `extra_body` is flattented and merged into the payload's root.
-                #   These extra fields are accessible via `model_extra` property (from Pydantic base class).
-                #   When sending raw request with curl, no flattening happens. Directly read the `extra_body` dict.
-                extra_body = getattr(request, "extra_body", None)
-                if not extra_body:
-                    extra_body = request.model_extra or {}
+                extra_body = getattr(request, "extra_body", None) or {}
                 height = extra_body.get("height")
                 width = extra_body.get("width")
-                num_inference_steps = extra_body.get("num_inference_steps")
-                if num_inference_steps is not None:
-                    try:
-                        num_inference_steps = int(num_inference_steps)
-                    except Exception:
-                        num_inference_steps = None
                 if "size" in extra_body:
                     try:
                         size_str = extra_body["size"]
@@ -344,23 +364,17 @@ async def create_chat_completion(
                 negative_prompt = extra_body.get("negative_prompt")
 
                 engine_prompt_image: dict[str, Any] | None = None
-                is_img2img = False
                 if reference_images:
                     # Best-effort decode first reference image for i2i.
                     try:
                         img_bytes = base64.b64decode(reference_images[0])
                         img = Image.open(BytesIO(img_bytes))
-                        engine_prompt_image = {"img2img": img}
-                        is_img2img = True
+                        engine_prompt_image = {"image": img}
                     except Exception:
                         engine_prompt_image = None
 
                 # Override the prompts produced by chat-template preprocessing.
                 tprompt: OmniTextPrompt = {"prompt": extracted_prompt}
-                if is_img2img:
-                    tprompt["modalities"] = ["img2img"]
-                else:
-                    tprompt["modalities"] = ["image"]
                 if negative_prompt is not None:
                     tprompt["negative_prompt"] = negative_prompt
                 # GLM-Image's _call_hf_processor expects target_h/target_w in mm_processor_kwargs
@@ -396,15 +410,14 @@ async def create_chat_completion(
                     # Use standard OpenAI API parameters for comprehension stage
                     sampling_params_list = self._build_sampling_params_list_from_request(request)
 
-                # Apply user-specified overrides to diffusion stage(s) for image generation
-                if _image_gen_height is not None or _image_gen_width is not None or num_inference_steps is not None:
+                # Apply user-specified height/width to diffusion stage(s) for image generation
+                if _image_gen_height is not None or _image_gen_width is not None:
                     for idx, sp in enumerate(sampling_params_list):
+                        # Diffusion stages typically have height/width attributes
                         if hasattr(sp, "height") and _image_gen_height is not None:
                             sp.height = _image_gen_height
                         if hasattr(sp, "width") and _image_gen_width is not None:
                             sp.width = _image_gen_width
-                        if hasattr(sp, "num_inference_steps") and num_inference_steps is not None:
-                            sp.num_inference_steps = num_inference_steps
 
                 self._log_inputs(
                     request_id,
@@ -493,15 +506,6 @@ async def _preprocess_chat(
             default_template_content_format,
         ).with_defaults(default_template_kwargs)
 
-        # OMNI: When use_audio_in_video=True, the qwen2_5_omni_thinker mm
-        # processor asserts that audio items are present alongside video items
-        # (inside render_chat_async).  We must inject audio_url items into the
-        # messages BEFORE calling render_chat_async so the mm processor can
-        # count them correctly during tokenisation.
-        mm_proc_kw = getattr(request, "mm_processor_kwargs", None) or {}
-        if mm_proc_kw.get("use_audio_in_video", False):
-            messages = await self._inject_audio_from_video_urls(messages)
-
         (conversation,), (engine_prompt,) = await renderer.render_chat_async(
             [messages],
             chat_params,
@@ -511,6 +515,28 @@ async def _preprocess_chat(
             },
         )
 
+        # OMNI: When use_audio_in_video=True, the upstream renderer does not
+        # extract audio from video.  We do it here after rendering so that the
+        # audio data is present in multi_modal_data before the engine processes
+        # the request.
+        mm_proc_kw = getattr(request, "mm_processor_kwargs", None) or {}
+        if mm_proc_kw.get("use_audio_in_video", False) and isinstance(engine_prompt, dict):
+            mm_data = engine_prompt.get("multi_modal_data")
+            if mm_data is not None and "video" in mm_data and "audio" not in mm_data:
+                from vllm_omni.entrypoints.chat_utils import extract_audio_from_video_async
+
+                video_urls: list[str] = []
+                for msg in messages:
+                    content = msg.get("content") if isinstance(msg, dict) else getattr(msg, "content", None)
+                    for part in content if isinstance(content, list) else []:
+                        if isinstance(part, dict) and part.get("type") == "video_url":
+                            if url := (part.get("video_url") or {}).get("url"):
+                                video_urls.append(url)
+
+                if video_urls:
+                    audios = await asyncio.gather(*(extract_audio_from_video_async(u) for u in video_urls))
+                    engine_prompt.setdefault("multi_modal_data", {})["audio"] = list(audios)
+
         tokenizer = renderer.get_tokenizer()
 
         # tool parsing is done only if a tool_parser has been set and if
@@ -530,11 +556,12 @@ async def _preprocess_chat(
             )
 
         # Preserve a clean text prompt for downstream stages (e.g., GLM-Image diffusion).
-        # For image generation, we want the raw user caption instead of a rendered template.
-        # But for multimodal comprehension (img2text), we MUST keep the rendered prompt
-        # containing image tokens.
-        req_modalities = getattr(request, "modalities", [])
-        if req_modalities and ("image" in req_modalities):
+        # For /v1/chat/completions, `request_prompt` is often the rendered chat template.
+        # Diffusion models generally want the raw user caption instead.
+        # Skip for continue_final_message, or when audio is already in mm_data (A2T needs the full chat template).
+        output_modalities = getattr(self.engine_client, "output_modalities", None)
+        _has_audio_mm = "audio" in (engine_prompt.get("multi_modal_data") or {})
+        if output_modalities and ("image" in output_modalities) and not continue_final_message and not _has_audio_mm:
             messages_as_dicts: list[dict[str, Any]] = []
             for msg in messages:
                 if hasattr(msg, "model_dump"):
@@ -574,95 +601,6 @@ async def _preprocess_chat(
 
         return conversation, [engine_prompt]
 
-    async def _inject_audio_from_video_urls(
-        self,
-        messages: list[ChatCompletionMessageParam],
-    ) -> list[ChatCompletionMessageParam]:
-        """Pre-extract audio from video URLs and inject as audio_url content items.
-
-        When use_audio_in_video=True, the qwen2_5_omni_thinker multimodal
-        processor requires that the number of audio items equals the number of
-        video items (it subtracts mm_counts["video"] from mm_counts["audio"]).
-        The client only sends video_url items; this method adds the matching
-        audio_url items on the server side before the renderer processes them.
-        """
-        import io
-
-        from vllm_omni.entrypoints.chat_utils import extract_audio_from_video_async
-
-        new_messages: list[ChatCompletionMessageParam] = []
-        for msg in messages:
-            content = msg.get("content") if isinstance(msg, dict) else getattr(msg, "content", None)
-            if not isinstance(content, list):
-                new_messages.append(msg)
-                continue
-
-            video_urls = [
-                part.get("video_url", {}).get("url")
-                for part in content
-                if isinstance(part, dict) and part.get("type") == "video_url" and part.get("video_url", {}).get("url")
-            ]
-
-            if not video_urls:
-                new_messages.append(msg)
-                continue
-
-            audios = await asyncio.gather(*(extract_audio_from_video_async(u) for u in video_urls))
-
-            audio_items: list[dict] = []
-            for audio_array, sample_rate in audios:
-                buf = io.BytesIO()
-                if soundfile is not None:
-                    soundfile.write(buf, audio_array, samplerate=int(sample_rate), format="WAV")
-                else:
-                    import struct
-
-                    import numpy as np
-
-                    audio_np = np.asarray(audio_array, dtype=np.float32)
-                    sr = int(sample_rate)
-                    num_channels = 1
-                    bits_per_sample = 32
-                    num_frames = len(audio_np)
-                    data_size = num_frames * num_channels * (bits_per_sample // 8)
-                    # Write minimal RIFF/WAV header
-                    buf.write(b"RIFF")
-                    buf.write(struct.pack("<I", 36 + data_size))
-                    buf.write(b"WAVE")
-                    buf.write(b"fmt ")
-                    buf.write(
-                        struct.pack(
-                            "<IHHIIHH",
-                            16,
-                            3,
-                            num_channels,
-                            sr,
-                            sr * num_channels * (bits_per_sample // 8),
-                            num_channels * (bits_per_sample // 8),
-                            bits_per_sample,
-                        )
-                    )
-                    buf.write(b"data")
-                    buf.write(struct.pack("<I", data_size))
-                    buf.write(audio_np.tobytes())
-
-                audio_b64 = base64.b64encode(buf.getvalue()).decode()
-                audio_items.append(
-                    {
-                        "type": "audio_url",
-                        "audio_url": {"url": f"data:audio/wav;base64,{audio_b64}"},
-                    }
-                )
-
-            new_content = list(content) + audio_items
-            if isinstance(msg, dict):
-                new_msg = {**msg, "content": new_content}
-            else:
-                new_msg = msg.model_copy(update={"content": new_content})
-            new_messages.append(new_msg)
-
-        return new_messages
-
     def _to_sampling_params_list(self, sampling_params_list: list[dict]) -> list[SamplingParams]:
         final_sampling_params_list = []
         for sampling_params in sampling_params_list:
@@ -675,10 +613,10 @@ def _to_sampling_params_list(self, sampling_params_list: list[dict]) -> list[Sam
         return final_sampling_params_list
 
     def _get_comprehension_stage_index(self) -> int:
-        for idx, stage in enumerate(self.engine_client.stage_configs):
+        for idx, stage in enumerate(self.engine_client.stage_list):
             if stage.is_comprehension:
                 return idx
-        raise ValueError("No comprehension stage (is_comprehension=True) found in stage configs")
+        raise ValueError("No comprehension stage (is_comprehension=True) found in stage_list")
 
     # OpenAI API standard sampling parameters that can be safely overridden.
     # These are the most commonly used parameters with compatible types
@@ -839,7 +777,7 @@ async def chat_completion_stream_generator(
         # Prepare the tool parser if it's needed
         try:
             if tool_choice_auto and self.tool_parser:
-                tool_parsers: list[ToolParser | None] = [self.tool_parser(tokenizer, request.tools)] * num_choices
+                tool_parsers: list[ToolParser | None] = [self.tool_parser(tokenizer)] * num_choices
             else:
                 tool_parsers = [None] * num_choices
         except Exception as e:
@@ -1343,21 +1281,11 @@ async def chat_completion_stream_generator(
                                     latest_delta_len = len(delta_message.tool_calls[0].function.arguments)
 
                                 # get the expected call based on partial JSON
-                                # parsing which "autocompletes" the JSON.
-                                # Tool parsers (e.g. Qwen3Coder) store
-                                # arguments as a JSON string in
-                                # prev_tool_call_arr. Calling json.dumps()
-                                # on an already-serialized string would
-                                # double-serialize it (e.g. '{"k":1}' becomes
-                                # '"{\\"k\\":1}"'), which then causes the
-                                # replace() below to fail and append the
-                                # entire double-serialized string as a
-                                # spurious final delta.
-                                args = tool_parser.prev_tool_call_arr[index].get("arguments", {})
-                                if isinstance(args, str):
-                                    expected_call = args
-                                else:
-                                    expected_call = json.dumps(args, ensure_ascii=False)
+                                # parsing which "autocompletes" the JSON
+                                expected_call = json.dumps(
+                                    tool_parser.prev_tool_call_arr[index].get("arguments", {}),
+                                    ensure_ascii=False,
+                                )
 
                                 # get what we've streamed so far for arguments
                                 # for the current tool
@@ -1428,22 +1356,71 @@ async def chat_completion_stream_generator(
 
                 elif final_output_type == "audio":
                     role = self.get_chat_request_role(request)
-                    choices_data = self._create_audio_choice(omni_res, role, request, stream=True)
-                    chunk = OmniChatCompletionStreamResponse(
-                        id=request_id,
-                        object=chunk_object_type,
-                        created=created_time,
-                        choices=choices_data,
-                        model=model_name,
-                        modality=final_output_type,
-                    )
-                    chunk.usage = UsageInfo(
-                        prompt_tokens=num_prompt_tokens,
-                        completion_tokens=0,
-                        total_tokens=num_prompt_tokens,
-                    )
-                    data = chunk.model_dump_json(exclude_unset=True)
-                    yield f"data: {data}\n\n"
+                    # Stream audio as PCM chunks (200ms each @ 24kHz).
+                    # BigVGAN decoder is non-causal: full audio is generated
+                    # first, then split for streaming delivery to the client.
+                    import numpy as np
+
+                    _AUDIO_CHUNK_SAMPLES = 4800  # 200ms @ 24kHz
+                    _final_res = omni_res.request_output
+                    if _final_res is not None and _final_res.outputs:
+                        _audio_data = _final_res.outputs[0].multimodal_output.get("audio")
+                    else:
+                        _audio_data = omni_res.multimodal_output.get("audio")
+                    # Normalize audio to a float32 tensor for uniform PCM chunking.
+                    # HyperCLOVAXAudioPipeline returns WAV bytes (with header); parse to strip it.
+                    # Qwen3-Omni returns float tensors directly, so soundfile is imported only for bytes.
+                    if isinstance(_audio_data, bytes):
+                        import io as _io
+
+                        import soundfile as _sf
+
+                        _arr, _ = _sf.read(_io.BytesIO(_audio_data))
+                        if _arr.ndim > 1:
+                            _arr = _arr.mean(axis=1)
+                        _audio_tensor = torch.from_numpy(_arr.astype(np.float32))
+                    elif isinstance(_audio_data, list):
+                        _audio_tensor = torch.cat(_audio_data, dim=-1).float().detach().cpu()
+                    else:
+                        _audio_tensor = _audio_data.float().detach().cpu()
+                    _audio_tensor = _audio_tensor.flatten()
+                    _chunks = list(torch.split(_audio_tensor, _AUDIO_CHUNK_SAMPLES))
+                    _stream_outputs = _final_res.outputs if (_final_res is not None and _final_res.outputs) else [None]
+                    for _chunk_idx, _wav_chunk in enumerate(_chunks):
+                        _pcm = (_wav_chunk.numpy() * 32767.0).clip(-32768, 32767).astype(np.int16)
+                        _pcm_b64 = base64.b64encode(_pcm.tobytes()).decode("ascii")
+                        _is_last_chunk = _chunk_idx == len(_chunks) - 1
+                        _stream_choices = []
+                        for _so_idx, output in enumerate(_stream_outputs):
+                            _stream_choices.append(
+                                ChatCompletionResponseStreamChoice(
+                                    index=output.index if output is not None else _so_idx,
+                                    delta=DeltaMessage(
+                                        role=role if _chunk_idx == 0 else None,
+                                        content=_pcm_b64,
+                                    ),
+                                    logprobs=None,
+                                    finish_reason="stop" if _is_last_chunk else None,
+                                    stop_reason=(
+                                        output.stop_reason if (output is not None and _is_last_chunk) else None
+                                    ),
+                                )
+                            )
+                        _audio_chunk_resp = OmniChatCompletionStreamResponse(
+                            id=request_id,
+                            object=chunk_object_type,
+                            created=created_time,
+                            choices=_stream_choices,
+                            model=model_name,
+                            modality="audio",
+                        )
+                        if _is_last_chunk:
+                            _audio_chunk_resp.usage = UsageInfo(
+                                prompt_tokens=num_prompt_tokens,
+                                completion_tokens=0,
+                                total_tokens=num_prompt_tokens,
+                            )
+                        yield f"data: {_audio_chunk_resp.model_dump_json(exclude_unset=True)}\n\n"
 
                 else:
                     logger.warning(f"Unsupported streaming final output type: {final_output_type}")
@@ -1660,12 +1637,12 @@ def _create_text_choice(
                 logprobs = None
 
             if self.use_harmony:
-                reasoning, content, _ = parse_chat_output(token_ids)
+                reasoning_content, content, _ = parse_chat_output(token_ids)
                 if not request.include_reasoning:
-                    reasoning = None
+                    reasoning_content = None
 
                 if self.tool_parser is not None:
-                    tool_parser = self.tool_parser(tokenizer, request.tools)
+                    tool_parser = self.tool_parser(tokenizer)
                     # NOTE: We use token_ids for openai tool parser
                     tool_call_info = tool_parser.extract_tool_calls(
                         "",
@@ -1675,14 +1652,14 @@ def _create_text_choice(
                     content = tool_call_info.content
                     message = ChatMessage(
                         role=role,
-                        reasoning=reasoning,
+                        reasoning_content=reasoning_content,
                         content=content,
                         tool_calls=tool_call_info.tool_calls,
                     )
                 else:
                     message = ChatMessage(
                         role=role,
-                        reasoning=reasoning,
+                        reasoning_content=reasoning_content,
                         content=content,
                     )
 
@@ -1703,11 +1680,11 @@ def _create_text_choice(
             if reasoning_parser:
                 # If the reasoning parser is enabled,
                 # tool calls are extracted exclusively from the content.
-                reasoning, content = reasoning_parser.extract_reasoning(output.text, request=request)
+                reasoning_content, content = reasoning_parser.extract_reasoning(output.text, request=request)
                 if not request.include_reasoning:
-                    reasoning = None
+                    reasoning_content = None
             else:
-                reasoning = None
+                reasoning_content = None
                 content = output.text
 
             auto_tools_called = False
@@ -1717,14 +1694,14 @@ def _create_text_choice(
                 not isinstance(request.tool_choice, ChatCompletionNamedToolChoiceParam)
                 and request.tool_choice != "required"
             ):
-                message = ChatMessage(role=role, reasoning=reasoning, content=content)
+                message = ChatMessage(role=role, reasoning_content=reasoning_content, content=content)
 
             # if the request uses tools and specified a tool choice
             elif request.tool_choice and type(request.tool_choice) is ChatCompletionNamedToolChoiceParam:
                 tool_call_class = MistralToolCall if isinstance(tokenizer, MistralTokenizer) else ToolCall
                 message = ChatMessage(
                     role=role,
-                    reasoning=reasoning,
+                    reasoning_content=reasoning_content,
                     content="",
                     tool_calls=[
                         tool_call_class(
@@ -1766,13 +1743,13 @@ def _create_text_choice(
                         )
                         for i, tool_call in enumerate(tool_calls)
                     ],
-                    reasoning=reasoning,
+                    reasoning_content=reasoning_content,
                 )
 
             # if the request doesn't use tool choice
             # OR specifies to not use a tool
             elif not request.tool_choice or request.tool_choice == "none":
-                message = ChatMessage(role=role, reasoning=reasoning, content=content)
+                message = ChatMessage(role=role, reasoning_content=reasoning_content, content=content)
 
             # handle when there are tools and tool choice is auto
             elif (
@@ -1782,7 +1759,7 @@ def _create_text_choice(
                 and self.tool_parser
             ):
                 try:
-                    tool_parser = self.tool_parser(tokenizer, request.tools)
+                    tool_parser = self.tool_parser(tokenizer)
                 except RuntimeError as e:
                     logger.exception("Error in tool parser creation.")
                     return self.create_error_response(e)
@@ -1795,7 +1772,7 @@ def _create_text_choice(
                 if tool_call_info.tools_called:
                     message = ChatMessage(
                         role=role,
-                        reasoning=reasoning,
+                        reasoning_content=reasoning_content,
                         content=tool_call_info.content,
                         tool_calls=tool_call_info.tool_calls,
                     )
@@ -1811,7 +1788,7 @@ def _create_text_choice(
                         ret_content = tool_call_info.content
                     message = ChatMessage(
                         role=role,
-                        reasoning=reasoning,
+                        reasoning_content=reasoning_content,
                         content=ret_content,
                     )
 
@@ -1821,7 +1798,7 @@ def _create_text_choice(
                     "Error in chat_completion_full_generator - cannot determine if tools should be extracted. "
                     "Returning a standard chat completion."
                 )
-                message = ChatMessage(role=role, reasoning=reasoning, content=content)
+                message = ChatMessage(role=role, reasoning_content=reasoning_content, content=content)
 
             choice_data = ChatCompletionResponseChoice(
                 index=output.index,
@@ -1870,33 +1847,42 @@ def _create_audio_choice(
     ):
         choices: list[ChatCompletionResponseChoice] = []
         final_res = omni_outputs.request_output
-        # OMNI: Access multimodal_output from CompletionOutput (outputs[0]), not from RequestOutput
-        # Reference: examples/offline_inference/qwen3_omni/end2end.py line 421
-        audio_data = final_res.outputs[0].multimodal_output.get("audio")
-        if isinstance(audio_data, list):
-            if stream:
-                audio_tensor = audio_data[-1]
-            else:
-                audio_tensor = torch.cat(audio_data, dim=-1)
+        # HyperCLOVAXAudioPipeline (diffusion): audio is in omni_outputs.multimodal_output
+        # (used when omni_outputs.request_output is None or carries no outputs).
+        # Qwen3-Omni pipeline: audio is in final_res.outputs[0].multimodal_output.
+        if final_res is not None and final_res.outputs:
+            audio_data = final_res.outputs[0].multimodal_output.get("audio")
         else:
-            audio_tensor = audio_data
-        audio_tensor = audio_tensor.float().detach().cpu().numpy()
-
-        # Ensure audio is 1D (flatten if needed)
-        if audio_tensor.ndim > 1:
-            audio_tensor = audio_tensor.flatten()
-
-        audio_obj = CreateAudio(
-            audio_tensor=audio_tensor,
-            sample_rate=24000,
-            response_format="wav",
-            speed=1.0,
-            stream_format="audio",
-            base64_encode=True,
-        )
+            audio_data = omni_outputs.multimodal_output.get("audio")
+        # HyperCLOVAXAudioPipeline post-process returns bytes (WAV/PCM).
+        # Qwen3-Omni returns tensors or list-of-tensors.
+        if isinstance(audio_data, bytes):
+            audio_base64 = base64.b64encode(audio_data).decode("ascii")
+        else:
+            if isinstance(audio_data, list):
+                if stream:
+                    audio_tensor = audio_data[-1]
+                else:
+                    audio_tensor = torch.cat(audio_data, dim=-1)
+            else:
+                audio_tensor = audio_data
+            audio_tensor = audio_tensor.float().detach().cpu().numpy()
+
+            # Ensure audio is 1D (flatten if needed)
+            if audio_tensor.ndim > 1:
+                audio_tensor = audio_tensor.flatten()
+
+            audio_obj = CreateAudio(
+                audio_tensor=audio_tensor,
+                sample_rate=24000,
+                response_format="wav",
+                speed=1.0,
+                stream_format="audio",
+                base64_encode=True,
+            )
 
-        audio_response: AudioResponse = self.create_audio(audio_obj)
-        audio_base64 = audio_response.audio_data
+            audio_response: AudioResponse = self.create_audio(audio_obj)
+            audio_base64 = audio_response.audio_data
 
         # Generate unique ID for the audio
         audio_id = f"audio-{uuid.uuid4().hex[:16]}"
@@ -1912,19 +1898,22 @@ def _create_audio_choice(
             transcript="",  # Empty transcript if not available
         )
 
-        for output in final_res.outputs:
+        _output_list = final_res.outputs if (final_res is not None and final_res.outputs) else [None]
+        for _choice_idx, output in enumerate(_output_list):
             if stream:
                 choice_data = ChatCompletionResponseStreamChoice(
-                    index=output.index,
+                    index=output.index if output is not None else _choice_idx,
                     delta=DeltaMessage(role=role, content=audio_base64),
                     logprobs=None,
                     finish_reason="stop",
-                    stop_reason=output.stop_reason,
-                    token_ids=(as_list(output.token_ids) if request.return_token_ids else None),
+                    stop_reason=output.stop_reason if output is not None else None,
+                    token_ids=(
+                        as_list(output.token_ids) if (output is not None and request.return_token_ids) else None
+                    ),
                 )
             else:
                 choice_data = ChatCompletionResponseChoice(
-                    index=output.index,
+                    index=output.index if output is not None else _choice_idx,
                     message=ChatMessage(role=role, audio=audio_obj),
                     logprobs=None,
                     finish_reason="stop",
@@ -1953,10 +1942,6 @@ def _create_image_choice(
         choices: list[ChatCompletionResponseChoice] = []
         final_res = omni_outputs.request_output
 
-        # Handle profiling data
-        stage_durations = omni_outputs.stage_durations
-        peak_memory_mb = omni_outputs.peak_memory_mb
-
         # Handle different image output formats
         images = []
 
@@ -2010,8 +1995,6 @@ def _create_image_choice(
                     "image_url": {
                         "url": f"data:image/png;base64,{img_base64}",
                     },
-                    "stage_durations": stage_durations,
-                    "peak_memory_mb": peak_memory_mb,
                 }
             )
 
@@ -2048,6 +2031,7 @@ def _create_image_choice(
         return choices
 
     # ==================== Diffusion Mode Methods ====================
+
     async def _create_diffusion_chat_completion(
         self,
         request: ChatCompletionRequest,
@@ -2079,15 +2063,12 @@ async def _create_diffusion_chat_completion(
             # Extract prompt and images from messages
             prompt, reference_images = self._extract_diffusion_prompt_and_images(messages)
 
+            if not prompt:
+                return self._create_error_response("No text prompt found in messages")
+
             # Extract generation parameters from extra_body (preferred)
             # Reference: text_to_image.py and text_to_video.py for supported parameters
-            # [NOTE] When sending request via openai client Python library,
-            #   `extra_body` is flattented and merged into the payload's root.
-            #   These extra fields are accessible via `model_extra` property (from Pydantic base class).
-            #   When sending raw request with curl, no flattening happens. Directly read the `extra_body` dict.
-            extra_body = getattr(request, "extra_body", None)
-            if not extra_body:
-                extra_body = request.model_extra or {}
+            extra_body = getattr(request, "extra_body", None) or {}
 
             # Parse size if provided (supports "1024x1024" format)
             height = extra_body.get("height")
@@ -2101,31 +2082,20 @@ async def _create_diffusion_chat_completion(
                 except ValueError:
                     logger.warning("Invalid size format: %s", extra_body.get("size"))
 
-            # Get request parameters from extra_body.
-            # Avoid hardcoded defaults here — let each pipeline's forward()
-            # method apply its own model-specific default when the user does
-            # not provide a value.
-            num_inference_steps = extra_body.get("num_inference_steps")
+            # Get request parameters from extra_body
+            # Text-to-image parameters (ref: text_to_image.py)
+            num_inference_steps = extra_body.get("num_inference_steps", 50)
             guidance_scale = extra_body.get("guidance_scale")
-            true_cfg_scale = extra_body.get("true_cfg_scale") or extra_body.get("cfg_scale")
+            true_cfg_scale = extra_body.get("true_cfg_scale")  # Qwen-Image specific
             seed = extra_body.get("seed")
             negative_prompt = extra_body.get("negative_prompt")
             num_outputs_per_prompt = extra_body.get("num_outputs_per_prompt", 1)
 
             # Text-to-video parameters (ref: text_to_video.py)
             num_frames = extra_body.get("num_frames")
-            guidance_scale_2 = extra_body.get("guidance_scale_2")
+            guidance_scale_2 = extra_body.get("guidance_scale_2")  # For video high-noise CFG
             lora_body = extra_body.get("lora")
 
-            # Qwen-Image-Layered parameters
-            layers = extra_body.get("layers")
-            resolution = extra_body.get("resolution")
-
-            try:
-                layers = validate_layered_layers(layers)
-            except ValueError as e:
-                return self._create_error_response(str(e), status_code=400)
-
             logger.info(
                 "Diffusion chat request %s: prompt=%r, ref_images=%d, params=%s",
                 request_id,
@@ -2149,36 +2119,50 @@ async def _create_diffusion_chat_completion(
                 "negative_prompt": negative_prompt,
             }
             gen_params = OmniDiffusionSamplingParams(
+                num_inference_steps=num_inference_steps,
                 height=height,
                 width=width,
                 num_outputs_per_prompt=num_outputs_per_prompt,
                 seed=seed,
             )
 
-            # Only override defaults when the user explicitly provides values
-            if num_inference_steps is not None:
-                gen_params.num_inference_steps = num_inference_steps
             if guidance_scale is not None:
                 gen_params.guidance_scale = guidance_scale
+
+            # Add Qwen-Image specific parameter
             if true_cfg_scale is not None:
                 gen_params.true_cfg_scale = true_cfg_scale
+
+            # Add video generation parameters if set
             if num_frames is not None:
                 gen_params.num_frames = num_frames
             if guidance_scale_2 is not None:
                 gen_params.guidance_scale_2 = guidance_scale_2
-            if layers is not None:
-                gen_params.layers = layers
-            if resolution is not None:
-                gen_params.resolution = resolution
 
-            # Parse per-request LoRA.
+            # Parse per-request LoRA (works for both AsyncOmniDiffusion and AsyncOmni).
             if lora_body and isinstance(lora_body, dict):
                 try:
-                    lora_req, lora_scale = parse_lora_request(lora_body)
-                    if lora_req is not None:
+                    lora_name = lora_body.get("name") or lora_body.get("lora_name") or lora_body.get("adapter")
+                    lora_path = (
+                        lora_body.get("local_path")
+                        or lora_body.get("path")
+                        or lora_body.get("lora_path")
+                        or lora_body.get("lora_local_path")
+                    )
+                    # using "or" directly here may be buggy if `scale=0`
+                    lora_scale = lora_body.get("scale")
+                    if lora_scale is None:
+                        lora_scale = lora_body.get("lora_scale")
+                    lora_int_id = lora_body.get("int_id")
+                    if lora_int_id is None:
+                        lora_int_id = lora_body.get("lora_int_id")
+                    if lora_int_id is None and lora_path:
+                        lora_int_id = stable_lora_int_id(str(lora_path))
+                    if lora_name and lora_path:
+                        lora_req = LoRARequest(str(lora_name), int(lora_int_id), str(lora_path))
                         gen_params.lora_request = lora_req
                         if lora_scale is not None:
-                            gen_params.lora_scale = lora_scale
+                            gen_params.lora_scale = float(lora_scale)
                 except Exception as e:  # pragma: no cover - safeguard
                     logger.warning("Failed to parse LoRA request: %s", e)
 
@@ -2205,43 +2189,44 @@ async def _create_diffusion_chat_completion(
                         )
 
             # Generate image
-            diffusion_engine = cast(AsyncOmni, self._diffusion_engine)
-            result = None
-            async for output in diffusion_engine.generate(
-                prompt=gen_prompt,
-                sampling_params_list=[gen_params],  # Pass as single-stage params
-                request_id=request_id,
-            ):
-                result = output
-            if result is None:
-                return self._create_error_response("No output generated from AsyncOmni")
+            # Handle both AsyncOmniDiffusion (returns OmniRequestOutput) and AsyncOmni (returns AsyncGenerator)
+            if hasattr(self._diffusion_engine, "stage_list"):
+                # AsyncOmni: iterate through async generator to get final output
+                diffusion_engine = cast(AsyncOmni, self._diffusion_engine)
+                result = None
+                async for output in diffusion_engine.generate(
+                    prompt=gen_prompt,
+                    sampling_params_list=[gen_params],  # Pass as single-stage params
+                    request_id=request_id,
+                ):
+                    result = output
+                if result is None:
+                    return self._create_error_response("No output generated from AsyncOmni")
+            else:
+                # AsyncOmniDiffusion: direct call
+                diffusion_engine = cast(AsyncOmniDiffusion, self._diffusion_engine)
+                result = await diffusion_engine.generate(
+                    prompt=gen_prompt,
+                    sampling_params=gen_params,
+                    request_id=request_id,
+                )
             # Extract images from result
             # Handle nested OmniRequestOutput structure where images might be in request_output
             images = getattr(result.request_output, "images", [])
-            stage_durations = result.stage_durations
-            peak_memory_mb = result.peak_memory_mb
 
             # Convert images to base64 content
             image_contents: list[dict[str, Any]] = []
-            flat_images = []
-            for item in images:
-                if isinstance(item, list):
-                    flat_images.extend(item)
-                else:
-                    flat_images.append(item)
-
-            for img in flat_images:
+            for img in images:
                 with BytesIO() as buffer:
                     img.save(buffer, format="PNG")
-                    img_base64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
+                    img_bytes = buffer.getvalue()
+                img_base64 = base64.b64encode(img_bytes).decode("utf-8")
                 image_contents.append(
                     {
                         "type": "image_url",
                         "image_url": {
                             "url": f"data:image/png;base64,{img_base64}",
                         },
-                        "stage_durations": stage_durations,
-                        "peak_memory_mb": peak_memory_mb,
                     }
                 )
 
diff --git a/vllm_omni/entrypoints/stage_utils.py b/vllm_omni/entrypoints/stage_utils.py
index 7b725f469eb..e734ea4e52d 100644
--- a/vllm_omni/entrypoints/stage_utils.py
+++ b/vllm_omni/entrypoints/stage_utils.py
@@ -1,90 +1,155 @@
 from __future__ import annotations
 
+import enum
+import json
 import logging
 import os
 from multiprocessing import shared_memory as _shm
 from typing import Any
 
-from vllm_omni.config.yaml_util import to_dict as _omega_to_dict
-from vllm_omni.platforms import current_omni_platform
+from omegaconf import OmegaConf
 
 logger = logging.getLogger(__name__)
 
 
+class OmniStageTaskType(enum.Enum):
+    GENERATE = "generate"
+    ABORT = "abort"
+    SHUTDOWN = "shutdown"
+    PROFILER_START = "profiler_start"
+    PROFILER_STOP = "profiler_stop"
+
+
+SHUTDOWN_TASK = {"type": OmniStageTaskType.SHUTDOWN}
+
+
+def is_profiler_task(task_type: OmniStageTaskType) -> bool:
+    return task_type in (OmniStageTaskType.PROFILER_START, OmniStageTaskType.PROFILER_STOP)
+
+
 def set_stage_devices(
     stage_id: int,
     devices: str | int | None,
-) -> str | None:
+    device_type: str | None = None,
+) -> None:
     """Configure per-stage device visibility and current device (CUDA or NPU).
 
     This function sets environment variables that control which devices are visible
-    to the process. It must be called BEFORE worker initialization so that workers
-    see the correct devices.
-
-
-    NOTE: This will set the control variable for the appropriate platform.
-        - CUDA: CUDA_VISIBLE_DEVICES
-        - NPU: ASCEND_RT_VISIBLE_DEVICES
+    to the process. It must be called BEFORE worker initialization so that
+    workers see the correct devices; no torch device is selected here.
 
     Args:
         stage_id: Stage identifier for logging
-        devices: Devices specified as either:
-            - None / "cpu"; uses the default visibility.
-            - An int or a str composed of one or more ints separated by commas,
-              which correspond to logical indices. If the control env var is
-              set, e.g., CUDA_VISIBLE_DEVICES, we will map the logical indices
-              to physical, e.g.,
-                    devices: [0,1,2,3]
-                    CUDA_VISIBLE_DEVICES -> [1, 3, 4, 5, 6]
-            will leverage [1, 3, 4, 5]
-
-    Returns:
-        The list of physical devices that were set for the given stage
-        or None if we have no passed devices / are using cpu.
+        devices: Device specification:
+            - Comma-separated string (e.g. "2,5,7"): each integer token is a
+              logical index into the current device visibility env var (e.g.
+              CUDA_VISIBLE_DEVICES/ASCEND_RT_VISIBLE_DEVICES) when it is set;
+              non-integer, out-of-range or unmapped tokens are passed through
+              unchanged as physical IDs.
+            - Integer or digit-string: treat as logical index (0-based) into the
+              current device visibility mapping; map to physical device, then set
+              env var to this single device. Negative integers are clamped to 0.
+            - None/"cpu": keep default visibility.
+            - Otherwise: coerce with int() and set env var to that single device.
+        device_type: Only used in log messages; auto-detected when None.
+
+    Behavior:
+        - Only the device visibility env var is written; any exception raised
+          while interpreting `devices` is logged as a warning and swallowed.
     """
-    env_var = current_omni_platform.device_control_env_var
-    vis = os.environ.get(env_var)
-
-    if devices in (None, "cpu"):
-        logger.debug("[Stage-%s] Using default device visibility (devices=%s)", stage_id, devices)
-        return None
-
-    elif isinstance(devices, (int, str)):
-        device_list = _parse_device_list(devices)
-        if vis is not None:
-            visible_device_list = _parse_device_list(vis)
-            device_list = _map_device_list(stage_id, device_list, visible_device_list)
-        device_str = ",".join(device_list)
-        current_omni_platform.set_device_control_env_var(device_str)
-        return device_str
+    from vllm_omni.utils import detect_device_type, get_device_control_env_var
 
-    raise TypeError(f"Expected str or int device IDs for stage initialization, got type {type(devices)}")
+    if device_type is None:
+        device_type = detect_device_type()
 
+    env_var = get_device_control_env_var()
 
-def _parse_device_list(devices: str | int) -> list[str]:
-    """Given an int or a str representing one or more comma separated
-    non-negative IDs, coerce it to a list of strs.
+    try:
+        selected_physical: int | None = None
+        logical_idx: int | None = None
+
+        if isinstance(devices, str) and "," in devices:
+            toks = [t.strip() for t in devices.split(",") if t.strip() != ""]
+            vis = os.environ.get(env_var)
+            mapped_devices: list[str] = []
+            mapping: list[int] = []
+            if vis:
+                try:
+                    mapping = [int(x) for x in vis.split(",") if x.strip() != ""]
+                except Exception as e:
+                    logger.debug("[Stage-%s] Failed to parse existing %s: %s", stage_id, env_var, e)
+            for tok in toks:
+                try:
+                    idx = int(tok)
+                except Exception:
+                    mapped_devices.append(tok)
+                    continue
+                if mapping and 0 <= idx < len(mapping):
+                    mapped_devices.append(str(mapping[idx]))
+                else:
+                    mapped_devices.append(str(idx))
+            mapped_devices_str = ",".join(mapped_devices)
+            os.environ[env_var] = mapped_devices_str
+            if toks:
+                try:
+                    selected_physical = int(mapped_devices[0])
+                    logger.debug(
+                        "[Stage-%s] Set %s to %s; logical 0 -> physical %s",
+                        stage_id,
+                        env_var,
+                        mapped_devices_str,
+                        selected_physical,
+                    )
+                except Exception as e:
+                    logger.debug("[Stage-%s] Failed to parse first %s device: %s", stage_id, device_type, e)
+                    selected_physical = None
+        elif isinstance(devices, int | str) and (isinstance(devices, int) or str(devices).isdigit()):
+            logical_idx = max(0, int(devices))  # negative ints are clamped to index 0
+            vis = os.environ.get(env_var)
+            if vis:
+                try:
+                    mapping = [int(x) for x in vis.split(",") if x.strip() != ""]
+                    if 0 <= logical_idx < len(mapping):
+                        selected_physical = mapping[logical_idx]
+                except Exception as e:
+                    logger.debug("[Stage-%s] Failed to map logical index via %s: %s", stage_id, env_var, e)
+                    selected_physical = None
+            if selected_physical is None:
+                selected_physical = int(logical_idx)  # no usable mapping: index doubles as physical ID
+            os.environ[env_var] = str(selected_physical)
+            logger.debug(
+                "[Stage-%s] Logical index %d -> physical %s; set %s to single device",
+                stage_id,
+                logical_idx,  # 0-based, matching the "logical 0" wording of the list branch
+                selected_physical,
+                env_var,
+            )
+        elif devices in (None, "cpu"):
+            logger.debug("[Stage-%s] Using default device visibility (devices=%s)", stage_id, devices)
+        else:
+            selected_physical = int(str(devices))
+            os.environ[env_var] = str(selected_physical)
+            logger.debug("[Stage-%s] Set %s to single device %s (fallback)", stage_id, env_var, selected_physical)
+    except Exception as e:
+        logger.warning("Failed to interpret devices for stage %s: %s", stage_id, e)
 
-    Args:
-        devices: devices to be converted to a list of strs.
-    """
-    if isinstance(devices, int):
-        if devices < 0:
-            raise ValueError("Device IDs must be non-negative integers!")
-        return [str(devices)]
-    # Devices will usually be ints, but not always
-    # so we don't explicitly validate that here.
-    return [t.strip() for t in devices.split(",") if t.strip() != ""]
 
+# NOTE: `_resolve_model_tokenizer_paths` is defined once, further down in
+# this module (after `_map_device_list`). Do not place a partial copy of its
+# signature here: an unterminated docstring at this spot swallows the
+# following `def` and makes the whole module fail to parse.
+
 
 def _map_device_list(stage_id: int, device_list: list[str], visible_device_list: list[str]) -> list[str]:
     """Map logical stage devices onto the currently available device pool.
 
     Args:
-        stage_id: The stage ID currently configuring devices.
-        device_list: List of (logical) devices to be used, which are strings
-            holding non-negative nums counting from 0, 1, ..., n devices needed.
-        visible_device_list: List of physical devices available.
+        stage_id: Stage identifier for logging.
+        device_list: Logical device IDs requested by the stage.
+        visible_device_list: Physical devices currently available.
+
+    Returns:
+        Mapped physical device IDs.
     """
     num_visible = len(visible_device_list)
 
@@ -125,6 +190,42 @@ def _map_device_list(stage_id: int, device_list: list[str], visible_device_list:
     return mapped_devices
 
 
+def _resolve_model_tokenizer_paths(
+    model: str,
+    engine_args: dict,
+) -> str:
+    """Resolve model and tokenizer paths for non-standard directory structures.
+
+    Some models (e.g., GLM-Image) have tokenizer in root and model in subdirectory.
+    This function handles model_subdir and tokenizer_subdir engine_args.
+
+    Args:
+        model: Base model path
+        engine_args: Engine arguments (modified in-place to remove subdir args
+            and set tokenizer if needed)
+
+    Returns:
+        Resolved model path (may be subdirectory of original)
+    """
+    model_subdir = engine_args.pop("model_subdir", None)
+    tokenizer_subdir = engine_args.pop("tokenizer_subdir", None)
+    base_model_path = model
+
+    if model_subdir:
+        model = os.path.join(model, model_subdir)
+        logger.info(f"Using model subdirectory: {model}")
+
+    if tokenizer_subdir is not None:
+        tokenizer_path = os.path.join(base_model_path, tokenizer_subdir) if tokenizer_subdir else base_model_path
+        engine_args["tokenizer"] = tokenizer_path
+        logger.info(f"Using tokenizer from: {tokenizer_path}")
+    elif model_subdir and "tokenizer" not in engine_args:
+        engine_args["tokenizer"] = base_model_path
+        logger.info(f"Using tokenizer from base model path: {base_model_path}")
+
+    return model
+
+
 def serialize_obj(obj: Any) -> bytes:
     """Serialize a Python object to bytes using centralized serializer (defaults to cloudpickle)."""
     from vllm_omni.distributed.omni_connectors.utils.serialization import OmniSerializer
@@ -132,25 +233,12 @@ def serialize_obj(obj: Any) -> bytes:
     return OmniSerializer.serialize(obj)
 
 
-def shm_write_bytes(payload: bytes, name: str | None = None) -> dict[str, Any]:
+def shm_write_bytes(payload: bytes) -> dict[str, Any]:
     """Write bytes into SharedMemory and return meta dict {name,size}.
 
     Caller should close the segment; the receiver should unlink.
     """
-    try:
-        shm = _shm.SharedMemory(create=True, size=len(payload), name=name)
-    except FileExistsError:
-        if name:
-            # If name is specified and exists, unlink it and try again
-            try:
-                existing = _shm.SharedMemory(name=name)
-                existing.unlink()
-            except Exception:
-                pass
-            shm = _shm.SharedMemory(create=True, size=len(payload), name=name)
-        else:
-            raise
-
+    shm = _shm.SharedMemory(create=True, size=len(payload))
     mv = memoryview(shm.buf)
     mv[: len(payload)] = payload
     del mv
@@ -179,6 +267,57 @@ def shm_read_bytes(meta: dict[str, Any]) -> bytes:
     return data
 
 
+def _ensure_parent_dir(path: str) -> None:
+    """Ensure the parent directory for a file path exists (best-effort)."""
+    try:
+        parent = os.path.dirname(path)
+        if parent:
+            os.makedirs(parent, exist_ok=True)
+    except Exception:
+        pass
+
+
+def append_jsonl(path: str, record: dict[str, Any]) -> None:
+    """Append a JSON record as one line to a JSONL file (best-effort).
+
+    This is safe to call from multiple processes when each process writes
+    to a distinct file. For concurrent writes to the same file, OS append
+    semantics typically suffice, but no additional locking is provided.
+    """
+    try:
+        _ensure_parent_dir(path)
+        line = json.dumps(record, ensure_ascii=False)
+        fd = os.open(path, os.O_APPEND | os.O_CREAT | os.O_WRONLY, 0o644)
+        with os.fdopen(fd, "a", encoding="utf-8") as f:
+            f.write(line + "\n")
+    except Exception:
+        logger.exception("Failed to append JSONL to %s", path)
+
+
+def maybe_dump_to_shm(obj: Any, threshold: int) -> tuple[bool, Any]:
+    """Dump object to SHM if serialized size exceeds threshold.
+
+    Returns (True, meta) when dumped; otherwise (False, original_obj).
+    """
+    payload = serialize_obj(obj)
+    if len(payload) > threshold:
+        return True, shm_write_bytes(payload)
+    return False, obj
+
+
+def maybe_load_from_ipc(container: dict[str, Any], obj_key: str, shm_key: str) -> Any:
+    """Load object from container that may carry SHM or inline object.
+
+    Deprecated: prefer `maybe_load_from_ipc_with_metrics` to also obtain
+    decode-time and size metrics.
+    """
+    if shm_key in container:
+        from vllm_omni.distributed.omni_connectors.utils.serialization import OmniSerializer
+
+        return OmniSerializer.deserialize(shm_read_bytes(container[shm_key]))
+    return container[obj_key]
+
+
 def maybe_load_from_ipc_with_metrics(
     container: dict[str, Any], obj_key: str, shm_key: str
 ) -> tuple[Any, dict[str, float]]:
@@ -215,12 +354,27 @@ def maybe_load_from_ipc_with_metrics(
     }
 
 
+def encode_for_ipc(obj: Any, threshold: int, obj_key: str, shm_key: str) -> dict[str, Any]:
+    """Return a dict payload for IPC: inline (obj_key) or SHM (shm_key).
+
+    When serialized size exceeds threshold, returns {shm_key: {name,size}};
+    otherwise returns {obj_key: obj}.
+    """
+    payload: dict[str, Any] = {}
+    use_shm, data = maybe_dump_to_shm(obj, threshold)
+    if use_shm:
+        payload[shm_key] = data
+    else:
+        payload[obj_key] = data
+    return payload
+
+
 # Convert OmegaConf/objects to plain dicts
 def _to_dict(x: Any) -> dict[str, Any]:
     try:
         if isinstance(x, dict):
             return dict(x)
-        return _omega_to_dict(x)
+        return OmegaConf.to_container(x, resolve=True)  # type: ignore[arg-type]
     except Exception:
         try:
             return dict(x)
diff --git a/vllm_omni/entrypoints/zmq_utils.py b/vllm_omni/entrypoints/zmq_utils.py
new file mode 100644
index 00000000000..2ef5685cdaa
--- /dev/null
+++ b/vllm_omni/entrypoints/zmq_utils.py
@@ -0,0 +1,95 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""ZMQ-based queue utilities for Omni IPC."""
+
+from __future__ import annotations
+
+import queue
+from typing import Any
+
+import zmq
+from vllm.utils.network_utils import make_zmq_socket
+
+
+class ZmqQueue:
+    """Queue-like wrapper on a ZMQ socket."""
+
+    def __init__(
+        self,
+        ctx: zmq.Context,
+        socket_type: int,
+        *,
+        bind: str | None = None,
+        connect: str | None = None,
+        recv_timeout_ms: int | None = None,
+        send_timeout_ms: int | None = None,
+    ) -> None:
+        # Determine path and bind mode
+        path = bind if bind is not None else connect
+        if path is None:
+            raise ValueError("Either bind or connect must be specified")
+        bind_mode = bind is not None
+
+        self._socket = make_zmq_socket(ctx, path, socket_type, bind=bind_mode, linger=5000)
+
+        # Reusable poller for efficient polling operations
+        self._poller = zmq.Poller()
+        self._poller.register(self._socket, zmq.POLLIN)
+
+        # Store default timeout settings
+        self._default_recv_timeout = recv_timeout_ms
+        self._default_send_timeout = send_timeout_ms
+
+        # Apply timeout settings if specified
+        if recv_timeout_ms is not None:
+            self._socket.rcvtimeo = recv_timeout_ms
+        if send_timeout_ms is not None:
+            self._socket.sndtimeo = send_timeout_ms
+
+        self.endpoint = path
+
+    def put(self, obj: Any) -> None:
+        """Send an object to the queue. Blocks until sent or timeout."""
+        try:
+            self._socket.send_pyobj(obj)
+        except zmq.Again as e:
+            raise queue.Full() from e
+
+    def put_nowait(self, obj: Any) -> None:
+        """Send an object to the queue without blocking."""
+        try:
+            self._socket.send_pyobj(obj, flags=zmq.NOBLOCK)
+        except zmq.Again as e:
+            raise queue.Full() from e
+
+    def get(self, timeout: float | None = None) -> Any:
+        """Receive an object from the queue with optional timeout in seconds."""
+        if timeout is None:
+            return self._socket.recv_pyobj()
+
+        # Use the reusable poller for timeout handling
+        events = dict(self._poller.poll(int(timeout * 1000)))
+        if events.get(self._socket) == zmq.POLLIN:
+            return self._socket.recv_pyobj()
+        raise queue.Empty()
+
+    def get_nowait(self) -> Any:
+        """Receive an object from the queue without blocking."""
+        try:
+            return self._socket.recv_pyobj(flags=zmq.NOBLOCK)
+        except zmq.Again as e:
+            raise queue.Empty() from e
+
+    def empty(self) -> bool:
+        """Check if the queue is empty without blocking."""
+        events = dict(self._poller.poll(0))
+        return events.get(self._socket) != zmq.POLLIN
+
+    def close(self) -> None:
+        self._socket.close(0)
+
+
+def create_zmq_queue(ctx: zmq.Context, endpoint: str, socket_type: int) -> ZmqQueue:
+    """Create a ZmqQueue from an endpoint string and socket type."""
+    return ZmqQueue(ctx, socket_type, connect=endpoint)
diff --git a/vllm_omni/model_executor/models/hcx_omni/__init__.py b/vllm_omni/model_executor/models/hcx_omni/__init__.py
new file mode 100644
index 00000000000..50793fc6936
--- /dev/null
+++ b/vllm_omni/model_executor/models/hcx_omni/__init__.py
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""HyperCLOVAX-SEED-Omni-8B models for vLLM-Omni."""
diff --git a/vllm_omni/model_executor/models/hcx_omni/hcx_omni.py b/vllm_omni/model_executor/models/hcx_omni/hcx_omni.py
new file mode 100644
index 00000000000..d05fad00a6d
--- /dev/null
+++ b/vllm_omni/model_executor/models/hcx_omni/hcx_omni.py
@@ -0,0 +1,134 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""HyperCLOVAX-SEED-Omni-8B multi-stage model dispatcher.
+
+Architecture overview
+---------------------
+HyperCLOVAX-SEED-Omni-8B is a 3-stage omni model:
+
+  Stage 0  – Thinker (this module, LLM engine)
+    Input : text + optional image/audio
+    Output: text tokens + discrete audio codes (128606–135167)
+             + discrete vision codes (135168+)
+    Config: engine_output_type = "latent"
+
+  Stage 1  – Vision Decoder  (diffusion engine)
+    Input : 729 discrete vision codes from stage 0
+    Output: generated image (PNG / JPEG)
+    Config: model_class_name = "HyperCLOVAXVisionPipeline"
+
+  Stage 2  – Audio Decoder   (diffusion engine)
+    Input : N discrete audio codes from stage 0
+    Output: 24 kHz waveform (WAV / PCM)
+    Config: model_class_name = "HyperCLOVAXAudioPipeline"
+
+Stages 1 and 2 are handled by the vLLM-Omni *diffusion* engine and do
+**not** go through this LLM model registry.  This dispatcher exists
+only for stage 0 so that the standard ``model_arch`` routing works.
+"""
+
+from collections.abc import Iterable
+from typing import Any
+
+import torch
+from torch import nn
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+from vllm.model_executor.models.hcx_omni import (
+    HCXOmniDummyInputsBuilder,
+    HCXOmniForCausalLM,
+    HCXOmniMultiModalProcessor,
+    HCXOmniProcessingInfo,
+)
+from vllm.model_executor.models.interfaces import (
+    SupportsMRoPE,
+    SupportsMultiModal,
+    SupportsPP,
+    SupportsQuant,
+)
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.sequence import IntermediateTensors
+
+logger = init_logger(__name__)
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    HCXOmniMultiModalProcessor,
+    info=HCXOmniProcessingInfo,
+    dummy_inputs=HCXOmniDummyInputsBuilder,
+)
+class HCXOmniForConditionalGeneration(
+    nn.Module,
+    SupportsMultiModal,
+    SupportsMRoPE,
+    SupportsPP,
+    SupportsQuant,
+):
+    """Top-level HyperCLOVAX-SEED-Omni-8B model for vLLM-Omni.
+
+    This class is the ``model_arch`` entry point for the thinker stage.
+    It delegates all logic to :class:`~vllm.model_executor.models.hcx_omni.
+    HCXOmniForCausalLM` from the vLLM base repository.
+
+    The vision decoder and audio decoder stages use ``model_class_name``
+    (diffusion engine) and therefore do not require an entry here.
+    """
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
+        super().__init__()
+        self._model = HCXOmniForCausalLM(vllm_config=vllm_config, prefix=prefix)
+
+    # ------------------------------------------------------------------ #
+    # Delegate interface implementations to the inner model               #
+    # ------------------------------------------------------------------ #
+
+    @property
+    def config(self):
+        return self._model.config
+
+    # SupportsMRoPE
+    def get_mrope_input_positions(self, *args: Any, **kwargs: Any):
+        return self._model.get_mrope_input_positions(*args, **kwargs)
+
+    def iter_mm_grid_thw(self, *args: Any, **kwargs: Any):
+        return self._model.iter_mm_grid_thw(*args, **kwargs)
+
+    # SupportsMultiModal
+    def get_multimodal_embeddings(self, **kwargs: Any):
+        return self._model.get_multimodal_embeddings(**kwargs)
+
+    def get_input_embeddings(
+        self,
+        input_ids: torch.Tensor,
+        multimodal_embeddings=None,
+    ) -> torch.Tensor:
+        return self._model.get_input_embeddings(input_ids, multimodal_embeddings)
+
+    # SupportsPP
+    def make_empty_intermediate_tensors(self, *args: Any, **kwargs: Any):
+        return self._model.make_empty_intermediate_tensors(*args, **kwargs)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs: object,
+    ) -> torch.Tensor | IntermediateTensors:
+        return self._model.forward(
+            input_ids=input_ids,
+            positions=positions,
+            intermediate_tensors=intermediate_tensors,
+            inputs_embeds=inputs_embeds,
+            **kwargs,
+        )
+
+    def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor | None:
+        return self._model.compute_logits(hidden_states)
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        return self._model.load_weights(weights)
+
+    def get_mm_mapping(self):
+        return self._model.get_mm_mapping()
diff --git a/vllm_omni/model_executor/models/hcx_omni/hcx_omni_thinker.py b/vllm_omni/model_executor/models/hcx_omni/hcx_omni_thinker.py
new file mode 100644
index 00000000000..d5ab6396231
--- /dev/null
+++ b/vllm_omni/model_executor/models/hcx_omni/hcx_omni_thinker.py
@@ -0,0 +1,126 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Thin wrapper around the vLLM base HCXOmniForCausalLM thinker.
+
+Registers the multimodal processor for the vLLM-Omni pipeline context
+and exposes all interfaces required by the thinker stage.
+"""
+
+from collections.abc import Iterable
+from typing import Any
+
+import torch
+from torch import nn
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+from vllm.model_executor.models.hcx_omni import (
+    HCXOmniDummyInputsBuilder,
+    HCXOmniForCausalLM,
+    HCXOmniMultiModalProcessor,
+    HCXOmniProcessingInfo,
+)
+from vllm.model_executor.models.interfaces import (
+    MultiModalEmbeddings,
+    SupportsMRoPE,
+    SupportsMultiModal,
+    SupportsPP,
+    SupportsQuant,
+)
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.sequence import IntermediateTensors
+
+logger = init_logger(__name__)
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    HCXOmniMultiModalProcessor,
+    info=HCXOmniProcessingInfo,
+    dummy_inputs=HCXOmniDummyInputsBuilder,
+)
+class HCXOmniThinkerForConditionalGeneration(
+    nn.Module,
+    SupportsMultiModal,
+    SupportsMRoPE,
+    SupportsPP,
+    SupportsQuant,
+):
+    """Thinker stage model for HyperCLOVAX-SEED-Omni-8B.
+
+    Thin wrapper: builds :class:`HCXOmniForCausalLM` as ``self._model``, registers the multimodal
+    processor on this class, and forwards the vLLM model interface calls below to that inner model.
+    ``load_weights`` re-prefixes loaded names with ``_model.`` to match this wrapper's parameters.
+    """
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
+        super().__init__()
+        self._model = HCXOmniForCausalLM(vllm_config=vllm_config, prefix=prefix)
+
+    # --- delegate all interface methods to inner model ------------------- #
+
+    @property
+    def config(self):
+        return self._model.config
+
+    @property
+    def language_model(self):
+        return self._model.language_model
+
+    @property
+    def visual(self):
+        return self._model.visual
+
+    @property
+    def audio_tower(self):
+        return self._model.audio_tower
+
+    # SupportsMRoPE
+    def get_mrope_input_positions(self, *args, **kwargs):
+        return self._model.get_mrope_input_positions(*args, **kwargs)
+
+    def iter_mm_grid_thw(self, *args, **kwargs):
+        return self._model.iter_mm_grid_thw(*args, **kwargs)
+
+    # SupportsMultiModal
+    def get_multimodal_embeddings(self, **kwargs: Any) -> MultiModalEmbeddings | None:
+        return self._model.get_multimodal_embeddings(**kwargs)
+
+    def get_input_embeddings(
+        self,
+        input_ids: torch.Tensor,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
+    ) -> torch.Tensor:
+        return self._model.get_input_embeddings(input_ids, multimodal_embeddings)
+
+    # SupportsPP
+    def make_empty_intermediate_tensors(self, *args, **kwargs):
+        return self._model.make_empty_intermediate_tensors(*args, **kwargs)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs: object,
+    ) -> torch.Tensor | IntermediateTensors:
+        return self._model.forward(
+            input_ids=input_ids,
+            positions=positions,
+            intermediate_tensors=intermediate_tensors,
+            inputs_embeds=inputs_embeds,
+            **kwargs,
+        )
+
+    def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor | None:
+        return self._model.compute_logits(hidden_states)
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str] | None:
+        loaded = self._model.load_weights(weights)  # names as reported by the inner model
+        return loaded if loaded is None else {f"_model.{name}" for name in loaded}
+
+    def get_mm_mapping(self):
+        return self._model.get_mm_mapping()
+
+    # SupportsQuant
+    def get_quant_config(self):
+        return getattr(self._model, "get_quant_config", lambda: None)()
diff --git a/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni_thinker.py b/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni_thinker.py
index 617f0f9e325..4798fec5da7 100644
--- a/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni_thinker.py
+++ b/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni_thinker.py
@@ -1,6 +1,6 @@
-"""Thin Omni wrapper: reuse upstream Qwen2.5-Omni thinker (v0.14) with minimal overrides."""
+"""Thin Omni wrapper: reuse upstream Qwen2.5-Omni thinker (v0.12) with minimal overrides."""
 
-from collections.abc import Iterable, Mapping
+from collections.abc import Iterable
 from typing import Any
 
 import torch
@@ -12,7 +12,6 @@
     Qwen2_5OmniAudioEncoder,
 )
 from vllm.config import VllmConfig
-from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.models.interfaces import (
     MultiModalEmbeddings,
@@ -22,19 +21,34 @@
     SupportsPP,
 )
 from vllm.model_executor.models.module_mapping import MultiModelKeys
-from vllm.model_executor.models.qwen2_5_omni_thinker import (
-    Qwen2_5OmniAudioFeatureInputs,
-    Qwen2_5OmniThinkerDummyInputsBuilder,
-    Qwen2_5OmniThinkerProcessingInfo,
-    check_interleaved_audio_video,
-    merge_interleaved_embeddings,
-)
+
+try:
+    from vllm.model_executor.models.qwen2_5_omni_thinker import (
+        Qwen2_5OmniAudioFeatureInputs,
+        Qwen2_5OmniThinkerDummyInputsBuilder,
+        Qwen2_5OmniThinkerMultiModalProcessor,
+        Qwen2_5OmniThinkerProcessingInfo,
+        check_interleaved_audio_video,
+        merge_interleaved_embeddings,
+    )
+except ImportError:
+    from vllm.model_executor.models.qwen2_5_omni_thinker import (  # type: ignore[no-redef]
+        Qwen2_5OmniAudioFeatureInputs,
+        Qwen2_5OmniThinkerDummyInputsBuilder,
+        Qwen2_5OmniThinkerMultiModalProcessor,
+        Qwen2_5OmniThinkerProcessingInfo,
+    )
+
+    def check_interleaved_audio_video(*a, **k):  # fallback when the import above lacks this helper
+        return False  # ignores every argument
+
+    def merge_interleaved_embeddings(*a, **k):  # fallback stub for the same ImportError branch: always raises
+        raise NotImplementedError("merge_interleaved_embeddings not available")
+
+
 from vllm.model_executor.models.qwen2_5_omni_thinker import (
     Qwen2_5OmniConditionalGenerationMixin as Qwen2_5OmniConditionalGenerationMixinBase,
 )
-from vllm.model_executor.models.qwen2_5_omni_thinker import (
-    Qwen2_5OmniThinkerMultiModalProcessor as _Qwen2_5OmniThinkerMultiModalProcessorBase,
-)
 from vllm.model_executor.models.qwen2_5_vl import (
     Qwen2_5_VisionTransformer,
     Qwen2_5_VLImageEmbeddingInputs,
@@ -55,12 +69,6 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (
     MultiModalFeatureSpec,
-    MultiModalKwargsItems,
-)
-from vllm.multimodal.parse import MultiModalDataItems
-from vllm.multimodal.processing.processor import (
-    MultiModalPromptUpdates,
-    PlaceholderFeaturesInfo,
 )
 from vllm.sequence import IntermediateTensors
 
@@ -75,78 +83,6 @@
 logger = init_logger(__name__)
 
 
-class Qwen2_5OmniThinkerMultiModalProcessor(
-    _Qwen2_5OmniThinkerMultiModalProcessorBase,
-):
-    """Override to fix use_audio_in_video detection when mm cache returns None."""
-
-    def _maybe_apply_prompt_updates(
-        self,
-        mm_items: MultiModalDataItems,
-        prompt_ids: list[int],
-        mm_kwargs: MultiModalKwargsItems,
-        mm_prompt_updates: MultiModalPromptUpdates,
-        is_update_applied: bool,
-    ) -> tuple[list[int], Mapping[str, list[PlaceholderFeaturesInfo]]]:
-        mm_item_counts = mm_items.get_all_counts()
-        self._validate_mm_kwargs(mm_kwargs, mm_item_counts)
-        self._validate_mm_updates(mm_prompt_updates, mm_item_counts)
-
-        use_audio_in_video = False
-        if "video" in mm_kwargs:
-            non_none_items = [item for item in mm_kwargs["video"] if item is not None]
-            if non_none_items:
-                for item in non_none_items:
-                    if item.get("use_audio_in_video"):
-                        uaiv_tensor = item["use_audio_in_video"].data
-                        if uaiv_tensor.numel() > 0:
-                            use_audio_in_video = bool(uaiv_tensor.item())
-                            break
-            elif "audio" in mm_prompt_updates:
-                tokenizer = self.info.get_tokenizer()
-                audio_pad_id = tokenizer.convert_tokens_to_ids("<|audio_pad|>")
-                use_audio_in_video = audio_pad_id not in prompt_ids
-            # for mutilmodality cache
-            if any(item is None for item in mm_kwargs["video"]):
-                video_token_id = self.info.get_hf_config().video_token_id
-                audio_token_id = self.info.get_hf_config().audio_token_id
-                video_audio_item_num = sum(id in (video_token_id, audio_token_id) for id in prompt_ids)
-                audio_updates_num = len(mm_prompt_updates.get("audio", []))
-                video_updates_num = len(mm_prompt_updates.get("video", []))
-                if video_audio_item_num != video_updates_num + audio_updates_num:
-                    use_audio_in_video = True
-
-        if is_update_applied:
-            mm_placeholders = self._find_mm_placeholders(
-                prompt_ids,
-                mm_prompt_updates,
-            )
-            self._validate_mm_placeholders(
-                mm_placeholders,
-                mm_item_counts,
-            )
-        else:
-            if use_audio_in_video and "audio" in mm_prompt_updates:
-                filtered_updates = {k: v for k, v in mm_prompt_updates.items() if k != "audio"}
-                prompt_ids, mm_placeholders = self._apply_prompt_updates(
-                    prompt_ids,
-                    filtered_updates,
-                )
-                mm_placeholders = self._derive_audio_from_video_placeholders(mm_placeholders, mm_prompt_updates)
-            else:
-                prompt_ids, mm_placeholders = self._apply_prompt_updates(
-                    prompt_ids,
-                    mm_prompt_updates,
-                )
-
-            self._validate_mm_placeholders(
-                mm_placeholders,
-                mm_item_counts,
-            )
-
-        return prompt_ids, mm_placeholders
-
-
 class Qwen2_5OmniConditionalGenerationMixin(Qwen2_5OmniConditionalGenerationMixinBase):
     def _parse_and_validate_audio_input(self, **kwargs: object) -> Qwen2_5OmniAudioFeatureInputs | None:
         input_audio_features = kwargs.pop("input_audio_features", None)
@@ -253,43 +189,6 @@ def _parse_and_validate_video_input(
                 video_grid_thw=video_grid_thw,
             )
 
-    def _process_image_input(self, image_input: Qwen2_5_VLImageInputs) -> tuple[torch.Tensor, ...]:
-        if image_input["type"] == "image_embeds":
-            return image_input["image_embeds"].type(self.visual.dtype)
-
-        grid_thw = image_input["image_grid_thw"]
-        assert grid_thw.ndim == 2
-
-        pixel_values = image_input["pixel_values"].type(self.visual.dtype)
-        with set_forward_context(None, self.vllm_config):
-            image_embeds = self.visual(pixel_values, grid_thw=grid_thw)
-        # Split concatenated embeddings for each image item.
-        merge_size = self.visual.spatial_merge_size
-        sizes = grid_thw.prod(-1) // merge_size // merge_size
-
-        return image_embeds.split(sizes.tolist())
-
-    def _process_video_input(
-        self,
-        video_input: Qwen2_5_VLVideoInputs,
-        video_hashes: list[str] = None,
-        cached_video_embeds: torch.Tensor = None,
-    ) -> torch.Tensor:
-        if video_input["type"] == "video_embeds":
-            return video_input["video_embeds"].type(self.visual.dtype)
-
-        grid_thw = video_input["video_grid_thw"]
-        assert grid_thw.ndim == 2
-
-        pixel_values_videos = video_input["pixel_values_videos"].type(self.visual.dtype)
-        with set_forward_context(None, self.vllm_config):
-            video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw)
-        # Split concatenated embeddings for each video item.
-        merge_size = self.visual.spatial_merge_size
-        sizes = grid_thw.prod(-1) // merge_size // merge_size
-
-        return video_embeds.split(sizes.tolist())
-
 
 @MULTIMODAL_REGISTRY.register_processor(
     Qwen2_5OmniThinkerMultiModalProcessor,
@@ -304,6 +203,8 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
     SupportsMRoPE,
     Qwen2_5OmniConditionalGenerationMixin,
 ):
+    merge_by_field_config = True
+
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             "thinker.lm_head.": "language_model.lm_head.",
@@ -390,9 +291,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             self.language_model = init_vllm_registered_model(
                 vllm_config=vllm_config,
                 prefix=maybe_prefix(prefix, "language_model"),
-                hf_config=thinker_config.text_config,
-                architectures=["Qwen2ForCausalLM"],
-            )
+                hf_config=thinker_config.text_config,
+                architectures=["Qwen2ForCausalLM"],
+            )
 
         self.make_empty_intermediate_tensors = self.language_model.make_empty_intermediate_tensors
 
@@ -406,7 +307,7 @@ def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
                 mm_input_by_modality["image"] = self._parse_and_validate_image_input(**kwargs)
             if input_key in ("pixel_values_videos", "video_embeds") and "video" not in mm_input_by_modality:
                 mm_input_by_modality["video"] = self._parse_and_validate_video_input(**kwargs)
-            if input_key in ("input_audio_features",) and "audio" not in mm_input_by_modality:
+            if input_key in ("input_audio_features",) and "audio" not in mm_input_by_modality:
                 mm_input_by_modality["audio"] = self._parse_and_validate_audio_input(**kwargs)
         return mm_input_by_modality
 
@@ -599,58 +500,25 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
                 multimodal_embeddings += tuple(audio_embeddings)
         return multimodal_embeddings
 
+    # TODO (ywang96): support overlapping modality embeddings so that
+    # `use_audio_in_video` will work on V1.
     def embed_input_ids(
         self,
         input_ids: torch.Tensor,
         multimodal_embeddings: MultiModalEmbeddings | None = None,
         *,
         is_multimodal: torch.Tensor | None = None,
+        handle_oov_mm_token: bool = False,
     ) -> torch.Tensor:
+        # This is to satisfy the type checker for each overload
         if multimodal_embeddings is None or is_multimodal is None:
             return super().embed_input_ids(input_ids)
 
-        inputs_embeds = self._embed_text_input_ids(
-            input_ids,
-            self.get_language_model().embed_input_ids,
-            is_multimodal=is_multimodal,
-        )
-
-        if len(multimodal_embeddings) == 0:
-            return inputs_embeds
-
-        # Check for audio-in-video: interleaved video and audio tokens
-        # in the multimodal region. Only use the interleaved path when
-        # needed; otherwise fall back to the default parent implementation.
-        video_token_id = self.config.video_token_index
-        audio_token_id = self.config.audio_token_index
-
-        is_video = is_multimodal & (input_ids == video_token_id)
-        is_audio = is_multimodal & (input_ids == audio_token_id)
-
-        num_video = is_video.sum().item()
-        num_audio = is_audio.sum().item()
-
-        if check_interleaved_audio_video(is_video, is_audio, num_video, num_audio):
-            inputs_embeds = self._embed_text_input_ids(
-                input_ids,
-                self.get_language_model().embed_input_ids,
-                is_multimodal=is_multimodal,
-            )
-            return merge_interleaved_embeddings(
-                inputs_embeds,
-                multimodal_embeddings,
-                is_video,
-                is_audio,
-                is_multimodal,
-                num_video,
-                num_audio,
-            )
-
-        # Default: standard merge (no interleaving), same as parent class
         return super().embed_input_ids(
             input_ids,
             multimodal_embeddings=multimodal_embeddings,
             is_multimodal=is_multimodal,
+            handle_oov_mm_token=handle_oov_mm_token,
         )
 
     def forward(
diff --git a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py
index d03a96fd85a..900734de932 100644
--- a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py
+++ b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py
@@ -22,15 +22,13 @@
 # limitations under the License.
 """Inference-only Qwen3-Omni-Moe model (thinker part)."""
 
-from collections.abc import Iterable, Iterator, Mapping, Sequence
-from dataclasses import replace
+from collections.abc import Iterable, Mapping, Sequence
 from functools import partial
-from typing import Any, Literal, cast
+from typing import Any
 
 import numpy as np
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
 from packaging.version import Version
 from transformers import PretrainedConfig
 from transformers import __version__ as TRANSFORMERS_VERSION
@@ -39,43 +37,48 @@
     Qwen3OmniMoeConfig,
     Qwen3OmniMoeThinkerConfig,
 )
+from transformers.models.qwen3_omni_moe.modeling_qwen3_omni_moe import (
+    Qwen3OmniMoeAudioEncoder,
+)
 from transformers.models.qwen3_omni_moe.processing_qwen3_omni_moe import (
     Qwen3OmniMoeProcessor,
 )
 from transformers.models.whisper import WhisperFeatureExtractor
 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig
-from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
-from vllm.inputs import PromptType
+from vllm.config import VllmConfig
+from vllm.distributed import get_pp_group
 from vllm.logger import init_logger
-from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY
-from vllm.model_executor.layers.attention.mm_encoder_attention import (
-    MMEncoderAttention,
-)
-from vllm.model_executor.layers.linear import (
-    ColumnParallelLinear,
-    QKVParallelLinear,
-    ReplicatedLinear,
-    RowParallelLinear,
-)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.models.interfaces import (
     MultiModalEmbeddings,
     SupportsMRoPE,
     SupportsMultiModal,
     SupportsPP,
-    SupportsTranscription,
-)
-from vllm.model_executor.models.module_mapping import MultiModelKeys
-from vllm.model_executor.models.qwen2_5_omni_thinker import (
-    Qwen2_5OmniAudioFeatureInputs,
-    Qwen2_5OmniThinkerDummyInputsBuilder,
-    Qwen2_5OmniThinkerMultiModalProcessor,
-    check_interleaved_audio_video,
-    merge_interleaved_embeddings,
 )
+
+try:
+    from vllm.model_executor.models.qwen2_5_omni_thinker import (
+        Qwen2_5OmniAudioFeatureInputs,
+        Qwen2_5OmniThinkerDummyInputsBuilder,
+        Qwen2_5OmniThinkerMultiModalProcessor,
+        check_interleaved_audio_video,
+        merge_interleaved_embeddings,
+    )
+except ImportError:
+    from vllm.model_executor.models.qwen2_5_omni_thinker import (  # type: ignore[no-redef]
+        Qwen2_5OmniAudioFeatureInputs,
+        Qwen2_5OmniThinkerDummyInputsBuilder,
+        Qwen2_5OmniThinkerMultiModalProcessor,
+    )
+
+    def check_interleaved_audio_video(*a, **k):  # fallback when the import above lacks this helper
+        return False  # ignores every argument
+
+    def merge_interleaved_embeddings(*a, **k):  # fallback stub for the same ImportError branch: always raises
+        raise NotImplementedError("merge_interleaved_embeddings not available")
+
+
 from vllm.model_executor.models.qwen2_5_vl import (
     Qwen2_5_VLProcessingInfo,
 )
@@ -83,18 +86,7 @@
 from vllm.model_executor.models.qwen3_moe import Qwen3MoeForCausalLM
 from vllm.model_executor.models.qwen3_moe import Qwen3MoeModel as _Qwen3MoeLLMModel
 from vllm.model_executor.models.qwen3_omni_moe_thinker import (
-    Qwen3Omni_VisionTransformer as _Qwen3Omni_VisionTransformer,
-)
-from vllm.model_executor.models.qwen3_omni_moe_thinker import (
-    Qwen3OmniMoeAudioAttention as _Qwen3OmniMoeAudioAttention,
-)
-from vllm.model_executor.models.qwen3_omni_moe_thinker import (
-    Qwen3OmniMoeAudioEncoder as _Qwen3OmniMoeAudioEncoder,
-)
-from vllm.model_executor.models.qwen3_omni_moe_thinker import (
-    Qwen3OmniMoeAudioEncoderLayer as _Qwen3OmniMoeAudioEncoderLayer,
-)
-from vllm.model_executor.models.qwen3_omni_moe_thinker import (
+    Qwen3Omni_VisionTransformer,
     _get_feat_extract_output_lengths,
 )
 from vllm.model_executor.models.utils import (
@@ -103,10 +95,13 @@
     _merge_multimodal_embeddings,
     maybe_prefix,
 )
+from vllm.model_executor.models.vision import (
+    get_llm_pos_ids_for_vision,
+)
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalKwargsItems
 from vllm.multimodal.parse import AudioProcessorItems, MultiModalDataItems
-from vllm.multimodal.processing.processor import (
+from vllm.multimodal.processing import (
     MultiModalPromptUpdates,
     PlaceholderFeaturesInfo,
     PromptReplacement,
@@ -114,7 +109,6 @@
     PromptUpdateDetails,
 )
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.processor import cached_processor_from_config
 
 from vllm_omni.model_executor.models.qwen2_5_omni.qwen2_5_omni_thinker import (
     Qwen2_5OmniConditionalGenerationMixin,
@@ -132,381 +126,6 @@
 logger = init_logger(__name__)
 
 
-class Qwen3Omni_VisionTransformer(_Qwen3Omni_VisionTransformer):
-    """Subclass that fixes Qwen2_5_VisionAttention.forward() compatibility.
-
-    The upstream Qwen3_VisionBlock.forward() does not pass the
-    ``sequence_lengths`` argument required by the updated
-    Qwen2_5_VisionAttention.forward() signature.  This subclass overrides
-    ``forward()`` to compute ``sequence_lengths`` via MMEncoderAttention
-    (following the pattern in qwen3_vl.py) and calls block internals
-    directly so the argument is forwarded correctly.
-    """
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.tp_size = get_tensor_model_parallel_world_size()
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        grid_thw,
-    ) -> torch.Tensor:
-        hidden_states = x.to(device=self.device, dtype=self.dtype)
-        hidden_states = self.patch_embed(hidden_states)
-
-        if self.apply_vit_abs_pos_embed:
-            pos_embeds = self.fast_pos_embed_interpolate(grid_thw)
-            hidden_states = hidden_states + pos_embeds
-        rotary_pos_emb_cos, rotary_pos_emb_sin = self.rot_pos_emb(grid_thw)
-
-        if isinstance(grid_thw, torch.Tensor):
-            grid_thw_tensor = grid_thw.to(self.device)
-        else:
-            grid_thw_tensor = torch.as_tensor(grid_thw, dtype=torch.int32, device=self.device)
-
-        try:
-            cu_seqlens = torch.repeat_interleave(
-                grid_thw_tensor[:, 1] * grid_thw_tensor[:, 2],
-                grid_thw_tensor[:, 0],
-            ).cumsum(
-                dim=0,
-                dtype=grid_thw_tensor.dtype if torch.jit.is_tracing() else torch.int32,
-            )
-            cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
-        except RuntimeError:
-            logger.warning(
-                "torch.repeat_interleave not executable, switching to vectorized searchsorted implementation."
-            )
-            repeat_counts = grid_thw_tensor[:, 0]
-            values = grid_thw_tensor[:, 1] * grid_thw_tensor[:, 2]
-            repeat_cumsum = repeat_counts.cumsum(0)
-            total_items = repeat_cumsum[-1].item()
-            indices = torch.searchsorted(
-                repeat_cumsum,
-                torch.arange(total_items, device=grid_thw_tensor.device),
-                right=True,
-            )
-            cu_seqlens = values[indices].cumsum(
-                dim=0,
-                dtype=grid_thw_tensor.dtype if torch.jit.is_tracing() else torch.int32,
-            )
-            cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
-
-        hidden_states = hidden_states.unsqueeze(1)
-        rotary_pos_emb_cos = rotary_pos_emb_cos.to(hidden_states.device)
-        rotary_pos_emb_sin = rotary_pos_emb_sin.to(hidden_states.device)
-        max_seqlen = self.compute_attn_mask_seqlen(cu_seqlens)
-
-        grid_thw_np = grid_thw_tensor.cpu().numpy().astype(np.int32)
-        cu_seqlens_np = np.repeat(grid_thw_np[:, 1] * grid_thw_np[:, 2], grid_thw_np[:, 0]).cumsum(
-            axis=0, dtype=np.int32
-        )
-        cu_seqlens_np = np.concatenate([np.zeros(1, dtype=np.int32), cu_seqlens_np])
-        sequence_lengths = MMEncoderAttention.maybe_compute_seq_lens(
-            self.attn_backend,
-            cu_seqlens_np,
-            self.device,
-        )
-
-        hidden_states_list = []
-        deepstack_visual_indexes = self.deepstack_visual_indexes
-
-        for layer_num, blk in enumerate(self.blocks):
-            hidden_states = blk(
-                hidden_states,
-                cu_seqlens=cu_seqlens,
-                rotary_pos_emb_cos=rotary_pos_emb_cos,
-                rotary_pos_emb_sin=rotary_pos_emb_sin,
-                max_seqlen=max_seqlen,
-                sequence_lengths=sequence_lengths,
-            )
-
-            if deepstack_visual_indexes is not None and layer_num in deepstack_visual_indexes:
-                hidden_states_list.append(hidden_states)
-
-        hidden_states = self.merger(hidden_states)
-
-        if deepstack_visual_indexes is not None:
-            processed_hidden_states_list = [hidden_states]
-            for idx, x_ds in enumerate(hidden_states_list):
-                x_ds = self.merger_list[idx](x_ds)
-                processed_hidden_states_list.append(x_ds)
-            hidden_states = torch.cat(processed_hidden_states_list, dim=1)
-
-        return hidden_states
-
-
-class Qwen3OmniMoeAudioAttention(_Qwen3OmniMoeAudioAttention):
-    """Subclass that adds quant_config support to audio attention."""
-
-    def __init__(
-        self,
-        config,
-        quant_config: QuantizationConfig | None = None,
-        prefix: str = "",
-    ):
-        nn.Module.__init__(self)
-        self.embed_dim = config.d_model
-        self.num_heads = config.encoder_attention_heads
-        self.head_dim = self.embed_dim // self.num_heads
-        tp_size = get_tensor_model_parallel_world_size()
-        self.num_local_heads = self.num_heads // tp_size
-
-        if (self.head_dim * self.num_heads) != self.embed_dim:
-            raise ValueError(
-                f"embed_dim must be divisible by num_heads (got `embed_dim`: "
-                f"{self.embed_dim} and `num_heads`: {self.num_heads})."
-            )
-
-        self.scaling = self.head_dim**-0.5
-
-        self.qkv = QKVParallelLinear(
-            hidden_size=self.embed_dim,
-            head_size=self.head_dim,
-            total_num_heads=self.num_heads,
-            total_num_kv_heads=self.num_heads,
-            bias=True,
-            quant_config=quant_config,
-            prefix=f"{prefix}.qkv",
-        )
-
-        self.out_proj = RowParallelLinear(
-            input_size=self.embed_dim,
-            output_size=self.embed_dim,
-            bias=True,
-            quant_config=quant_config,
-            prefix=f"{prefix}.out_proj",
-        )
-
-        self.attn = MMEncoderAttention(
-            num_heads=self.num_local_heads,
-            head_size=self.head_dim,
-            scale=self.scaling,
-            prefix=f"{prefix}.attn",
-        )
-
-
-class Qwen3OmniMoeAudioEncoderLayer(_Qwen3OmniMoeAudioEncoderLayer):
-    """Subclass that adds quant_config support to audio encoder layers."""
-
-    def __init__(
-        self,
-        config,
-        quant_config: QuantizationConfig | None = None,
-        prefix: str = "",
-    ):
-        nn.Module.__init__(self)
-        self.embed_dim = config.d_model
-        self.self_attn = Qwen3OmniMoeAudioAttention(config, quant_config=quant_config, prefix=f"{prefix}.self_attn")
-        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
-        self.activation_fn = _ACTIVATION_REGISTRY[config.activation_function]
-        self.fc1 = ColumnParallelLinear(
-            self.embed_dim,
-            config.encoder_ffn_dim,
-            bias=True,
-            quant_config=quant_config,
-            prefix=f"{prefix}.fc1",
-        )
-        self.fc2 = RowParallelLinear(
-            config.encoder_ffn_dim,
-            self.embed_dim,
-            bias=True,
-            quant_config=quant_config,
-            prefix=f"{prefix}.fc2",
-        )
-        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
-
-
-class Qwen3OmniMoeAudioEncoder(_Qwen3OmniMoeAudioEncoder):
-    """Subclass that adds quant_config support to the audio encoder."""
-
-    def __init__(
-        self,
-        config,
-        quant_config: QuantizationConfig | None = None,
-        prefix: str = "",
-    ):
-        nn.Module.__init__(self)
-
-        embed_dim = config.d_model
-        self.num_mel_bins = config.num_mel_bins
-        self.max_source_positions = config.max_source_positions
-        self.n_window = config.n_window
-        self.n_window_infer = config.n_window_infer
-        self.conv_chunksize = config.conv_chunksize
-
-        from vllm.model_executor.models.qwen3_omni_moe_thinker import (
-            SinusoidsPositionEmbedding,
-        )
-
-        self.positional_embedding = SinusoidsPositionEmbedding(self.max_source_positions, embed_dim)
-
-        # Convolutional layers (not quantized)
-        self.conv2d1 = nn.Conv2d(1, config.downsample_hidden_size, 3, 2, padding=1)
-        self.conv2d2 = nn.Conv2d(
-            config.downsample_hidden_size,
-            config.downsample_hidden_size,
-            3,
-            2,
-            padding=1,
-        )
-        self.conv2d3 = nn.Conv2d(
-            config.downsample_hidden_size,
-            config.downsample_hidden_size,
-            3,
-            2,
-            padding=1,
-        )
-
-        conv_out_dim = config.downsample_hidden_size * ((((config.num_mel_bins + 1) // 2 + 1) // 2 + 1) // 2)
-        self.conv_out = ReplicatedLinear(
-            conv_out_dim,
-            config.d_model,
-            bias=False,
-            quant_config=quant_config,
-            prefix=f"{prefix}.conv_out",
-        )
-
-        self.layers = nn.ModuleList(
-            [
-                Qwen3OmniMoeAudioEncoderLayer(
-                    config,
-                    quant_config=quant_config,
-                    prefix=f"{prefix}.layers.{i}",
-                )
-                for i in range(config.encoder_layers)
-            ]
-        )
-
-        self.ln_post = nn.LayerNorm(config.d_model)
-        self.proj1 = ReplicatedLinear(
-            config.d_model,
-            config.d_model,
-            quant_config=quant_config,
-            prefix=f"{prefix}.proj1",
-        )
-        self.act = _ACTIVATION_REGISTRY[config.activation_function]
-        self.proj2 = ReplicatedLinear(
-            config.d_model,
-            config.output_dim,
-            quant_config=quant_config,
-            prefix=f"{prefix}.proj2",
-        )
-
-        from vllm.model_executor.models.vision import get_vit_attn_backend
-
-        self.attn_backend = get_vit_attn_backend(
-            head_size=config.d_model // config.encoder_attention_heads,
-            dtype=torch.get_default_dtype(),
-        )
-
-    def forward(
-        self,
-        input_features: torch.Tensor,
-        feature_lens: torch.Tensor,
-        aftercnn_lens: torch.Tensor,
-    ):
-        import torch.nn.functional as F
-
-        chunk_num = torch.ceil(feature_lens / (self.n_window * 2)).long()
-
-        chunk_lengths = torch.tensor(
-            [self.n_window * 2] * chunk_num.sum(),
-            dtype=torch.long,
-            device=feature_lens.device,
-        )
-        tail_chunk_index = F.pad(chunk_num, (1, 0), value=-1).cumsum(0)[1:]
-        chunk_lengths[tail_chunk_index] = feature_lens % (self.n_window * 2)
-        chunk_lengths[chunk_lengths == 0] = self.n_window * 2
-
-        chunk_list = input_features.T.split(chunk_lengths.tolist(), dim=0)
-        padded_feature = nn.utils.rnn.pad_sequence(chunk_list, batch_first=True).transpose(1, 2)
-
-        feature_lens_after_cnn = self._get_cnn_output_lengths(chunk_lengths)
-        max_len_after_cnn = feature_lens_after_cnn.max().item()
-        indices = torch.arange(max_len_after_cnn, device=padded_feature.device)
-        padded_mask_after_cnn = indices.unsqueeze(0) < feature_lens_after_cnn.unsqueeze(1)
-
-        padded_feature = padded_feature.unsqueeze(1)
-
-        if padded_feature.size(0) <= self.conv_chunksize:
-            padded_embed = F.gelu(self.conv2d1(padded_feature))
-            padded_embed = F.gelu(self.conv2d2(padded_embed))
-            padded_embed = F.gelu(self.conv2d3(padded_embed))
-        else:
-            padded_embeds = []
-            for chunk in padded_feature.split(self.conv_chunksize, dim=0):
-                padded_embed = F.gelu(self.conv2d1(chunk))
-                padded_embed = F.gelu(self.conv2d2(padded_embed))
-                padded_embed = F.gelu(self.conv2d3(padded_embed))
-                padded_embeds.append(padded_embed)
-            padded_embed = torch.cat(padded_embeds, dim=0)
-
-        b, c, f, t = padded_embed.size()
-        padded_embed, _ = self.conv_out(padded_embed.permute(0, 3, 1, 2).contiguous().view(b, t, c * f))
-
-        positional_embedding = (
-            self.positional_embedding.positional_embedding[: padded_embed.shape[1], :]
-            .unsqueeze(0)
-            .to(padded_embed.dtype)
-        )
-        padded_embed = padded_embed + positional_embedding
-
-        hidden_states = padded_embed[padded_mask_after_cnn]
-
-        cu_chunk_lens = [0]
-        window_aftercnn = padded_mask_after_cnn.shape[-1] * (self.n_window_infer // (self.n_window * 2))
-        for cnn_len in aftercnn_lens.tolist():
-            num_full_chunks = cnn_len // window_aftercnn
-            remainder = cnn_len % window_aftercnn
-            cu_chunk_lens.extend([window_aftercnn] * num_full_chunks)
-            if remainder:
-                cu_chunk_lens.append(remainder)
-        cu_seqlens = torch.tensor(cu_chunk_lens, device=aftercnn_lens.device).cumsum(-1, dtype=torch.int32)
-
-        max_seqlen = self.compute_attn_mask_seqlen(cu_seqlens)
-
-        for encoder_layer in self.layers:
-            hidden_states = encoder_layer(
-                hidden_states,
-                cu_seqlens,
-                max_seqlen,
-            )
-
-        # ReplicatedLinear returns (output, bias) — unpack
-        hidden_states = self.ln_post(hidden_states)
-        hidden_states, _ = self.proj1(hidden_states)
-        hidden_states = self.act(hidden_states)
-        hidden_states, _ = self.proj2(hidden_states)
-
-        return hidden_states
-
-
-# Speech input languages supported by Qwen3-Omni
-# From: https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct
-ISO639_1_SUPPORTED_LANGS = {
-    "en": "English",
-    "zh": "Chinese",
-    "ko": "Korean",
-    "ja": "Japanese",
-    "de": "German",
-    "ru": "Russian",
-    "it": "Italian",
-    "fr": "French",
-    "es": "Spanish",
-    "pt": "Portuguese",
-    "ms": "Malay",
-    "nl": "Dutch",
-    "id": "Indonesian",
-    "tr": "Turkish",
-    "vi": "Vietnamese",
-    "yue": "Cantonese",
-    "ar": "Arabic",
-    "ur": "Urdu",
-}
-
-
 @support_torch_compile(
     dynamic_arg_dims={
         "input_ids": 0,
@@ -523,6 +142,7 @@ def forward(
         positions: torch.Tensor,
         intermediate_tensors: IntermediateTensors | None = None,
         inputs_embeds: torch.Tensor | None = None,
+        *,
         capture_layer_indices: Sequence[int] | None = None,
         return_hidden_states: bool = False,
         deepstack_input_embeds: IntermediateTensors | None = None,
@@ -607,38 +227,6 @@ def get_feature_extractor(self, **kwargs: object):
     def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"audio": None, "image": None, "video": None}
 
-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int] | None = None,
-    ) -> Mapping[str, int] | None:
-        mm_counts = mm_counts or {}
-        requested_modalities = {m for m, c in mm_counts.items() if c > 0}
-        mm_max_tokens: dict[str, int] = {}
-
-        if requested_modalities & {"image", "video"}:
-            vl_tokens = Qwen2_5_VLProcessingInfo.get_mm_max_tokens_per_item(
-                self,
-                seq_len=seq_len,
-                mm_counts=mm_counts,
-            )
-            mm_max_tokens.update({m: vl_tokens[m] for m in ["image", "video"] if m in requested_modalities})
-
-        if "audio" in requested_modalities:
-            audio_tokens = Qwen2AudioProcessingInfo.get_mm_max_tokens_per_item(
-                self,
-                seq_len=seq_len,
-                mm_counts=mm_counts,
-            )
-            if audio_tokens is None:
-                feature_extractor = self.get_feature_extractor()
-                max_audio_samples = feature_extractor.chunk_length * feature_extractor.sampling_rate
-                max_audio_tokens = int(_get_feat_extract_output_lengths(torch.tensor([max_audio_samples])).item())
-                audio_tokens = {"audio": max_audio_tokens}
-            mm_max_tokens["audio"] = audio_tokens["audio"]
-
-        return mm_max_tokens
-
 
 Qwen3OmniMoeThinkerDummyInputsBuilder = Qwen2_5OmniThinkerDummyInputsBuilder
 
@@ -664,7 +252,7 @@ def pad_to_hop_length(x: np.ndarray, hop_length: int) -> np.ndarray:
             return x
 
         # NOTE: WhisperFeatureExtractor cannot handle empty list of audios
-        feature_extractor = self.info.get_feature_extractor(**mm_kwargs)
+        feature_extractor = self.info.get_feature_extractor()
         hop_length = feature_extractor.hop_length
         if audios:
             # NOTE: Qwen3-Omni processor accept "audio"
@@ -682,36 +270,11 @@ def pad_to_hop_length(x: np.ndarray, hop_length: int) -> np.ndarray:
             # https://github.com/huggingface/transformers/pull/41473
             mm_kwargs = dict(mm_kwargs)
             tok_kwargs = dict(tok_kwargs)
-            mm_kwargs["audio_kwargs"] = dict(mm_kwargs.get("audio_kwargs") or {})
-            mm_kwargs["text_kwargs"] = dict(mm_kwargs.get("text_kwargs") or {})
             if Version(TRANSFORMERS_VERSION) < Version("4.58.0"):
-                # Extract audio_sample_rate before restructuring
-                audio_sample_rate = mm_kwargs.pop("audio_sample_rate", None)
-
                 # move truncation to audio_kwargs level to avoid conflict
                 # with tok_kwargs
-                mm_kwargs["audio_kwargs"].setdefault("truncation", mm_kwargs.pop("truncation", False))
-                mm_kwargs["text_kwargs"].setdefault("truncation", tok_kwargs.pop("truncation", False))
-
-                # Validate and conditionally pass audio_sample_rate
-                # WhisperFeatureExtractor has a fixed sampling rate, and vLLM's
-                # audio loader already resamples audio to the target rate.
-                # Only pass the value if it matches to avoid unexpected behavior.
-                if audio_sample_rate is not None:
-                    expected_sr = feature_extractor.sampling_rate
-                    if audio_sample_rate != expected_sr:
-                        logger.warning(
-                            "[%s] audio_sample_rate mismatch: user provided %dHz "
-                            "but model expects %dHz. Ignoring user value. "
-                            "vLLM's audio loader already resampled to %dHz.",
-                            self.__class__.__name__,
-                            audio_sample_rate,
-                            expected_sr,
-                            expected_sr,
-                        )
-                    else:
-                        # Sample rate matches, safe to pass
-                        mm_kwargs["audio_kwargs"]["audio_sample_rate"] = audio_sample_rate
+                mm_kwargs["audio_kwargs"] = {"truncation": mm_kwargs.pop("truncation", False)}
+                mm_kwargs["text_kwargs"] = {"truncation": tok_kwargs.pop("truncation", False)}
 
         hf_inputs = super()._call_hf_processor(
             prompt=prompt,
@@ -754,26 +317,11 @@ def _maybe_apply_prompt_updates(
 
         use_audio_in_video = False
         if "video" in mm_kwargs:
-            non_none_items = [item for item in mm_kwargs["video"] if item is not None]
-            if non_none_items:
-                # Normal case: at least one non-cached item, read flag directly
-                use_audio_in_video = any(item["use_audio_in_video"].data for item in non_none_items)
-            elif "audio" in mm_prompt_updates:
-                # All video items are from cache (None); infer from prompt:
-                # use_audio_in_video=True means the prompt has no <|audio_pad|>
-                # placeholder (audio is embedded in video tokens instead)
-                tokenizer = self.info.get_tokenizer()
-                audio_pad_id = tokenizer.convert_tokens_to_ids("<|audio_pad|>")
-                use_audio_in_video = audio_pad_id not in prompt_ids
-            # for mutilmodality cache
-            if any(item is None for item in mm_kwargs["video"]):
-                video_token_id = self.info.get_hf_config().video_token_id
-                audio_token_id = self.info.get_hf_config().audio_token_id
-                video_audio_item_num = sum(id in (video_token_id, audio_token_id) for id in prompt_ids)
-                audio_updates_num = len(mm_prompt_updates.get("audio", []))
-                video_updates_num = len(mm_prompt_updates.get("video", []))
-                if video_audio_item_num != video_updates_num + audio_updates_num:
+            for item in mm_kwargs["video"]:
+                if item and item["use_audio_in_video"].data:
                     use_audio_in_video = True
+                else:
+                    use_audio_in_video = False
 
         # normal case with `use_audio_in_video=False`
         if is_update_applied:
@@ -792,6 +340,7 @@ def _maybe_apply_prompt_updates(
                     prompt_ids,
                     filtered_updates,
                 )
+                # Derive audio placeholders from video placeholders
                 mm_placeholders = self._derive_audio_from_video_placeholders(mm_placeholders, mm_prompt_updates)
             else:
                 prompt_ids, mm_placeholders = self._apply_prompt_updates(
@@ -871,11 +420,11 @@ def _get_prompt_updates(
         if audio_feature_lengths is None and feature_attention_mask is None:
             audio_output_lengths = []
         elif audio_feature_lengths is not None:
-            audio_output_lens = _get_feat_extract_output_lengths(audio_feature_lengths)
+            _, audio_output_lens = _get_feat_extract_output_lengths(audio_feature_lengths)
             audio_output_lengths = audio_output_lens.tolist()
         elif feature_attention_mask is not None:
             assert isinstance(feature_attention_mask, torch.Tensor)
-            audio_output_lens = _get_feat_extract_output_lengths(feature_attention_mask.sum(-1))
+            _, audio_output_lens = _get_feat_extract_output_lengths(feature_attention_mask.sum(-1))
             audio_output_lengths = audio_output_lens.tolist()
 
         # number of audios read from video.
@@ -911,7 +460,7 @@ def get_replacement_qwen2_vision(item_idx: int, modality: str):
 
         def get_replacement_qwen2_use_audio_in_video(item_idx: int):
             nonlocal audio_in_video_item_idx
-            audio_num_features = audio_output_lengths[audio_in_video_item_idx]
+            audio_num_features = audio_output_lengths[audio_in_video_item_idx + item_idx]
             video_grid_thw = out_mm_data["video_grid_thw"][item_idx]
 
             audio_in_video_item_idx += 1
@@ -966,6 +515,7 @@ def _derive_audio_from_video_placeholders(
         if "video" not in placeholders:
             return placeholders
 
+        # Validate audio and video counts match
         num_videos = len(placeholders["video"])
         num_audios = len(mm_prompt_updates.get("audio", []))
         if num_audios != num_videos:
@@ -1041,23 +591,61 @@ def _get_raw_input_ids(
 
 
 class Qwen3OmniMoeConditionalGenerationMixin(Qwen2_5OmniConditionalGenerationMixin):
+    def _parse_and_validate_audio_input(self, **kwargs: object) -> Qwen2_5OmniAudioFeatureInputs | None:
+        input_audio_features = kwargs.pop("input_audio_features", None)
+        audio_feature_lengths = kwargs.pop("audio_feature_lengths", None)
+        feature_attention_mask = kwargs.pop("feature_attention_mask", None)
+        if input_audio_features is None:
+            return None
+        if (
+            input_audio_features is not None
+            and isinstance(input_audio_features, torch.Tensor)
+            and input_audio_features.ndim == 3
+        ):
+            # (batch_size, feature_dim, chunk_size) -> (feature_dim, batch_size * chunk_size)
+            input_audio_features = input_audio_features.permute(1, 0, 2).flatten(1)
+        elif input_audio_features is not None and isinstance(input_audio_features, list):
+            input_audio_features = torch.cat(input_audio_features, dim=-1)
+        if (
+            audio_feature_lengths is not None
+            and isinstance(audio_feature_lengths, torch.Tensor)
+            and audio_feature_lengths.ndim == 2
+        ):
+            audio_feature_lengths = audio_feature_lengths.reshape(-1)
+        elif audio_feature_lengths is not None and isinstance(audio_feature_lengths, list):
+            audio_feature_lengths = torch.cat(audio_feature_lengths, dim=-1)
+        if (
+            feature_attention_mask is not None
+            and isinstance(feature_attention_mask, torch.Tensor)
+            and feature_attention_mask.ndim == 3
+        ):
+            feature_attention_mask = feature_attention_mask.reshape(-1, feature_attention_mask.shape[-1])
+        elif feature_attention_mask is not None and isinstance(feature_attention_mask, list):
+            for i in range(len(feature_attention_mask)):
+                feature_attention_mask[i] = feature_attention_mask[i].reshape(-1)
+        return Qwen2_5OmniAudioFeatureInputs(
+            type="audio_features",
+            input_features=input_audio_features,
+            audio_feature_lengths=audio_feature_lengths,
+            feature_attention_mask=feature_attention_mask,
+        )
+
     def _process_audio_input(
         self,
         audio_input: Qwen2_5OmniAudioFeatureInputs,
         audio_hashes: list[str] | None = None,
         cached_audio_features: torch.Tensor | None = None,
-    ) -> tuple[torch.Tensor, ...]:
+    ) -> torch.Tensor:
         input_features = audio_input["input_features"]
         audio_feature_lengths = audio_input["audio_feature_lengths"]
 
-        audio_output_lengths = _get_feat_extract_output_lengths(audio_feature_lengths)
+        audio_feat_lengths, audio_output_lengths = _get_feat_extract_output_lengths(audio_feature_lengths)
 
         audio_outputs = self.audio_tower(
             input_features.to(self.audio_tower.dtype),
             feature_lens=audio_feature_lengths,
-            aftercnn_lens=audio_output_lengths,
         )
-        audio_features = audio_outputs if isinstance(audio_outputs, torch.Tensor) else audio_outputs.last_hidden_state
+        audio_features = audio_outputs.last_hidden_state
         return audio_features.split(audio_output_lengths.tolist())
 
 
@@ -1072,8 +660,9 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
     SupportsPP,
     SupportsMRoPE,
     Qwen3OmniMoeConditionalGenerationMixin,
-    SupportsTranscription,
 ):
+    merge_by_field_config = True
+
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             "thinker.lm_head.": "language_model.lm_head.",
@@ -1082,20 +671,6 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
         }
     )
 
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
-    }
-
-    supported_languages = ISO639_1_SUPPORTED_LANGS
-
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
@@ -1115,7 +690,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         multimodal_config = vllm_config.model_config.multimodal_config
         self.config = thinker_config
         self.multimodal_config = multimodal_config
-        self.quant_config = quant_config
 
         # Pre-quantized checkpoints (modelopt NVFP4/FP8/MXFP8) only quantize
         # the Thinker LM (language model). Vision and audio encoder weights
@@ -1147,63 +721,51 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
                 visual_quant_config = None
                 language_quant_config = quant_config.resolve("language_model")
         else:
-            audio_quant_config = None
-            visual_quant_config = None
-            language_quant_config = None
-
-        with self._mark_tower_model(vllm_config, "audio"):
-            self.audio_tower = Qwen3OmniMoeAudioEncoder(
-                thinker_config.audio_config,
-                quant_config=audio_quant_config,
-                prefix=maybe_prefix(prefix, "audio_tower"),
+            logger.warning(
+                "flash_attn is not available, the model may not yield the "
+                "exactly same result as the transformers implementation "
+                "in the audio tower part."
             )
 
+        self.audio_tower = Qwen3OmniMoeAudioEncoder(thinker_config.audio_config)
+
+        attn_backend_override = multimodal_config.mm_encoder_attn_backend if multimodal_config is not None else None
+        self.visual = Qwen3Omni_VisionTransformer(
+            vision_config=thinker_config.vision_config,
+            norm_eps=getattr(thinker_config.text_config, "rms_norm_eps", 1e-6),
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "visual"),
+            attn_backend_override=attn_backend_override,
+        )
+        self.quant_config = quant_config
+
+        self.language_model = Qwen3MoeLLMForCausalLM(
+            vllm_config=vllm_config.with_hf_config(thinker_config.text_config, architectures=["Qwen3MoeForCausalLM"]),
+            prefix=maybe_prefix(prefix, "language_model"),
+        )
+
+        self.make_empty_intermediate_tensors = self.language_model.make_empty_intermediate_tensors
+
         self.use_deepstack = hasattr(thinker_config.vision_config, "deepstack_visual_indexes")
         self.deepstack_num_level = (
             len(thinker_config.vision_config.deepstack_visual_indexes) if self.use_deepstack else 0
         )
+        # register buffer for deepstack
+        self.deepstack_input_embeds = (
+            [
+                torch.zeros(
+                    vllm_config.scheduler_config.max_num_batched_tokens,
+                    thinker_config.text_config.hidden_size,
+                )
+                for _ in range(self.deepstack_num_level)
+            ]
+            if self.use_deepstack
+            else None
+        )
         self.visual_dim = thinker_config.vision_config.out_hidden_size
         self.multiscale_dim = self.visual_dim * self.deepstack_num_level
 
-        with self._mark_tower_model(vllm_config, {"image", "video"}):
-            self.visual = Qwen3Omni_VisionTransformer(
-                vision_config=thinker_config.vision_config,
-                norm_eps=getattr(thinker_config.text_config, "rms_norm_eps", 1e-6),
-                quant_config=visual_quant_config,
-                prefix=maybe_prefix(prefix, "visual"),
-            )
-
-            # register buffer for deepstack
-            if self.use_deepstack:
-                self.deepstack_input_embeds = [
-                    torch.zeros(
-                        vllm_config.scheduler_config.max_num_batched_tokens,
-                        thinker_config.text_config.hidden_size,
-                    )
-                    for _ in range(self.deepstack_num_level)
-                ]
-
-        with self._mark_language_model(vllm_config):
-            lm_vllm_config = vllm_config.with_hf_config(
-                thinker_config.text_config,
-                architectures=["Qwen3MoeForCausalLM"],
-            )
-            if language_quant_config is not quant_config:
-                lm_vllm_config = replace(lm_vllm_config, quant_config=language_quant_config)
-            self.language_model = Qwen3MoeLLMForCausalLM(
-                vllm_config=lm_vllm_config,
-                prefix=maybe_prefix(prefix, "language_model"),
-            )
-
-        self.make_empty_intermediate_tensors = self.language_model.make_empty_intermediate_tensors
-
-    def _get_deepstack_input_embeds(
-        self,
-        num_tokens: int,
-    ) -> IntermediateTensors | None:
-        if not getattr(self, "deepstack_input_embeds", None):
-            return None  # If vision tower is skipped
-
+    def _get_deepstack_input_embeds(self, num_tokens: int) -> IntermediateTensors:
         # get deepstack_input_embeds from buffer, and clear the buffer
         return IntermediateTensors(
             {
@@ -1213,9 +775,6 @@ def _get_deepstack_input_embeds(
         )
 
     def _set_deepstack_input_embeds(self, deepstack_input_embeds: torch.Tensor) -> None:
-        if not getattr(self, "deepstack_input_embeds", None):
-            return
-
         # set deepstack_input_embeds to buffer
         num_tokens = deepstack_input_embeds.size(1)
         if num_tokens > self.deepstack_input_embeds[0].size(0):
@@ -1232,9 +791,6 @@ def _set_deepstack_input_embeds(self, deepstack_input_embeds: torch.Tensor) -> N
             self.deepstack_input_embeds[idx][:num_tokens].copy_(deepstack_input_embeds[idx])
 
     def _clear_deepstack_input_embeds(self, num_tokens: int) -> None:
-        if not getattr(self, "deepstack_input_embeds", None):
-            return
-
         # clear deepstack_input_embeds in buffer
         if num_tokens > 0:
             for idx in range(self.deepstack_num_level):
@@ -1250,7 +806,7 @@ def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
                 mm_input_by_modality["image"] = self._parse_and_validate_image_input(**kwargs)
             if input_key in ("pixel_values_videos", "video_embeds") and "video" not in mm_input_by_modality:
                 mm_input_by_modality["video"] = self._parse_and_validate_video_input(**kwargs)
-            if input_key in ("input_audio_features",) and "audio" not in mm_input_by_modality:
+            if input_key in ("input_audio_features") and "audio" not in mm_input_by_modality:
                 mm_input_by_modality["audio"] = self._parse_and_validate_audio_input(**kwargs)
         return mm_input_by_modality
 
@@ -1263,7 +819,7 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings | None:
             return []
 
         # The result multimodal_embeddings is tuple of tensors, with each
-        # tensor corresponding to a multimodal data item (image or video).
+        # tensor corresponding to a multimodal data item (image or video).
         multimodal_embeddings: tuple[torch.Tensor, ...] = ()
 
         # NOTE: It is important to iterate over the keys in this dictionary
@@ -1287,28 +843,21 @@ def embed_input_ids(
         multimodal_embeddings: MultiModalEmbeddings | None = None,
         *,
         is_multimodal: torch.Tensor | None = None,
+        handle_oov_mm_token: bool = False,
     ) -> torch.Tensor:
         inputs_embeds = self._embed_text_input_ids(
             input_ids,
             self.language_model.embed_input_ids,
             is_multimodal=is_multimodal,
+            handle_oov_mm_token=handle_oov_mm_token,
         )
 
         if multimodal_embeddings is None or len(multimodal_embeddings) == 0:
             return inputs_embeds
 
-        # Detect interleaved audio-in-video early, since it affects
-        # both the deepstack path and the final embedding merge.
-        video_token_id = self.config.video_token_id
-        audio_token_id = self.config.audio_token_id
-        is_video = is_multimodal & (input_ids == video_token_id)
-        is_audio = is_multimodal & (input_ids == audio_token_id)
-        num_video = is_video.sum().item()
-        num_audio = is_audio.sum().item()
-
-        is_interleaved = check_interleaved_audio_video(is_video, is_audio, num_video, num_audio)
-
         deepstack_input_embeds = None
+        # TODO (ywang96): support overlapping modality embeddings so that
+        # `use_audio_in_video` will work on V1.
         # split the feat dim to obtain multi-scale visual feature
         has_vision_embeddings = [
             embeddings.shape[-1] != self.config.text_config.hidden_size for embeddings in multimodal_embeddings
@@ -1316,18 +865,12 @@ def embed_input_ids(
         if self.visual.deepstack_visual_indexes is not None and any(has_vision_embeddings):
             multiscale_len = len(self.visual.deepstack_visual_indexes)
             multimodal_embeddings_multiscale = []
-
-            if is_interleaved:
-                # Use input_ids-based mask for correct vision positions
-                # when audio and video tokens are interleaved.
-                is_vision = is_video.clone()
-            else:
-                is_vision = torch.zeros_like(is_multimodal)
-                mm_positions = torch.nonzero(is_multimodal, as_tuple=True)[0]
-                mm_position_idx = 0
-
+            is_vision = torch.zeros_like(is_multimodal)
+            mm_positions = torch.nonzero(is_multimodal, as_tuple=True)[0]
+            mm_position_idx = 0
             for index, embeddings in enumerate(multimodal_embeddings):
                 num_tokens = embeddings.shape[0]
+                current_positions = mm_positions[mm_position_idx : mm_position_idx + num_tokens]
 
                 # Vision embeddings
                 if embeddings.shape[-1] != self.config.text_config.hidden_size:
@@ -1336,18 +879,13 @@ def embed_input_ids(
                     embeddings_main, embeddings_multiscale = torch.split(embeddings, [visual_dim, multi_dim], dim=-1)
                     multimodal_embeddings[index] = embeddings_main
                     multimodal_embeddings_multiscale.append(embeddings_multiscale)
-                    if not is_interleaved:
-                        current_positions = mm_positions[mm_position_idx : mm_position_idx + num_tokens]
-                        is_vision[current_positions] = True
+                    is_vision[current_positions] = True
 
                 # Audio embeddings
                 else:
-                    if not is_interleaved:
-                        current_positions = mm_positions[mm_position_idx : mm_position_idx + num_tokens]
-                        is_vision[current_positions] = False
+                    is_vision[current_positions] = False
 
-                if not is_interleaved:
-                    mm_position_idx += num_tokens
+                mm_position_idx += num_tokens
 
             deepstack_input_embeds = inputs_embeds.new_zeros(
                 inputs_embeds.size(0), multiscale_len * inputs_embeds.size(1)
@@ -1364,27 +902,14 @@ def embed_input_ids(
             )
             self._set_deepstack_input_embeds(deepstack_input_embeds)
 
-        if is_interleaved:
-            return merge_interleaved_embeddings(
-                inputs_embeds,
-                multimodal_embeddings,
-                is_video,
-                is_audio,
-                is_multimodal,
-                num_video,
-                num_audio,
-            )
-
-        # Default: standard merge (no interleaving), same as parent class.
-        # multimodal_embeddings may have been updated above (deepstack
-        # main-scale). Use super() to stay consistent with the parent
-        # implementation and avoid issues seen in Qwen2.5-Omni (#34506).
-        return super().embed_input_ids(
-            input_ids,
+        inputs_embeds = _merge_multimodal_embeddings(
+            inputs_embeds=inputs_embeds,
             multimodal_embeddings=multimodal_embeddings,
             is_multimodal=is_multimodal,
         )
 
+        return inputs_embeds
+
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -1434,311 +959,211 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
 
         return loaded_weights
 
-    def _compute_audio_token_count(self, audio_feature_length: int) -> int:
-        """Compute audio tokens from feature length using Qwen3-Omni formula."""
-        return _get_feat_extract_output_lengths(torch.tensor([audio_feature_length])).item()
-
-    def _get_audio_for_video_mapping(self, mm_features: list[MultiModalFeatureSpec]) -> tuple[dict[int, int], set[int]]:
-        """
-        Map video offset -> paired audio_feature_length for use_audio_in_video.
-
-        When use_audio_in_video=True, audio is interleaved within video.
-        The pairing is based on feature order in mm_features.
-
-        Returns:
-            Tuple of (video_offset -> audio_feature_length mapping,
-                      set of paired audio offsets to skip)
-        """
-        videos_with_audio = [
-            f
-            for f in mm_features
-            if f.modality == "video" and f.data.get("use_audio_in_video") and f.data["use_audio_in_video"].data.item()
-        ]
-        audios = [f for f in mm_features if f.modality == "audio"]
-
-        mapping: dict[int, int] = {}
-        paired_audio_offsets: set[int] = set()
-        for i, video_f in enumerate(videos_with_audio):
-            if i < len(audios):
-                audio_len = audios[i].data["audio_feature_lengths"].data.item()
-                mapping[video_f.mm_position.offset] = audio_len
-                paired_audio_offsets.add(audios[i].mm_position.offset)
-        return mapping, paired_audio_offsets
-
-    def iter_mm_features(self, mm_features: list[MultiModalFeatureSpec]) -> Iterator[tuple[int, str, dict[str, Any]]]:
-        """
-        Iterate over multimodal features sorted by position offset.
-
-        Yields: (offset, modality, feature_data) where feature_data contains:
-        - image: {"grid_t", "grid_h", "grid_w", "t_factor"}
-        - video: {"grid_t", "grid_h", "grid_w", "t_factor",
-                  "use_audio_in_video", "audio_feature_length"}
-        - audio: {"audio_feature_length"}
-        """
-        config = self.config
-        spatial_merge_size = config.vision_config.spatial_merge_size
-        position_id_per_seconds = config.position_id_per_seconds
-
-        sorted_features = sorted(mm_features, key=lambda f: f.mm_position.offset)
-        audio_for_video, paired_audio_offsets = self._get_audio_for_video_mapping(sorted_features)
-
-        for mm_feature in sorted_features:
-            offset = mm_feature.mm_position.offset
-            modality = mm_feature.modality
-
-            if modality == "image":
-                t, h, w = mm_feature.data["image_grid_thw"].data.tolist()
-                yield (
-                    offset,
-                    "image",
-                    {
-                        "grid_t": t,
-                        "grid_h": h // spatial_merge_size,
-                        "grid_w": w // spatial_merge_size,
-                        "t_factor": position_id_per_seconds,
-                    },
-                )
-            elif modality == "video":
-                t, h, w = mm_feature.data["video_grid_thw"].data.tolist()
-                second_per_grid_ts = 2.0
-                if mm_feature.data.get("second_per_grid_ts"):
-                    second_per_grid_ts = mm_feature.data["second_per_grid_ts"].data.item()
-                use_audio_in_video = bool(
-                    mm_feature.data.get("use_audio_in_video") and mm_feature.data["use_audio_in_video"].data.item()
-                )
-
-                yield (
-                    offset,
-                    "video",
-                    {
-                        "grid_t": t,
-                        "grid_h": h // spatial_merge_size,
-                        "grid_w": w // spatial_merge_size,
-                        "t_factor": second_per_grid_ts * position_id_per_seconds,
-                        "use_audio_in_video": use_audio_in_video,
-                        "audio_feature_length": audio_for_video.get(offset),
-                    },
-                )
-            elif modality == "audio":
-                if offset not in paired_audio_offsets:
-                    audio_len = mm_feature.data["audio_feature_lengths"].data.item()
-                    yield offset, "audio", {"audio_feature_length": audio_len}
-
-    def _compute_interleaved_positions(self, start_idx: int, data: dict[str, Any]) -> tuple[np.ndarray, int]:
-        """
-        Compute positions for interleaved video+audio using Qwen3 token-by-token
-        interleaving logic.
-
-        Returns: (position_ids [3, N], total_token_count)
-        """
-        grid_t = data["grid_t"]
-        grid_h = data["grid_h"]
-        grid_w = data["grid_w"]
-        t_factor = data["t_factor"]
-        audio_feature_length = data["audio_feature_length"]
-
-        audio_len = self._compute_audio_token_count(audio_feature_length)
-
-        h_index = np.tile(np.arange(grid_h).reshape(1, -1, 1), (grid_t, 1, grid_w)).flatten()
-        w_index = np.tile(np.arange(grid_w).reshape(1, 1, -1), (grid_t, grid_h, 1)).flatten()
-        t_index_raw = np.arange(grid_t)
-        t_index_scaled = (t_index_raw * t_factor).astype(np.int64)
-        t_index = np.repeat(t_index_scaled, grid_h * grid_w)
-
-        video_pos = np.stack([t_index, h_index, w_index]) + start_idx
-        audio_pos = np.broadcast_to(np.arange(audio_len), (3, audio_len)) + start_idx
-
-        video_t_values = video_pos[0]
-        audio_t_values = audio_pos[0]
-
-        pos_ids_list: list[np.ndarray] = []
-        video_idx, audio_idx = 0, 0
-        num_video = grid_t * grid_h * grid_w
-
-        while video_idx < num_video and audio_idx < audio_len:
-            if video_t_values[video_idx] <= audio_t_values[audio_idx]:
-                pos_ids_list.append(video_pos[:, video_idx : video_idx + 1])
-                video_idx += 1
-            else:
-                pos_ids_list.append(audio_pos[:, audio_idx : audio_idx + 1])
-                audio_idx += 1
-
-        if video_idx < num_video:
-            pos_ids_list.append(video_pos[:, video_idx:])
-        if audio_idx < audio_len:
-            pos_ids_list.append(audio_pos[:, audio_idx:])
-
-        total_tokens = num_video + audio_len
-        return np.concatenate(pos_ids_list, axis=1), total_tokens
-
-    @classmethod
-    def get_speech_to_text_config(cls, model_config: ModelConfig, task_type: str) -> SpeechToTextConfig:
-        processor = cached_processor_from_config(model_config, processor_cls=Qwen3OmniMoeProcessor)
-        return SpeechToTextConfig(
-            max_audio_clip_s=processor.feature_extractor.chunk_length,
-            sample_rate=processor.feature_extractor.sampling_rate,
-            min_energy_split_window_size=None,
-        )
-
-    @classmethod
-    def get_generation_prompt(
-        cls,
-        audio: np.ndarray,
-        stt_config: SpeechToTextConfig,
-        model_config: ModelConfig,
-        language: str | None,
-        task_type: Literal["transcribe", "translate"],
-        request_prompt: str,
-        to_language: str | None,
-    ) -> PromptType:
-        """
-        Construct a transcription/translation prompt for Qwen3-Omni.
-        """
-        # Transcribe this audio [into <language>] | for transcription
-        # Translate this audio [from <language> into <to_language>] | for translation
-        instruction = "Transcribe" if task_type == "transcribe" else "Translate"
-        instruction += " this audio"
-
-        # Default to_language to English for translation
-        if task_type == "translate" and to_language is None:
-            to_language = "en"
-
-        # Get full language names from supported_languages mapping
-        full_lang_name = cls.supported_languages.get(language, "")
-        full_lang_name_to = cls.supported_languages.get(to_language, "")
-
-        if task_type == "transcribe" and full_lang_name:
-            instruction += f" into {full_lang_name}"
-        elif task_type == "translate":
-            if full_lang_name:
-                instruction += f" from {full_lang_name}"
-            if full_lang_name_to:
-                instruction += f" into {full_lang_name_to}"
-
-        instruction += "."
-
-        if request_prompt:
-            instruction += f" {request_prompt}"
-
-        processor = cached_processor_from_config(model_config, processor_cls=Qwen3OmniMoeProcessor)
-        # Audio placeholder format: <|audio_start|><|audio_pad|><|audio_end|>
-        audio_placeholder = "<|audio_start|><|audio_pad|><|audio_end|>"
-        user_content = f"{audio_placeholder}{instruction}"
-
-        messages = [{"role": "user", "content": user_content}]
-        prompt = processor.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True,
-        )
-
-        audio_data = (audio, stt_config.sample_rate)
-        prompts_dict = {"multi_modal_data": {"audio": audio_data}, "prompt": prompt}
-        return cast(PromptType, prompts_dict)
-
     def get_mrope_input_positions(
         self,
         input_tokens: list[int],
         mm_features: list[MultiModalFeatureSpec],
     ) -> tuple[torch.Tensor, int]:
-        """Compute M-RoPE input positions using mm_features directly."""
-        seq_len = len(input_tokens)
+        kwargs = MultiModalFeatureSpec.gather_kwargs(
+            mm_features,
+            {
+                "image_grid_thw",
+                "video_grid_thw",
+                "second_per_grid_ts",
+                "audio_feature_lengths",
+                "use_audio_in_video",
+            },
+        )
+        image_grid_thw = kwargs.get("image_grid_thw", [])
+        video_grid_thw = kwargs.get("video_grid_thw", [])
+        second_per_grid_ts = kwargs.get("second_per_grid_ts", [])
+        audio_feature_lengths = kwargs.get("audio_feature_lengths", [])
+        use_audio_in_video = any(kwargs.get("use_audio_in_video", []))
 
-        llm_pos_ids_list: list[np.ndarray] = []
-        st = 0
+        image_grid_thw = (torch.stack if image_grid_thw else torch.tensor)(image_grid_thw)
+        video_grid_thw = (torch.stack if video_grid_thw else torch.tensor)(video_grid_thw)
 
-        for offset, modality, data in self.iter_mm_features(mm_features):
-            text_len = offset - st
-            st_idx = int(llm_pos_ids_list[-1].max()) + 1 if llm_pos_ids_list else 0
+        input_ids = torch.tensor(input_tokens)
+        if input_ids is None or input_ids.ndim != 1:
+            raise ValueError("_omni3_get_input_positions_tensor expects 1D input_ids")
 
-            if text_len > 0:
-                llm_pos_ids_list.append(np.broadcast_to(np.arange(text_len), (3, text_len)) + st_idx)
-                st_idx += text_len
+        seq_len = input_ids.shape[0]
 
-            bos_pos = np.broadcast_to(np.array([st_idx]), (3, 1))
-            llm_pos_ids_list.append(bos_pos)
-            st_idx += 1
+        if isinstance(audio_feature_lengths, list):
+            audio_feature_lengths = torch.tensor(audio_feature_lengths, dtype=torch.long)
 
-            if modality == "audio":
-                audio_tokens = self._compute_audio_token_count(data["audio_feature_length"])
-                audio_pos = np.broadcast_to(np.arange(audio_tokens), (3, audio_tokens)) + st_idx
-                llm_pos_ids_list.append(audio_pos)
-                st_idx = int(audio_pos.max()) + 1
-
-                eos_pos = np.broadcast_to(np.array([st_idx]), (3, 1))
-                llm_pos_ids_list.append(eos_pos)
-                st = offset + 1 + audio_tokens + 1
-
-            elif modality == "image":
-                grid_t = data["grid_t"]
-                grid_h = data["grid_h"]
-                grid_w = data["grid_w"]
-                t_factor = data["t_factor"]
-
-                grid_indices = np.indices((grid_t, grid_h, grid_w))
-                if t_factor != 1.0:
-                    grid_indices[0] = (grid_indices[0] * t_factor).astype(np.int64)
-                llm_pos_ids_list.append(grid_indices.reshape(3, -1) + st_idx)
-
-                image_len = grid_t * grid_h * grid_w
-                st_idx = int(llm_pos_ids_list[-1].max()) + 1
-
-                eos_pos = np.broadcast_to(np.array([st_idx]), (3, 1))
-                llm_pos_ids_list.append(eos_pos)
-                st = offset + 1 + image_len + 1
-
-            elif modality == "video":
-                grid_t = data["grid_t"]
-                grid_h = data["grid_h"]
-                grid_w = data["grid_w"]
-                t_factor = data["t_factor"]
-
-                if not data["use_audio_in_video"]:
-                    grid_indices = np.indices((grid_t, grid_h, grid_w))
-                    if t_factor != 1.0:
-                        grid_indices[0] = (grid_indices[0] * t_factor).astype(np.int64)
-                    llm_pos_ids_list.append(grid_indices.reshape(3, -1) + st_idx)
-
-                    video_len = grid_t * grid_h * grid_w
-                    st_idx = int(llm_pos_ids_list[-1].max()) + 1
-
-                    eos_pos = np.broadcast_to(np.array([st_idx]), (3, 1))
-                    llm_pos_ids_list.append(eos_pos)
-                    st = offset + 1 + video_len + 1
-                else:
-                    audio_bos_pos = np.broadcast_to(np.array([st_idx - 1]), (3, 1))
-                    llm_pos_ids_list.append(audio_bos_pos)
+        if not len(second_per_grid_ts) and len(video_grid_thw):
+            second_per_grid_ts = 2.0
+            second_per_grids = torch.ones(len(video_grid_thw), dtype=torch.float32) * second_per_grid_ts
+        else:
+            second_per_grids = torch.tensor(second_per_grid_ts, dtype=torch.float32)
 
-                    pos_ids, _ = self._compute_interleaved_positions(st_idx, data)
-                    llm_pos_ids_list.append(pos_ids)
-                    st_idx = int(pos_ids.max()) + 1
+        config = self.config
+        spatial_merge_size = config.vision_config.spatial_merge_size
+        image_token_id = config.image_token_id
+        video_token_id = config.video_token_id
+        audio_token_id = config.audio_token_id
+        vision_start_token_id = config.vision_start_token_id
+        audio_start_token_id = config.audio_start_token_id
+        position_id_per_seconds = config.position_id_per_seconds
 
-                    eos_pos = np.broadcast_to(np.array([st_idx]), (3, 1))
-                    llm_pos_ids_list.append(eos_pos)
-                    llm_pos_ids_list.append(eos_pos)
+        vision_start_indices = torch.argwhere(input_ids == vision_start_token_id).squeeze(1)
+        if vision_start_indices.numel() > 0:
+            vision_tokens = input_ids[vision_start_indices + 1]
+        else:
+            vision_tokens = input_ids.new_empty((0,), dtype=input_ids.dtype)
+        audio_nums = torch.sum(input_ids == audio_start_token_id)
+        image_nums = (vision_tokens == image_token_id).sum()
+        video_nums = (
+            (vision_tokens == audio_start_token_id).sum()
+            if use_audio_in_video
+            else (vision_tokens == video_token_id).sum()
+        )
 
-                    video_len = grid_t * grid_h * grid_w
-                    audio_len = self._compute_audio_token_count(data["audio_feature_length"])
-                    st = offset + 2 + video_len + audio_len + 2
+        llm_pos_ids_list: list[torch.Tensor] = []
+        st = 0
+        image_idx = 0
+        video_idx = 0
+        audio_idx = 0
+        remain_images, remain_videos, remain_audios = image_nums, video_nums, audio_nums  # noqa: E501
+        multimodal_nums = image_nums + audio_nums if use_audio_in_video else image_nums + video_nums + audio_nums  # noqa: E501
+
+        for _ in range(multimodal_nums):
+            st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+            if (image_token_id in input_tokens or video_token_id in input_tokens) and (
+                remain_videos > 0 or remain_images > 0
+            ):
+                ed_vision_start = input_tokens.index(vision_start_token_id, st)
+            else:
+                ed_vision_start = len(input_tokens) + 1
+            if audio_token_id in input_tokens and remain_audios > 0:
+                ed_audio_start = input_tokens.index(audio_start_token_id, st)
+            else:
+                ed_audio_start = len(input_tokens) + 1
+            min_ed = min(ed_vision_start, ed_audio_start)
+
+            if min_ed == ed_audio_start:
+                text_len = min_ed - st
+                if text_len != 0:
+                    st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                    llm_pos_ids_list.append(torch.arange(text_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx)
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                bos_len = 1
+                llm_pos_ids_list.append(torch.arange(bos_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx)
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                _, audio_len = _get_feat_extract_output_lengths(audio_feature_lengths[audio_idx])
+                llm_pos_ids = torch.arange(audio_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx
+                llm_pos_ids_list.append(llm_pos_ids)
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                eos_len = 1
+                llm_pos_ids_list.append(torch.arange(eos_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx)
+                st += text_len + bos_len + audio_len + eos_len
+                audio_idx += 1
+                remain_audios -= 1
+            elif min_ed == ed_vision_start and input_ids[ed_vision_start + 1] == image_token_id:
+                text_len = min_ed - st
+                if text_len != 0:
+                    st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                    llm_pos_ids_list.append(torch.arange(text_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx)
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                bos_len = 1
+                llm_pos_ids_list.append(torch.arange(bos_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx)
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                grid_t = image_grid_thw[image_idx][0]
+                grid_hs = image_grid_thw[:, 1]
+                grid_ws = image_grid_thw[:, 2]
+                t_index = torch.arange(grid_t) * position_id_per_seconds
+                llm_pos_ids = get_llm_pos_ids_for_vision(
+                    st_idx, image_idx, spatial_merge_size, t_index, grid_hs, grid_ws
+                )
+                image_len = image_grid_thw[image_idx].prod() // (spatial_merge_size**2)
+                llm_pos_ids_list.append(llm_pos_ids)
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                eos_len = 1
+                llm_pos_ids_list.append(torch.arange(eos_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx)
+                st += text_len + bos_len + image_len + eos_len
+                image_idx += 1
+                remain_images -= 1
+            elif (
+                min_ed == ed_vision_start
+                and input_ids[ed_vision_start + 1] == video_token_id
+                and not use_audio_in_video
+            ):
+                text_len = min_ed - st
+                if text_len != 0:
+                    st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                    llm_pos_ids_list.append(torch.arange(text_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx)
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                bos_len = 1
+                llm_pos_ids_list.append(torch.arange(bos_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx)
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                grid_t = video_grid_thw[video_idx][0]
+                grid_hs = video_grid_thw[:, 1]
+                grid_ws = video_grid_thw[:, 2]
+                t_index = torch.arange(grid_t) * float(second_per_grids[video_idx].item()) * position_id_per_seconds
+                llm_pos_ids = get_llm_pos_ids_for_vision(
+                    st_idx, video_idx, spatial_merge_size, t_index, grid_hs, grid_ws
+                )
+                video_len = video_grid_thw[video_idx].prod() // (spatial_merge_size**2)
+                llm_pos_ids_list.append(llm_pos_ids)
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                eos_len = 1
+                llm_pos_ids_list.append(torch.arange(eos_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx)
+                st += text_len + bos_len + video_len + eos_len
+                video_idx += 1
+                remain_videos -= 1
+            elif min_ed == ed_vision_start and ed_vision_start + 1 == ed_audio_start and use_audio_in_video:
+                text_len = min_ed - st
+                if text_len != 0:
+                    st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                    llm_pos_ids_list.append(torch.arange(text_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx)
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                bos_len = 1
+                bos_block = torch.arange(bos_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx
+                llm_pos_ids_list.append(bos_block)
+                llm_pos_ids_list.append(bos_block)
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                _, audio_len = _get_feat_extract_output_lengths(audio_feature_lengths[audio_idx])
+                audio_llm_pos_ids = torch.arange(audio_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx
+                grid_t = video_grid_thw[video_idx][0]
+                grid_hs = video_grid_thw[:, 1]
+                grid_ws = video_grid_thw[:, 2]
+                t_index = torch.arange(grid_t) * float(second_per_grids[video_idx].item()) * position_id_per_seconds
+                video_llm_pos_ids = get_llm_pos_ids_for_vision(
+                    st_idx, video_idx, spatial_merge_size, t_index, grid_hs, grid_ws
+                )
+                video_data_index, audio_data_index = 0, 0
+                while video_data_index < video_llm_pos_ids.shape[-1] and audio_data_index < audio_llm_pos_ids.shape[-1]:
+                    if video_llm_pos_ids[0][video_data_index] <= audio_llm_pos_ids[0][audio_data_index]:
+                        llm_pos_ids_list.append(video_llm_pos_ids[:, video_data_index : video_data_index + 1])
+                        video_data_index += 1
+                    else:
+                        llm_pos_ids_list.append(audio_llm_pos_ids[:, audio_data_index : audio_data_index + 1])
+                        audio_data_index += 1
+                if video_data_index < video_llm_pos_ids.shape[-1]:
+                    llm_pos_ids_list.append(video_llm_pos_ids[:, video_data_index : video_llm_pos_ids.shape[-1]])
+                if audio_data_index < audio_llm_pos_ids.shape[-1]:
+                    llm_pos_ids_list.append(audio_llm_pos_ids[:, audio_data_index : audio_llm_pos_ids.shape[-1]])
+                video_len = video_grid_thw[video_idx].prod() // (spatial_merge_size**2)
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                eos_len = 1
+                eos_block = torch.arange(eos_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx
+                llm_pos_ids_list.append(eos_block)
+                llm_pos_ids_list.append(eos_block)
+                st += text_len + bos_len * 2 + audio_len + video_len + eos_len * 2  # noqa: E501
+                audio_idx += 1
+                video_idx += 1
+                remain_videos -= 1
+                remain_audios -= 1
 
-        if st < seq_len:
-            st_idx = int(llm_pos_ids_list[-1].max()) + 1 if llm_pos_ids_list else 0
-            text_len = seq_len - st
-            llm_pos_ids_list.append(np.broadcast_to(np.arange(text_len), (3, text_len)) + st_idx)
+        if st < len(input_tokens):
+            st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+            text_len = len(input_tokens) - st
+            llm_pos_ids_list.append(torch.arange(text_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx)
 
-        llm_positions = np.concatenate(llm_pos_ids_list, axis=1).reshape(3, -1)
+        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
         if llm_positions.shape[1] != seq_len:
             raise RuntimeError("Position ids length mismatch with input ids length")
 
-        mrope_position_delta = int(llm_positions.max()) + 1 - seq_len
-        return torch.from_numpy(llm_positions), mrope_position_delta
-
-    def get_mm_mapping(self) -> MultiModelKeys:
-        """
-        Get the module prefix in multimodal models
-        """
-        return MultiModelKeys.from_string_field(
-            language_model="language_model",
-            connector="visual.merger",
-            tower_model=["visual.", "audio_tower."],
-        )
+        mrope_position_delta = llm_positions.max() + 1 - seq_len
+        return llm_positions, mrope_position_delta
diff --git a/vllm_omni/model_executor/models/registry.py b/vllm_omni/model_executor/models/registry.py
index 3407b428695..99b79eeab7e 100644
--- a/vllm_omni/model_executor/models/registry.py
+++ b/vllm_omni/model_executor/models/registry.py
@@ -1,8 +1,4 @@
-from vllm.model_executor.models.registry import (
-    _VLLM_MODELS,
-    _LazyRegisteredModel,
-    _ModelRegistry,
-)
+from vllm.model_executor.models.registry import _VLLM_MODELS, _LazyRegisteredModel, _ModelRegistry
 
 _OMNI_MODELS = {
     "Qwen2_5OmniForConditionalGeneration": (
@@ -57,11 +53,6 @@
         "cosyvoice3",
         "CosyVoice3Model",
     ),
-    "OmniVoiceModel": (
-        "omnivoice",
-        "omnivoice",
-        "OmniVoiceModel",
-    ),
     "MammothModa2Qwen2ForCausalLM": (
         "mammoth_moda2",
         "mammoth_moda2",
@@ -174,14 +165,26 @@
         "dynin_omni",
         "DyninOmniForConditionalGeneration",
     ),
+    ## HyperCLOVAX-SEED-Omni-8B
+    # stage 0 (thinker LLM) — stages 1/2 use DiffusionModelRegistry via model_class_name
+    "HCXVisionV2ForCausalLM": (
+        "hcx_omni",
+        "hcx_omni",
+        "HCXOmniForConditionalGeneration",
+    ),
+    "HCXOmniForCausalLM": (
+        "hcx_omni",
+        "hcx_omni",
+        "HCXOmniForConditionalGeneration",
+    ),
 }
 
-
 _VLLM_OMNI_MODELS = {
     **_VLLM_MODELS,
     **_OMNI_MODELS,
 }
 
+
 OmniModelRegistry = _ModelRegistry(
     {
         **{
diff --git a/vllm_omni/model_executor/stage_configs/hcx_omni.yaml b/vllm_omni/model_executor/stage_configs/hcx_omni.yaml
new file mode 100644
index 00000000000..6a76fa2791d
--- /dev/null
+++ b/vllm_omni/model_executor/stage_configs/hcx_omni.yaml
@@ -0,0 +1,102 @@
+runtime:
+  connectors:
+    shared_memory_connector:
+      extra:
+        shm_threshold_bytes: 65536
+      name: SharedMemoryConnector
+  defaults:
+    max_inflight: 8
+    window_size: -1
+  edges:
+  - from: 0
+    to: 1
+    window_size: -1
+  - from: 0
+    to: 2
+    window_size: -1
+  - from: 1
+    to: 2
+    window_size: -1
+  enabled: true
+stage_args:
+- default_sampling_params:
+    detokenize: true
+    max_tokens: 2048
+    repetition_penalty: 1.0
+    seed: 42
+    temperature: 0.1
+    top_k: -1
+    top_p: 1.0
+  engine_args:
+    enable_prefix_caching: false
+    enforce_eager: true
+    engine_output_type: latent
+    gpu_memory_utilization: 0.15
+    limit_mm_per_prompt:
+      audio: 1
+      image: 1
+    max_model_len: 8192
+    max_num_seqs: 8
+    model_arch: HCXVisionV2ForCausalLM
+    model_stage: thinker
+    scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
+    tensor_parallel_size: 4
+    trust_remote_code: true
+    worker_type: ar
+  final_output: true
+  final_output_type: text
+  is_comprehension: true
+  runtime:
+    devices: 0,1,2,3
+    max_batch_size: 8
+    process: true
+  stage_id: 0
+  stage_type: llm
+- custom_process_input_func: vllm_omni.model_executor.stage_input_processors.hyperclovax_seed_omni.thinker2vision_decoder
+  default_sampling_params:
+    guidance_scale: 0.0
+    height: 768
+    num_inference_steps: 50
+    seed: 42
+    width: 768
+  engine_args:
+    distributed_executor_backend: mp
+    enforce_eager: true
+    engine_output_type: image
+    gpu_memory_utilization: 0.75
+    model_class_name: HyperCLOVAXVisionPipeline
+    model_stage: decoder/vision
+    model_subdir: decoder/vision
+    trust_remote_code: true
+  engine_input_source:
+  - 0
+  final_output: true
+  final_output_type: image
+  runtime:
+    devices: '4'
+    max_batch_size: 1
+    process: true
+  stage_id: 1
+  stage_type: diffusion
+- custom_process_input_func: vllm_omni.model_executor.stage_input_processors.hyperclovax_seed_omni.thinker2audio_decoder
+  default_sampling_params:
+    seed: 42
+  engine_args:
+    distributed_executor_backend: mp
+    enforce_eager: true
+    engine_output_type: audio
+    gpu_memory_utilization: 0.4
+    model_class_name: HyperCLOVAXAudioPipeline
+    model_stage: decoder/audio
+    model_subdir: decoder/audio/NCZSCosybigvganDecoder.mar
+    trust_remote_code: true
+  engine_input_source:
+  - 0
+  final_output: true
+  final_output_type: audio
+  runtime:
+    devices: '5'
+    max_batch_size: 1
+    process: true
+  stage_id: 2
+  stage_type: diffusion
diff --git a/vllm_omni/model_executor/stage_input_processors/hyperclovax_seed_omni.py b/vllm_omni/model_executor/stage_input_processors/hyperclovax_seed_omni.py
new file mode 100644
index 00000000000..e066b03403a
--- /dev/null
+++ b/vllm_omni/model_executor/stage_input_processors/hyperclovax_seed_omni.py
@@ -0,0 +1,145 @@
+"""Stage input processors for HyperCLOVAX-SEED-Omni-8B pipeline.
+
+The thinker generates a mixed token sequence containing:
+  - Regular text tokens (< 128606)
+  - Discrete audio tokens (128606 ~ 135166, 6561 codes)
+  - Discrete vision tokens (135168 ~ 135168+65535, 65536 codes)
+
+These processors extract the relevant discrete tokens and route them
+to the appropriate decoder stage.
+"""
+
+from vllm.inputs import TextPrompt
+
+from vllm_omni.inputs.data import OmniTokensPrompt
+
+# Token ID boundaries from config.json
+DISCRETE_AUDIO_UNIT_0_ID = 128606
+DISCRETE_IMAGE_UNIT_0_ID = 135168
+DISCRETE_AUDIO_VOCAB_SIZE = 6561  # CosyVoice2 FSQ codebook
+DISCRETE_IMAGE_VOCAB_SIZE = 65536  # TA-Tok SimVQ codebook (2^16)
+DISCRETE_IMAGE_TOKEN_LENGTH = 729  # 27x27 latent tokens per image
+
+
+def _extract_discrete_tokens(token_ids: list[int], start_id: int, vocab_size: int) -> list[int]:
+    """Extract and remap discrete tokens from a mixed token sequence.
+
+    Returns tokens remapped to [0, vocab_size) range.
+    """
+    return [tid - start_id for tid in token_ids if start_id <= tid < start_id + vocab_size]
+
+
+def thinker2vision_decoder(
+    stage_list,
+    engine_input_source,
+    prompt: OmniTokensPrompt | TextPrompt = None,
+    requires_multimodal_data: bool = False,
+):
+    """Extract discrete vision tokens from thinker output → vision decoder.
+
+    The vision decoder (HyperCLOVAXVisionPipeline) takes 729 (27x27) discrete
+    codes per image and converts them to pixel images via diffusion.
+    """
+    if not engine_input_source:
+        raise ValueError("engine_input_source cannot be empty")
+
+    source_stage_id = engine_input_source[0]
+    thinker_outputs = stage_list[source_stage_id].engine_outputs
+    if thinker_outputs is None:
+        raise RuntimeError(f"Stage {source_stage_id} has no outputs yet")
+
+    vision_decoder_inputs = []
+    for thinker_output in thinker_outputs:
+        # Text-only iterations can produce an empty outputs list.
+        if not thinker_output.outputs:
+            continue
+        output = thinker_output.outputs[0]
+        output_token_ids = list(output.token_ids)
+        vision_codes = _extract_discrete_tokens(output_token_ids, DISCRETE_IMAGE_UNIT_0_ID, DISCRETE_IMAGE_VOCAB_SIZE)
+
+        if not vision_codes:
+            continue
+
+        # Truncate/pad to exact DISCRETE_IMAGE_TOKEN_LENGTH (27x27=729).
+        # The LLM may generate slightly more or fewer tokens than expected;
+        # the vision decoder rearranges as (h w) → (h, w) so the length must be
+        # a perfect square == DISCRETE_IMAGE_TOKEN_LENGTH.
+        vision_codes = vision_codes[:DISCRETE_IMAGE_TOKEN_LENGTH]
+        vision_codes += [0] * (DISCRETE_IMAGE_TOKEN_LENGTH - len(vision_codes))
+
+        # Pipeline expects vision_tokens key in req.extra
+        vision_decoder_inputs.append(
+            OmniTokensPrompt(
+                prompt_token_ids=vision_codes,
+                additional_information={
+                    "request_id": thinker_output.request_id,
+                    "vision_tokens": vision_codes,
+                    "num_images": 1,
+                },
+                multi_modal_data=None,
+                mm_processor_kwargs=None,
+            )
+        )
+
+    return vision_decoder_inputs
+
+
+def thinker2audio_decoder(
+    stage_list,
+    engine_input_source,
+    prompt: OmniTokensPrompt | TextPrompt = None,
+    requires_multimodal_data: bool = False,
+):
+    """Extract discrete audio tokens from thinker output → audio decoder.
+
+    The audio decoder (Unit-BigVGAN) takes discrete audio codes (6561 vocab)
+    and converts them to 24kHz waveforms.
+    """
+    if not engine_input_source:
+        raise ValueError("engine_input_source cannot be empty")
+
+    source_stage_id = engine_input_source[0]
+    thinker_outputs = stage_list[source_stage_id].engine_outputs
+    if thinker_outputs is None:
+        raise RuntimeError(f"Stage {source_stage_id} has no outputs yet")
+
+    audio_decoder_inputs = []
+    for thinker_output in thinker_outputs:
+        # Text-only iterations can produce an empty outputs list.
+        if not thinker_output.outputs:
+            continue
+        output = thinker_output.outputs[0]
+        output_token_ids = list(output.token_ids)
+
+        audio_codes = _extract_discrete_tokens(output_token_ids, DISCRETE_AUDIO_UNIT_0_ID, DISCRETE_AUDIO_VOCAB_SIZE)
+
+        if not audio_codes:
+            continue
+
+        # Pipeline expects audio_tokens as list[list[int]] (batch) and speakers
+        # as list[str]; no "formats" entry is passed from here. ref_audio_tokens
+        # is optional, for zero-shot TTS (ECAPA-TDNN speaker embedding): it holds
+        # the raw base64 audio from the user's input message ("ref_audio_b64",
+        # injected by serving_chat.py into the engine_prompt dict), or None.
+        _ref = None
+        if isinstance(prompt, dict):
+            _ref = prompt.get("ref_audio_b64")
+        elif isinstance(prompt, list) and prompt:
+            _p = prompt[0]
+            if isinstance(_p, dict):
+                _ref = _p.get("ref_audio_b64")
+        audio_decoder_inputs.append(
+            OmniTokensPrompt(
+                prompt_token_ids=audio_codes,
+                additional_information={
+                    "request_id": thinker_output.request_id,
+                    "audio_tokens": [audio_codes],
+                    "speakers": ["fkms"],
+                    "ref_audio_tokens": [_ref],
+                },
+                multi_modal_data=None,
+                mm_processor_kwargs=None,
+            )
+        )
+
+    return audio_decoder_inputs
diff --git a/vllm_omni/worker/gpu_ar_model_runner.py b/vllm_omni/worker/gpu_ar_model_runner.py
index f37b2224efb..4278478e5dc 100644
--- a/vllm_omni/worker/gpu_ar_model_runner.py
+++ b/vllm_omni/worker/gpu_ar_model_runner.py
@@ -6,38 +6,29 @@
 
 from __future__ import annotations
 
-from contextlib import nullcontext
 from copy import copy
-from dataclasses import replace
 from typing import Any, NamedTuple
 
 import numpy as np
 import torch
 from vllm.config import CUDAGraphMode
-from vllm.distributed.ec_transfer import get_ec_transfer, has_ec_transfer
-from vllm.distributed.kv_transfer import get_kv_transfer_group, has_kv_transfer_group
-from vllm.distributed.parallel_state import get_pp_group, get_tp_group
 from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
-from vllm.model_executor.layers.fused_moe.routed_experts_capturer import (
-    RoutedExpertsCapturer,
-)
 from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
-from vllm.v1.outputs import AsyncModelRunnerOutput, make_empty_encoder_model_runner_output
-from vllm.v1.spec_decode.draft_model import DraftModelProposer
+from vllm.v1.outputs import AsyncModelRunnerOutput
 from vllm.v1.spec_decode.eagle import EagleProposer
-from vllm.v1.spec_decode.extract_hidden_states import ExtractHiddenStatesProposer
 from vllm.v1.structured_output.utils import apply_grammar_bitmask
 from vllm.v1.utils import record_function_or_nullcontext
 from vllm.v1.worker.gpu_model_runner import (
     EMPTY_MODEL_RUNNER_OUTPUT,
     AsyncGPUModelRunnerOutput,
     IntermediateTensors,
+    get_pp_group,
+    get_tp_group,
+    has_kv_transfer_group,
 )
-from vllm.v1.worker.ubatch_utils import maybe_create_ubatch_slices
 from vllm.v1.worker.utils import is_residual_scattered_for_sp
 
-from vllm_omni.distributed.omni_connectors.kv_transfer_manager import OmniKVTransferManager
 from vllm_omni.outputs import OmniModelRunnerOutput
 from vllm_omni.utils.mm_outputs import build_mm_cpu, to_payload_element
 from vllm_omni.worker.gpu_model_runner import OmniGPUModelRunner
@@ -55,11 +46,7 @@ class ExecuteModelState(NamedTuple):
     sample_hidden_states: torch.Tensor
     aux_hidden_states: list[torch.Tensor] | None
     ec_connector_output: Any
-    cudagraph_stats: Any
-    # OMNI: multimodal_outputs field for omni-specific multimodal handling
     multimodal_outputs: Any
-    # slot_mappings for attention/drafter (aligned with upstream v1 API)
-    slot_mappings: dict[str, torch.Tensor] | list[dict[str, torch.Tensor]] | None = None
 
 
 class GPUARModelRunner(OmniGPUModelRunner, OmniConnectorModelRunnerMixin):
@@ -77,8 +64,6 @@ def __init__(self, *args, **kwargs):
         # each model stage has their own hidden size
         self.hidden_size = self.model_config.hf_text_config.hidden_size
         self.inputs_embeds = self._make_buffer(self.max_num_tokens, self.hidden_size, dtype=self.dtype, numpy=False)
-        # Initialize KV cache manager (preserve vllm_config fallback behavior)
-        self.kv_transfer_manager = OmniKVTransferManager.from_vllm_config(self.vllm_config, self.model_config)
 
     def _make_buffer(self, *size, dtype, numpy=True):
         # Prevent ray from pinning the buffer due to large size
@@ -265,52 +250,28 @@ def execute_model(
         scheduler_output: SchedulerOutput,
         intermediate_tensors: IntermediateTensors | None = None,
     ) -> OmniModelRunnerOutput | AsyncModelRunnerOutput | IntermediateTensors | None:
-        if self.execute_model_state is not None:
-            raise RuntimeError("State error: sample_tokens() must be called after execute_model() returns None.")
-
-        if not getattr(self, "_warmup_state_cleared", False):
-            self._warmup_state_cleared = True
-            if hasattr(self.model, "_clear_warmup_state"):
-                self.model._clear_warmup_state()
-
-        # [Omni] Handle KV transfer BEFORE updating states (which removes finished requests)
-        finished_reqs = getattr(scheduler_output, "finished_requests_needing_kv_transfer", {})
-        if finished_reqs and hasattr(self.model, "get_kv_transfer_metadata"):
-            for req_id, data in finished_reqs.items():
-                try:
-                    req_idx = self.input_batch.req_id_to_index.get(req_id)
-                    num_computed = (
-                        int(self.input_batch.num_computed_tokens_cpu[req_idx]) if req_idx is not None else None
-                    )
-                    model_meta = self.model.get_kv_transfer_metadata(
-                        req_id,
-                        num_computed_tokens=num_computed,
+        with record_function_or_nullcontext("Preprocess"):
+            with self.synchronize_input_prep():
+                self._update_states(scheduler_output)
+                self._decode_and_store_request_payloads(scheduler_output)
+
+                if not scheduler_output.total_num_scheduled_tokens:
+                    if not has_kv_transfer_group():
+                        return EMPTY_MODEL_RUNNER_OUTPUT
+                    return self.kv_connector_no_forward(scheduler_output, self.vllm_config)
+                if self.cache_config.kv_sharing_fast_prefill:
+                    assert not self.input_batch.num_prompt_logprobs, (
+                        "--kv-sharing-fast-prefill produces incorrect "
+                        "logprobs for prompt tokens, tokens, please disable "
+                        "it when the requests need prompt logprobs"
                     )
-                    if model_meta:
-                        existing = data.get("custom_metadata") or {}
-                        existing.update(model_meta)
-                        data["custom_metadata"] = existing
-                except Exception as e:
-                    logger.warning(f"Failed to get custom metadata from model for {req_id}: {e}")
-        self.kv_extracted_req_ids = self.kv_transfer_manager.handle_finished_requests_kv_transfer(
-            finished_reqs=finished_reqs,
-            kv_caches=self.kv_caches,
-            block_size=self.cache_config.block_size,
-            cache_dtype=str(self.cache_config.cache_dtype),
-            request_id_resolver=self._resolve_global_request_id,
-        )
-
-        if self.routed_experts_initialized:
-            capturer = RoutedExpertsCapturer.get_instance()
-            if capturer is not None:
-                capturer.clear_buffer()  # noqa
-            else:
-                logger.error("RoutedExpertsCapturer not initialized.")
 
-        if has_kv_transfer_group():
-            kv_connector_metadata = scheduler_output.kv_connector_metadata
-            if kv_connector_metadata is not None:
-                get_kv_transfer_group().handle_preemptions(kv_connector_metadata)
+                num_reqs = self.input_batch.num_reqs
+                req_ids = self.input_batch.req_ids
+                tokens = [scheduler_output.num_scheduled_tokens[i] for i in req_ids]
+                num_scheduled_tokens_np = np.array(tokens, dtype=np.int32)
+                max_num_scheduled_tokens = int(num_scheduled_tokens_np.max())
+                num_tokens_unpadded = scheduler_output.total_num_scheduled_tokens
 
         num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
         with (
@@ -383,77 +344,39 @@ def execute_model(
                 num_scheduled_tokens_np,
             )
 
-            cascade_attn_prefix_lens = None
-            # Disable cascade attention when using microbatching (DBO)
-            if self.cascade_attn_enabled and not self.parallel_config.use_ubatching:
-                # Pre-compute cascade attention prefix lengths
-                cascade_attn_prefix_lens = self._compute_cascade_attn_prefix_lens(
-                    num_scheduled_tokens_np,
-                    self.input_batch.num_computed_tokens_cpu[:num_reqs],
-                    scheduler_output.num_common_prefix_blocks,
+                (
+                    cudagraph_mode,
+                    batch_desc,
+                    ubatch_slices,
+                    num_tokens_across_dp,
+                ) = self._determine_batch_execution_and_padding(
+                    num_tokens=num_tokens_unpadded,
+                    num_reqs=num_reqs,
+                    num_scheduled_tokens_np=num_scheduled_tokens_np,
+                    max_num_scheduled_tokens=max_num_scheduled_tokens,
+                    use_cascade_attn=False,
                 )
 
-            (
-                cudagraph_mode,
-                batch_desc,
-                should_ubatch,
-                num_tokens_across_dp,
-                cudagraph_stats,
-            ) = self._determine_batch_execution_and_padding(
-                num_tokens=num_tokens_unpadded,
-                num_reqs=num_reqs,
-                num_scheduled_tokens_np=num_scheduled_tokens_np,
-                max_num_scheduled_tokens=max_num_scheduled_tokens,
-                use_cascade_attn=cascade_attn_prefix_lens is not None,
-                num_encoder_reqs=len(scheduler_output.scheduled_encoder_inputs),
-            )
-
-            num_tokens_padded = batch_desc.num_tokens
-            num_reqs_padded = batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs
-            ubatch_slices, ubatch_slices_padded = maybe_create_ubatch_slices(
-                should_ubatch,
-                num_scheduled_tokens_np,
-                num_tokens_padded,
-                num_reqs_padded,
-                self.parallel_config.num_ubatches,
-            )
-
-            pad_attn = cudagraph_mode == CUDAGraphMode.FULL
-
-            use_spec_decode = len(scheduler_output.scheduled_spec_decode_tokens) > 0
-            ubatch_slices_attn = ubatch_slices_padded if pad_attn else ubatch_slices
-
-            # True if any attention backend handles KV cache update separately
-            # from forward() (i.e., forward_includes_kv_cache_update=False). When true,
-            # slot_mappings must use padded dimensions to match the key/value tensors.
-            from vllm.v1.kv_cache_interface import EncoderOnlyAttentionSpec
-
-            has_separate_kv_update = not all(
-                all(g.backend.forward_includes_kv_cache_update for g in self.attn_groups[id])
-                for id, spec in enumerate(self.kv_cache_config.kv_cache_groups)
-                if not isinstance(spec.kv_cache_spec, EncoderOnlyAttentionSpec)
-            )
-
-            slot_mappings_by_group, slot_mappings = self._get_slot_mappings(
-                num_tokens_padded=num_tokens_padded if pad_attn or has_separate_kv_update else num_tokens_unpadded,
-                num_reqs_padded=(num_reqs_padded if pad_attn or has_separate_kv_update else num_reqs),
-                num_tokens_unpadded=num_tokens_unpadded,
-                ubatch_slices=ubatch_slices_padded,
-            )
+                num_tokens_padded = batch_desc.num_tokens
+                num_reqs_padded = batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs
+                use_spec_decode = len(scheduler_output.scheduled_spec_decode_tokens) > 0
+                pad_attn = cudagraph_mode == CUDAGraphMode.FULL
 
-            attn_metadata, spec_decode_common_attn_metadata = self._build_attention_metadata(
-                num_tokens=num_tokens_unpadded,
-                num_tokens_padded=num_tokens_padded if pad_attn else None,
-                num_reqs=num_reqs,
-                num_reqs_padded=num_reqs_padded if pad_attn else None,
-                max_query_len=max_num_scheduled_tokens,
-                ubatch_slices=ubatch_slices_attn,
-                logits_indices=logits_indices,
-                use_spec_decode=use_spec_decode,
-                num_scheduled_tokens=scheduler_output.num_scheduled_tokens,
-                cascade_attn_prefix_lens=cascade_attn_prefix_lens,
-                slot_mappings=slot_mappings_by_group,
-            )
+                (
+                    attn_metadata,
+                    spec_decode_common_attn_metadata,
+                ) = self._build_attention_metadata(
+                    num_tokens=num_tokens_unpadded,
+                    num_tokens_padded=num_tokens_padded if pad_attn else None,
+                    num_reqs=num_reqs,
+                    num_reqs_padded=num_reqs_padded if pad_attn else None,
+                    max_query_len=max_num_scheduled_tokens,
+                    ubatch_slices=ubatch_slices,
+                    logits_indices=logits_indices,
+                    use_spec_decode=use_spec_decode,
+                    num_scheduled_tokens=scheduler_output.num_scheduled_tokens,
+                    cascade_attn_prefix_lens=None,
+                )
 
             (
                 input_ids,
@@ -462,36 +385,17 @@ def execute_model(
                 intermediate_tensors,
                 model_kwargs,
                 ec_connector_output,
-            ) = self._preprocess(scheduler_output, num_tokens_padded, intermediate_tensors)
-
-        # Let the model adjust inputs before forward (e.g. restore input_ids
-        # for multimodal position detection, fix decode position offsets).
-        if hasattr(self.model, "prepare_runner_inputs"):
-            input_ids, positions = self.model.prepare_runner_inputs(
-                input_ids=input_ids,
-                positions=positions,
-                inputs_embeds=inputs_embeds,
-                req_ids=req_ids[:num_reqs],
-                num_computed_tokens=[int(self.input_batch.num_computed_tokens_cpu[i]) for i in range(num_reqs)],
-                num_scheduled_tokens=[int(num_scheduled_tokens_np[i]) for i in range(num_reqs)],
-                input_ids_buffer=self.input_ids.gpu[:num_tokens_padded],
+            ) = self._preprocess(
+                scheduler_output,
+                num_tokens_padded,
+                intermediate_tensors,
             )
 
-        # Set cudagraph mode to none if calc_kv_scales is true.
-        # KV scales calculation involves dynamic operations that are incompatible
-        # with CUDA graph capture.
         if self.calculate_kv_scales:
             cudagraph_mode = CUDAGraphMode.NONE
-            # Mark KV scales as calculated after the first forward pass
             self.calculate_kv_scales = False
 
-        # Run the model.
-        # Use persistent buffers for CUDA graphs.
-        # When spec decode is enabled, defer connector finalization
-        # (wait_for_save + clear metadata) until after draft model runs.
-        defer_kv_connector_finalize = self.speculative_config is not None
         with (
-            nullcontext(),
             set_forward_context(
                 attn_metadata,
                 self.vllm_config,
@@ -499,14 +403,10 @@ def execute_model(
                 num_tokens_across_dp=num_tokens_across_dp,
                 cudagraph_runtime_mode=cudagraph_mode,
                 batch_descriptor=batch_desc,
-                ubatch_slices=ubatch_slices_padded,
-                slot_mapping=slot_mappings,  # OMNI: required for KV cache operations
+                ubatch_slices=ubatch_slices,
             ),
-            record_function_or_nullcontext("gpu_model_runner: forward"),
-            self.maybe_get_kv_connector_output(
-                scheduler_output,
-                defer_finalize=defer_kv_connector_finalize,
-            ) as kv_connector_output,
+            record_function_or_nullcontext("Forward"),
+            self.maybe_get_kv_connector_output(scheduler_output) as kv_connector_output,
         ):
             model_output = self._model_forward(
                 input_ids=input_ids,
@@ -519,10 +419,6 @@ def execute_model(
                 sampler=self.sampler,
             )
 
-            # [Omni] Map pending ropes metadata to req_ids.
-            if hasattr(self.model, "flush_pending_metadata"):
-                self.model.flush_pending_metadata(list(req_ids))
-
         with record_function_or_nullcontext("gpu_model_runner: postprocess"):
             if self.use_aux_hidden_state_outputs:
                 # True when EAGLE 3 is used.
@@ -532,7 +428,18 @@ def execute_model(
                 hidden_states = model_output
                 aux_hidden_states = None
 
-            hidden_states, multimodal_outputs = self.extract_multimodal_outputs(model_output)
+            multimodal_outputs = model_output.multimodal_outputs
+            hidden_states = model_output.text_hidden_states
+
+            if multimodal_outputs is not None:
+                keys_or_type = (
+                    list(multimodal_outputs.keys())
+                    if isinstance(multimodal_outputs, dict)
+                    else type(multimodal_outputs)
+                )
+                logger.debug(f"[AR] execute_model: multimodal_outputs keys = {keys_or_type}")
+            else:
+                logger.debug("[AR] execute_model: multimodal_outputs is None")
 
             # Cache hidden states & multimodal outputs if we've enabled hidden state
             # prefix caching unless this isn't the last pipeline parallelism rank.
@@ -544,36 +451,27 @@ def execute_model(
             )
 
             if not self.broadcast_pp_output:
-                # Common case.
                 if not get_pp_group().is_last_rank:
-                    # Return the intermediate tensors.
                     assert isinstance(hidden_states, IntermediateTensors)
                     hidden_states.kv_connector_output = kv_connector_output
-                    self.kv_connector_output = kv_connector_output
                     return hidden_states
 
                 if self.is_pooling_model:
-                    # Return the pooling output.
-                    return self._pool(
+                    output = self._pool(
                         hidden_states,
-                        num_scheduled_tokens,
+                        num_tokens_padded,
                         num_scheduled_tokens_np,
-                        kv_connector_output,
                     )
+                    output.kv_connector_output = kv_connector_output
+                    return output
 
                 sample_hidden_states = hidden_states[logits_indices]
-                # Try with sampling_metadata first; fall back to without for models that don't support it
-                try:
-                    logits = self.model.compute_logits(
-                        sample_hidden_states, sampling_metadata=self.input_batch.sampling_metadata
-                    )
-                except TypeError:
-                    logits = self.model.compute_logits(sample_hidden_states)
+                logits = self.model.compute_logits(
+                    sample_hidden_states, sampling_metadata=self.input_batch.sampling_metadata
+                )
             else:
-                # Rare case.
                 assert not self.is_pooling_model
 
-                sample_hidden_states = hidden_states[logits_indices]
                 if not get_pp_group().is_last_rank:
                     all_gather_tensors = {
                         "residual": not is_residual_scattered_for_sp(self.vllm_config, num_tokens_padded)
@@ -585,13 +483,10 @@ def execute_model(
                     )
                     logits = None
                 else:
-                    # Try with sampling_metadata first; fall back to without for models that don't support it
-                    try:
-                        logits = self.model.compute_logits(
-                            sample_hidden_states, sampling_metadata=self.input_batch.sampling_metadata
-                        )
-                    except TypeError:
-                        logits = self.model.compute_logits(sample_hidden_states)
+                    sample_hidden_states = hidden_states[logits_indices]
+                    logits = self.model.compute_logits(
+                        sample_hidden_states, sampling_metadata=self.input_batch.sampling_metadata
+                    )
 
                 model_output_broadcast_data: dict[str, Any] = {}
                 if logits is not None:
@@ -612,15 +507,9 @@ def execute_model(
             sample_hidden_states,
             aux_hidden_states,
             ec_connector_output,
-            cudagraph_stats,
             multimodal_outputs,
-            slot_mappings,  # OMNI: pass slot_mappings for drafter
         )
         self.kv_connector_output = kv_connector_output
-
-        if deferred_state_corrections_fn:
-            deferred_state_corrections_fn()
-
         return None
 
     def _sample(
@@ -678,8 +567,8 @@ def sample_tokens(
         self,
         grammar_output: GrammarOutput | None,
     ) -> OmniModelRunnerOutput | AsyncModelRunnerOutput | IntermediateTensors:
-        kv_extracted_req_ids = getattr(self, "kv_extracted_req_ids", None)
-        self.kv_extracted_req_ids = None
+        kv_connector_output = self.kv_connector_output
+        self.kv_connector_output = None
 
         # Used for prefix cache
         combined_hidden_states = None
@@ -689,22 +578,14 @@ def sample_tokens(
         mm_cpu = {}
 
         if self.execute_model_state is None:
-            kv_connector_output = self.kv_connector_output
-            self.kv_connector_output = None
-            # Nothing to do (PP non-final rank case), output isn't used.
             if not kv_connector_output:
                 return None  # type: ignore[return-value]
-
-            # In case of PP with kv transfer, we need to pass through the
-            # kv_connector_output
             if kv_connector_output.is_empty():
                 return EMPTY_MODEL_RUNNER_OUTPUT
-
             output = copy(EMPTY_MODEL_RUNNER_OUTPUT)
             output.kv_connector_output = kv_connector_output
             return output
 
-        # Unpack ephemeral state.
         (
             scheduler_output,
             logits,
@@ -714,33 +595,17 @@ def sample_tokens(
             sample_hidden_states,
             aux_hidden_states,
             ec_connector_output,
-            cudagraph_stats,
             multimodal_outputs,
-            slot_mappings,  # OMNI: unpack slot_mappings for drafter
         ) = self.execute_model_state
         self.execute_model_state = None
         seq_len = hidden_states.shape[0]
 
-        # Apply structured output bitmasks if present.
         if grammar_output is not None:
             apply_grammar_bitmask(scheduler_output, grammar_output, self.input_batch, logits)
 
-        # Correct padding values of prompt_token_ids to match the logits vocabulary size
-        if logits is not None and not self.input_batch.sampling_metadata.no_penalties:
-            smd = self.input_batch.sampling_metadata
-            if smd.prompt_token_ids is not None:
-                logits_vocab = logits.shape[-1]
-                if self.input_batch.vocab_size > logits_vocab:
-                    smd.prompt_token_ids = smd.prompt_token_ids.clamp(max=logits_vocab)
-
         with record_function_or_nullcontext("gpu_model_runner: sample"):
             sampler_output = self._sample(logits, spec_decode_metadata)
 
-        self._update_states_after_model_execute(sampler_output.sampled_token_ids, scheduler_output)
-
-        self._draft_token_ids = None
-        self._draft_token_req_ids = None
-        self.valid_sampled_token_count_gpu = None
         self.input_batch.prev_sampled_token_ids = None
 
         def propose_draft_token_ids(sampled_token_ids):
@@ -755,46 +620,40 @@ def propose_draft_token_ids(sampled_token_ids):
                     aux_hidden_states,
                     spec_decode_metadata,
                     spec_decode_common_attn_metadata,
-                    slot_mappings,  # OMNI: pass slot_mappings to drafter (upstream v1 API)
                 )
-                self._copy_draft_token_ids_to_cpu(scheduler_output)
 
         spec_config = self.speculative_config
-        propose_drafts_after_bookkeeping = False
-        if spec_config is not None:
-            input_fits_in_drafter = spec_decode_common_attn_metadata is not None and (
-                spec_decode_common_attn_metadata.max_seq_len + self.num_spec_tokens
-                <= self.effective_drafter_max_model_len
-            )
-            use_gpu_toks = (
-                spec_config.use_eagle() or spec_config.uses_draft_model() or spec_config.uses_extract_hidden_states()
-            ) and not spec_config.disable_padded_drafter_batch
-            if use_gpu_toks:
-                assert isinstance(
-                    self.drafter,
-                    EagleProposer | DraftModelProposer | ExtractHiddenStatesProposer,
+        use_padded_batch_for_eagle = (
+            spec_config is not None and spec_config.use_eagle() and not spec_config.disable_padded_drafter_batch
+        )
+        effective_drafter_max_model_len = self.max_model_len
+        if effective_drafter_max_model_len is None:
+            effective_drafter_max_model_len = self.model_config.max_model_len
+        if (
+            spec_config is not None
+            and spec_config.draft_model_config is not None
+            and spec_config.draft_model_config.max_model_len is not None
+        ):
+            effective_drafter_max_model_len = spec_config.draft_model_config.max_model_len
+        input_fits_in_drafter = spec_decode_common_attn_metadata and (
+            spec_decode_common_attn_metadata.max_seq_len + self.num_spec_tokens <= effective_drafter_max_model_len
+        )
+        if use_padded_batch_for_eagle:
+            assert self.speculative_config is not None
+            assert isinstance(self.drafter, EagleProposer)
+            sampled_token_ids = sampler_output.sampled_token_ids
+            if input_fits_in_drafter:
+                propose_draft_token_ids(sampled_token_ids)
+            elif self.valid_sampled_token_count_event is not None:
+                assert spec_decode_common_attn_metadata is not None
+                next_token_ids, valid_sampled_tokens_count = self.drafter.prepare_next_token_ids_padded(
+                    spec_decode_common_attn_metadata,
+                    sampled_token_ids,
+                    self.requests,
+                    self.input_batch,
+                    self.discard_request_mask.gpu,
                 )
-                sampled_token_ids = sampler_output.sampled_token_ids
-                if input_fits_in_drafter:
-                    propose_draft_token_ids(sampled_token_ids)
-                elif self.valid_sampled_token_count_event is not None:
-                    assert spec_decode_common_attn_metadata is not None
-                    next_token_ids, valid_sampled_tokens_count = self.drafter.prepare_next_token_ids_padded(
-                        self.optimistic_seq_lens_cpu,
-                        sampled_token_ids,
-                        self.requests,
-                        self.input_batch,
-                        self.discard_request_mask.gpu,
-                    )
-                    self._copy_valid_sampled_token_count(next_token_ids, valid_sampled_tokens_count)
-                    # Since we couldn't run the drafter,
-                    # just use zeros for the draft tokens.
-                    self._draft_token_ids = torch.zeros(1, device=self.device, dtype=torch.int32).expand(
-                        len(self.input_batch.req_ids), self.num_spec_tokens
-                    )
-                    self._copy_draft_token_ids_to_cpu(scheduler_output, zeros_only=True)
-            else:
-                propose_drafts_after_bookkeeping = input_fits_in_drafter
+                self._copy_valid_sampled_token_count(next_token_ids, valid_sampled_tokens_count)
 
         with record_function_or_nullcontext("gpu_model_runner: bookkeep"):
             (
@@ -814,24 +673,12 @@ def propose_draft_token_ids(sampled_token_ids):
                 spec_decode_metadata,
             )
 
-        if propose_drafts_after_bookkeeping:
-            # ngram and other speculative decoding methods use the sampled
-            # tokens on the CPU, so they are run after bookkeeping.
+        if self.speculative_config and not use_padded_batch_for_eagle and input_fits_in_drafter:
             propose_draft_token_ids(valid_sampled_token_ids)
 
-        # Finalize KV connector (wait_for_save + clear metadata) after
-        # draft model runs. Deferred from target model forward to allow
-        # draft model to also save its KV cache.
-        if self.speculative_config is not None:
-            self.finalize_kv_connector()
-
         with record_function_or_nullcontext("gpu_model_runner: eplb"):
             self.eplb_step()
 
-        # kv_connector_output may be modified during drafting
-        kv_connector_output = self.kv_connector_output
-        self.kv_connector_output = None
-
         hidden_states_cpu = hidden_states.detach().to("cpu").contiguous()
         num_scheduled_tokens_np = getattr(self, "_omni_num_scheduled_tokens_np", None)
         if num_scheduled_tokens_np is None:
@@ -911,25 +758,21 @@ def propose_draft_token_ids(sampled_token_ids):
                 payload.update(mm_payload)
             pooler_output.append(payload)
         with record_function_or_nullcontext("gpu_model_runner: ModelRunnerOutput"):
-            if self.routed_experts_initialized:
-                capturer = RoutedExpertsCapturer.get_instance()
-                if capturer is not None:
-                    capturer.save_captured_experts(indices=self.slot_mapping)  # noqa
-                else:
-                    logger.error("RoutedExpertsCapturer not initialized.")
             output = OmniModelRunnerOutput(
                 req_ids=req_ids_output_copy,
                 req_id_to_index=req_id_to_index_output_copy,
                 sampled_token_ids=valid_sampled_token_ids,
                 logprobs=logprobs_lists,
                 prompt_logprobs_dict=prompt_logprobs_dict,
-                pooler_output=(pooler_output if self.vllm_config.model_config.engine_output_type != "text" else None),
+                pooler_output=(
+                    pooler_output
+                    if getattr(self.vllm_config.model_config, "engine_output_type", "text") != "text"
+                    else None
+                ),
                 kv_connector_output=kv_connector_output,
                 ec_connector_output=ec_connector_output if self.supports_mm_inputs else None,
                 num_nans_in_logits=num_nans_in_logits,
-                cudagraph_stats=cudagraph_stats,
             )
-            output.kv_extracted_req_ids = kv_extracted_req_ids
 
         if not self.use_async_scheduling:
             return output
@@ -951,19 +794,3 @@ def propose_draft_token_ids(sampled_token_ids):
             )
 
         return async_output
-
-    def _resolve_global_request_id(self, req_id: str) -> str:
-        """Resolve global request ID from request state."""
-        req_state = self.requests.get(req_id)
-        if not req_state:
-            return req_id
-
-        add_info = self.model_intermediate_buffer.get(req_id, {})
-        global_id = add_info.get("global_request_id")
-        if global_id:
-            if isinstance(global_id, list) and global_id:
-                global_id = global_id[0]
-            if isinstance(global_id, bytes):
-                return global_id.decode("utf-8")
-            return str(global_id)
-        return req_id

From 379050344a2bc8881a95fe5d66c02e72ee302d94 Mon Sep 17 00:00:00 2001
From: "jaeeun.kil" <jaeeun.kil@navercorp.com>
Date: Tue, 7 Apr 2026 08:27:24 +0900
Subject: [PATCH 2/4] fix: clarify flash-attn branch intent in vision decoder
 layers

The if/else branches in attention() are semantically distinct:
- if: sdpa_kernel(FLASH_ATTENTION) forces the Flash Attention kernel (Ampere+)
- else: lets PyTorch select any available SDPA backend

Added clarifying comments to make this explicit, addressing reviewer feedback.

Signed-off-by: jaeeun.kil <jaeeun.kil@navercorp.com>
---
 vllm_omni/diffusion/models/hyperclovax_vision/layers.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm_omni/diffusion/models/hyperclovax_vision/layers.py b/vllm_omni/diffusion/models/hyperclovax_vision/layers.py
index e5018af8954..761c1c29cd7 100644
--- a/vllm_omni/diffusion/models/hyperclovax_vision/layers.py
+++ b/vllm_omni/diffusion/models/hyperclovax_vision/layers.py
@@ -65,11 +65,13 @@ def attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, pe: torch.Tenso
     """Attention with rotary position embedding and Flash Attention optimization."""
     q, k = apply_rope(q, k, pe)
 
-    # Use Flash Attention when available (Ampere+), otherwise let PyTorch pick
+    # Explicitly request Flash Attention backend on Ampere+ GPUs.
+    # Falls back to PyTorch's default SDPA selection on other hardware.
     if FLASH_ATTN_AVAILABLE and q.is_cuda:
         with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
             x = F.scaled_dot_product_attention(q, k, v)
     else:
+        # Let PyTorch choose the best available SDPA kernel (math, mem-efficient, etc.)
         x = F.scaled_dot_product_attention(q, k, v)
 
     x = rearrange(x, "B H L D -> B L (H D)")

From cb68d2466dac4219b8c3316b20bc59cca21905cd Mon Sep 17 00:00:00 2001
From: kje <rha3122@naver.com>
Date: Tue, 7 Apr 2026 17:10:04 +0900
Subject: [PATCH 3/4] fix: HyperCLOVAX-SEED-Omni-8B e2e inference compatibility
 fixes

- Fan-out routing in omni.py: replace linear stage_id+1 with
  connector-key-based routing so thinker can send to stage-1 (vision)
  and stage-2 (audio) independently
- Bridge OmniTokensPrompt.additional_information to
  OmniDiffusionRequest.extra in omni_diffusion.py so vision_tokens and
  audio_tokens reach decoder pipelines
- Guard empty batch in omni_stage.py to avoid "empty request list" error
  when the thinker produces no tokens for a modality
- Add renderer sync and output_type param in omni_llm.py for base-class
  compatibility
- Add optional ExtractHiddenStatesProposer import and
  routed_experts_initialized attribute in ar/generation/model runners
- Fix clear_metadata kwarg (renamed from defer_finalize) in model runners
- Add _model. prefix in load_weights return set for strict-loading check
- Add embed_multimodal delegation in hcx_omni_thinker
- Add dummy tokens for HyperCLOVAX pipelines in diffusion_engine
---
 vllm_omni/diffusion/diffusion_engine.py       | 14 +++
 vllm_omni/entrypoints/omni.py                 | 48 +++++++++-
 vllm_omni/entrypoints/omni_diffusion.py       | 79 +++++++++++-----
 vllm_omni/entrypoints/omni_llm.py             | 11 ++-
 vllm_omni/entrypoints/omni_stage.py           | 19 ++--
 .../models/hcx_omni/hcx_omni.py               | 19 +++-
 .../models/hcx_omni/hcx_omni_thinker.py       | 17 +++-
 vllm_omni/worker/gpu_ar_model_runner.py       | 89 ++++++++++++-------
 .../worker/gpu_generation_model_runner.py     | 17 ++--
 vllm_omni/worker/gpu_model_runner.py          | 12 ++-
 10 files changed, 250 insertions(+), 75 deletions(-)

diff --git a/vllm_omni/diffusion/diffusion_engine.py b/vllm_omni/diffusion/diffusion_engine.py
index c28fe0943ce..dda301d056e 100644
--- a/vllm_omni/diffusion/diffusion_engine.py
+++ b/vllm_omni/diffusion/diffusion_engine.py
@@ -486,6 +486,20 @@ def _dummy_run(self):
             "prompt": "dummy run",
             "multi_modal_data": {"image": dummy_image, "audio": dummy_audio},
         }
+        # For HyperCLOVAX vision/audio decoders, provide dummy tokens in req.extra
+        # since these models receive discrete tokens from the thinker (not text prompts).
+        dummy_extra: dict = {}
+        model_cls = self.od_config.model_class_name
+        if model_cls == "HyperCLOVAXVisionPipeline":
+            # 729 = 27x27 discrete image tokens (DISCRETE_IMAGE_TOKEN_LENGTH)
+            dummy_extra["vision_tokens"] = [0] * 729
+        elif model_cls == "HyperCLOVAXAudioPipeline":
+            # Minimal sequence of audio tokens for warmup
+            dummy_extra["audio_tokens"] = [[0] * 10]
+            dummy_extra["speakers"] = ["fkms"]
+            dummy_extra["formats"] = ["wav"]
+            dummy_extra["ref_audio_tokens"] = [None]
+
         req = OmniDiffusionRequest(
             prompts=[prompt],
             request_ids=["dummy_req_id"],
diff --git a/vllm_omni/entrypoints/omni.py b/vllm_omni/entrypoints/omni.py
index c2482ae5f15..e9050c695f6 100644
--- a/vllm_omni/entrypoints/omni.py
+++ b/vllm_omni/entrypoints/omni.py
@@ -1103,8 +1103,38 @@ def _run_generation(
 
                     yield output_to_yield
 
-                next_stage_id = stage_id + 1
-                if next_stage_id <= final_stage_id_to_prompt[req_id]:
+                # Fan-out routing: forward to all downstream stages that have
+                # a connector from this stage, rather than assuming a linear
+                # stage_id+1 chain.
+                #
+                # Background: HyperCLOVAX-SEED-Omni (and similar models) use a
+                # fan-out topology where the thinker (stage 0) sends vision tokens
+                # to the vision decoder (stage 1) *and* audio tokens to the audio
+                # decoder (stage 2) independently:
+                #
+                #   stage-0 (thinker) ──► stage-1 (vision decoder)
+                #                    └──► stage-2 (audio decoder)
+                #
+                # The former linear `next_stage_id = stage_id + 1` assumption
+                # caused stage-1 to incorrectly try to forward to stage-2 after
+                # completing, which raised a RuntimeError because no connector
+                # exists for the (1→2) edge.
+                #
+                # With this fix:
+                #  - After stage-0: forward only to stages that actually have
+                #    connector edges AND non-empty inputs (e.g. skip vision stage
+                #    for text-only replies that produced no vision tokens).
+                #  - After stage-1 or stage-2 (leaf nodes): no outgoing connectors
+                #    exist, so `any_forwarded` stays False and the request is
+                #    marked complete.
+                #  - Linear pipelines (0→1→2) still work because each non-final
+                #    stage has exactly one outgoing connector.
+                downstream_stage_ids = sorted([
+                    int(to) for (frm, to) in self.connectors.keys()
+                    if frm == str(stage_id)
+                ])
+                any_forwarded = False
+                for next_stage_id in downstream_stage_ids:
                     next_stage: OmniStage = self.stage_list[next_stage_id]
                     try:
                         # Derive inputs for the next stage, record preprocess time
@@ -1118,6 +1148,16 @@ def _run_generation(
                             f" at stage {next_stage_id}: {e}",
                         )
                         continue
+
+                    if not next_inputs:
+                        # No tokens for this modality (e.g. text-only reply has no
+                        # vision/audio tokens), skip forwarding to this stage.
+                        logger.debug(
+                            f"[{self._name}] No inputs for stage-{next_stage_id} "
+                            f"from stage-{stage_id}, skipping",
+                        )
+                        continue
+
                     sp_next = sampling_params_list[next_stage_id]  # type: ignore[index]
 
                     # Check if we have a connector for this edge
@@ -1146,7 +1186,9 @@ def _run_generation(
                         f"[{self._name}] Forwarded request {req_id} to stage-{next_stage_id}",
                     )
                     remaining_by_stage[next_stage_id] += 1
-                else:
+                    any_forwarded = True
+
+                if not any_forwarded:
                     completed_requests += 1
                     if pbar:
                         final_mod = self.output_modalities[final_stage_id_to_prompt[req_id]]
diff --git a/vllm_omni/entrypoints/omni_diffusion.py b/vllm_omni/entrypoints/omni_diffusion.py
index 0f8049f43f5..126150e4d71 100644
--- a/vllm_omni/entrypoints/omni_diffusion.py
+++ b/vllm_omni/entrypoints/omni_diffusion.py
@@ -51,13 +51,18 @@ def __init__(self, od_config: OmniDiffusionConfig | None = None, **kwargs):
         # Diffusers-style models expose `model_index.json` with `_class_name`.
         # Non-diffusers models (e.g. Bagel, NextStep, GLM-Image) only have `config.json`,
         # so we fall back to reading that and mapping model_type manually.
+        # If model_class_name is already specified (e.g. HyperCLOVAXVisionPipeline),
+        # skip auto-detection and only load optional transformer config.
+        _user_specified_class = od_config.model_class_name
         try:
             config_dict = get_hf_file_to_dict(
                 "model_index.json",
                 od_config.model,
             )
             if config_dict is not None:
-                od_config.model_class_name = config_dict.get("_class_name", None)
+                # Only set model_class_name from config if not already specified by user
+                if _user_specified_class is None:
+                    od_config.model_class_name = config_dict.get("_class_name", None)
                 od_config.update_multimodal_support()
 
                 tf_config_dict = get_hf_file_to_dict(
@@ -68,31 +73,36 @@ def __init__(self, od_config: OmniDiffusionConfig | None = None, **kwargs):
             else:
                 raise FileNotFoundError("model_index.json not found")
         except (AttributeError, OSError, ValueError, FileNotFoundError):
-            cfg = get_hf_file_to_dict("config.json", od_config.model)
-            if cfg is None:
-                raise ValueError(f"Could not find config.json or model_index.json for model {od_config.model}")
-
-            # Map model_type or architecture to pipeline class
-            model_type = cfg.get("model_type")
-            architectures = cfg.get("architectures") or []
-            pipeline_class = None
-            # Bagel/NextStep models don't have a model_index.json, so we set the pipeline class name manually
-            if model_type == "bagel" or "BagelForConditionalGeneration" in architectures:
-                pipeline_class = "BagelPipeline"
-            elif model_type == "nextstep":
-                if od_config.model_class_name is None:
+            # If model_class_name was already provided (e.g. for .mar audio models),
+            # skip config detection and use an empty TransformerConfig.
+            if _user_specified_class is not None:
+                od_config.tf_model_config = TransformerConfig()
+                od_config.update_multimodal_support()
+            else:
+                cfg = get_hf_file_to_dict("config.json", od_config.model)
+                if cfg is None:
+                    raise ValueError(f"Could not find config.json or model_index.json for model {od_config.model}")
+
+                # Map model_type or architecture to pipeline class
+                model_type = cfg.get("model_type")
+                architectures = cfg.get("architectures") or []
+                pipeline_class = None
+                # Bagel/NextStep models don't have a model_index.json, so we set the pipeline class name manually
+                if model_type == "bagel" or "BagelForConditionalGeneration" in architectures:
+                    pipeline_class = "BagelPipeline"
+                elif model_type == "nextstep":
                     pipeline_class = "NextStep11Pipeline"
-            elif model_type == "glm-image" or "GlmImageForConditionalGeneration" in architectures:
-                pipeline_class = "GlmImagePipeline"
-            elif architectures and len(architectures) == 1:
-                pipeline_class = architectures[0]
+                elif model_type == "glm-image" or "GlmImageForConditionalGeneration" in architectures:
+                    pipeline_class = "GlmImagePipeline"
+                elif architectures and len(architectures) == 1:
+                    pipeline_class = architectures[0]
 
-            if pipeline_class is None:
-                raise ValueError(f"Unknown model type: {model_type}, architectures: {architectures}")
+                if pipeline_class is None:
+                    raise ValueError(f"Unknown model type: {model_type}, architectures: {architectures}")
 
-            od_config.model_class_name = pipeline_class
-            od_config.tf_model_config = TransformerConfig()
-            od_config.update_multimodal_support()
+                od_config.model_class_name = pipeline_class
+                od_config.tf_model_config = TransformerConfig()
+                od_config.update_multimodal_support()
 
         self.engine: DiffusionEngine = DiffusionEngine.make_engine(od_config)
 
@@ -111,7 +121,28 @@ def generate(
         if len(request_ids) < len(prompts):
             request_ids.extend(f"{i + len(request_ids)}_{uuid.uuid4()}" for i in range(len(prompts) - len(request_ids)))
 
-        request = OmniDiffusionRequest(prompts, sampling_params, request_ids)
+        # Propagate stage-specific payload from OmniTokensPrompt into
+        # OmniDiffusionRequest.extra.
+        #
+        # Stage input processors (e.g. thinker2vision_decoder) store
+        # model-specific data such as `vision_tokens` and `audio_tokens`
+        # in OmniTokensPrompt.additional_information.  The underlying
+        # diffusion pipelines (HyperCLOVAXVisionPipeline, etc.) read these
+        # values from req.extra, so we must bridge the two here.
+        #
+        # For batched prompts the first occurrence of each key wins; this
+        # mirrors the single-request behaviour where the dict is built from
+        # one prompt only.
+        extra: dict = {}
+        for prompt in prompts:
+            if isinstance(prompt, dict):
+                ai = prompt.get("additional_information")
+                if isinstance(ai, dict):
+                    for k, v in ai.items():
+                        if k not in extra:
+                            extra[k] = v
+
+        request = OmniDiffusionRequest(prompts, sampling_params, request_ids, extra=extra)
         return self._run_engine(request)
 
     def _run_engine(self, request: OmniDiffusionRequest) -> list[OmniRequestOutput]:
diff --git a/vllm_omni/entrypoints/omni_llm.py b/vllm_omni/entrypoints/omni_llm.py
index ec9248e3041..107eb5905e4 100644
--- a/vllm_omni/entrypoints/omni_llm.py
+++ b/vllm_omni/entrypoints/omni_llm.py
@@ -157,6 +157,12 @@ def __init__(
 
         # Create the Engine (autoselects V0 vs V1)
         self.llm_engine = LLMEngine.from_engine_args(engine_args=engine_args, usage_context=UsageContext.LLM_CLASS)
+        # OmniLLM does not call super().__init__(), so instance attributes
+        # normally set by LLM.__init__ must be populated manually.
+        # `renderer` is accessed by several base-class helper methods; copy it
+        # from the engine when available (V1 engine exposes it, V0 does not).
+        if hasattr(self.llm_engine, "renderer"):
+            self.renderer = self.llm_engine.renderer
         self.llm_engine.output_processor = MultimodalOutputProcessor(
             tokenizer=self.llm_engine.tokenizer,
             log_stats=self.llm_engine.log_stats,
@@ -197,7 +203,10 @@ def __del__(self) -> None:  # best-effort
         except Exception as e:
             logger.debug("[Orchestrator] __del__ close() raised: %s", e, exc_info=True)
 
-    def _run_engine(self, *, use_tqdm: bool | Callable[..., tqdm] = True) -> list[RequestOutput | PoolingRequestOutput]:
+    def _run_engine(self, output_type=None, *, use_tqdm: bool | Callable[..., tqdm] = True) -> list[RequestOutput | PoolingRequestOutput]:
+        # `output_type` is forwarded by the base LLM.generate() call-site
+        # (added in a later vLLM commit); accept and ignore it so the
+        # overridden signature stays compatible.
         # Initialize tqdm.
         if use_tqdm:
             num_requests = self.llm_engine.get_num_unfinished_requests()
diff --git a/vllm_omni/entrypoints/omni_stage.py b/vllm_omni/entrypoints/omni_stage.py
index ba10dbcc1c0..4aa2095311f 100644
--- a/vllm_omni/entrypoints/omni_stage.py
+++ b/vllm_omni/entrypoints/omni_stage.py
@@ -977,11 +977,20 @@ def handle_profiler_task_local(task_type: OmniStageTaskType) -> dict:
             if stage_type == "diffusion":
                 stage_engine = cast(OmniDiffusion, stage_engine)
                 batch_engine_sampling_params = cast(OmniDiffusionSamplingParams, batch_engine_sampling_params)
-                # Diffusion generate returns results directly, not an iterator
-                diffusion_results = stage_engine.generate(
-                    batch_engine_inputs, batch_engine_sampling_params, batch_request_ids
-                )
-                gen_outputs.extend(diffusion_results)
+                # Guard against empty batches that can arise when the thinker
+                # produced no tokens for this modality (e.g. a text-only reply
+                # generates neither vision nor audio discrete tokens, so
+                # thinker2vision_decoder / thinker2audio_decoder return []).
+                # Calling generate() with an empty prompt list would raise
+                # "Cannot execute model with empty request list".
+                if not batch_engine_inputs:
+                    logger.debug("[Stage-%s] Skipping diffusion: empty input batch (no tokens from thinker)", stage_id)
+                else:
+                    # Diffusion generate returns results directly, not an iterator
+                    diffusion_results = stage_engine.generate(
+                        batch_engine_inputs, batch_engine_sampling_params, batch_request_ids
+                    )
+                    gen_outputs.extend(diffusion_results)
                 # Assign request_ids if not present
                 for idx, result in enumerate(gen_outputs):
                     if not hasattr(result, "request_id") or result.request_id is None:
diff --git a/vllm_omni/model_executor/models/hcx_omni/hcx_omni.py b/vllm_omni/model_executor/models/hcx_omni/hcx_omni.py
index d05fad00a6d..43c301f5539 100644
--- a/vllm_omni/model_executor/models/hcx_omni/hcx_omni.py
+++ b/vllm_omni/model_executor/models/hcx_omni/hcx_omni.py
@@ -97,6 +97,13 @@ def iter_mm_grid_thw(self, *args: Any, **kwargs: Any):
     def get_multimodal_embeddings(self, **kwargs: Any):
         return self._model.get_multimodal_embeddings(**kwargs)
 
+    def embed_multimodal(self, **kwargs: Any):
+        # Required by vLLM's multimodal embedding path; the base class
+        # SupportsMultiModal declares get_multimodal_embeddings(), but some
+        # call-sites (e.g. GPUModelRunner) also call embed_multimodal() directly
+        # and expect a tensor rather than the raw embedding list.
+        return self._model.embed_multimodal(**kwargs)
+
     def get_input_embeddings(
         self,
         input_ids: torch.Tensor,
@@ -127,8 +134,16 @@ def forward(
     def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor | None:
         return self._model.compute_logits(hidden_states)
 
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        return self._model.load_weights(weights)
+    def load_weights(
+        self, weights: Iterable[tuple[str, torch.Tensor]]
+    ) -> set[str]:
+        loaded = self._model.load_weights(weights)
+        # The model stores sub-modules under self._model, so vllm's named_parameters()
+        # sees "_model.<name>" while self._model.load_weights() returns "<name>".
+        # Re-prefix so the strict-loading check in default_loader.py matches.
+        if loaded is not None:
+            return {"_model." + name for name in loaded}
+        return loaded
 
     def get_mm_mapping(self):
         return self._model.get_mm_mapping()
diff --git a/vllm_omni/model_executor/models/hcx_omni/hcx_omni_thinker.py b/vllm_omni/model_executor/models/hcx_omni/hcx_omni_thinker.py
index d5ab6396231..9f13d710762 100644
--- a/vllm_omni/model_executor/models/hcx_omni/hcx_omni_thinker.py
+++ b/vllm_omni/model_executor/models/hcx_omni/hcx_omni_thinker.py
@@ -85,6 +85,13 @@ def iter_mm_grid_thw(self, *args, **kwargs):
     def get_multimodal_embeddings(self, **kwargs: Any) -> MultiModalEmbeddings | None:
         return self._model.get_multimodal_embeddings(**kwargs)
 
+    def embed_multimodal(self, **kwargs: Any) -> MultiModalEmbeddings:
+        # Required by vLLM's multimodal embedding path; the base class
+        # SupportsMultiModal declares get_multimodal_embeddings(), but some
+        # call-sites (e.g. GPUModelRunner) also call embed_multimodal() directly
+        # and expect a tensor rather than the raw embedding list.
+        return self._model.embed_multimodal(**kwargs)
+
     def get_input_embeddings(
         self,
         input_ids: torch.Tensor,
@@ -116,7 +123,15 @@ def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor | None:
         return self._model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        return self._model.load_weights(weights)
+        loaded = self._model.load_weights(weights)
+        # vLLM's default_loader validates that every named_parameter() is
+        # present in the set returned by load_weights().  Because weights are
+        # stored under self._model, named_parameters() yields "_model.<name>"
+        # while the inner load_weights() returns bare "<name>".  Re-prefix the
+        # returned set so the strict-loading check passes.
+        if loaded is not None:
+            return {"_model." + name for name in loaded}
+        return loaded
 
     def get_mm_mapping(self):
         return self._model.get_mm_mapping()
diff --git a/vllm_omni/worker/gpu_ar_model_runner.py b/vllm_omni/worker/gpu_ar_model_runner.py
index 4278478e5dc..a76e95e8453 100644
--- a/vllm_omni/worker/gpu_ar_model_runner.py
+++ b/vllm_omni/worker/gpu_ar_model_runner.py
@@ -17,6 +17,14 @@
 from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
 from vllm.v1.outputs import AsyncModelRunnerOutput
 from vllm.v1.spec_decode.eagle import EagleProposer
+# ExtractHiddenStatesProposer was introduced in a later vLLM commit and is not
+# present in the v0.18.0 base this branch targets.  Make the import optional so
+# the rest of the file can reference it in isinstance() checks without breaking
+# on the older base.
+try:
+    from vllm.v1.spec_decode.extract_hidden_states import ExtractHiddenStatesProposer
+except ImportError:
+    ExtractHiddenStatesProposer = None  # type: ignore[assignment,misc]
 from vllm.v1.structured_output.utils import apply_grammar_bitmask
 from vllm.v1.utils import record_function_or_nullcontext
 from vllm.v1.worker.gpu_model_runner import (
@@ -64,6 +72,12 @@ def __init__(self, *args, **kwargs):
         # each model stage has their own hidden size
         self.hidden_size = self.model_config.hf_text_config.hidden_size
         self.inputs_embeds = self._make_buffer(self.max_num_tokens, self.hidden_size, dtype=self.dtype, numpy=False)
+        # Initialize the KV transfer manager (preserve vllm_config fallback behavior)
+        self.kv_transfer_manager = OmniKVTransferManager.from_vllm_config(self.vllm_config, self.model_config)
+        # `routed_experts_initialized` gates MoE expert-routing capture in
+        # execute_model().  Read from model_config with a safe default so this
+        # runner works with models that don't set the flag (e.g. dense models).
+        self.routed_experts_initialized = getattr(self.model_config, "enable_return_routed_experts", False)
 
     def _make_buffer(self, *size, dtype, numpy=True):
         # Prevent ray from pinning the buffer due to large size
@@ -405,8 +419,15 @@ def execute_model(
                 batch_descriptor=batch_desc,
                 ubatch_slices=ubatch_slices,
             ),
-            record_function_or_nullcontext("Forward"),
-            self.maybe_get_kv_connector_output(scheduler_output) as kv_connector_output,
+            record_function_or_nullcontext("gpu_model_runner: forward"),
+            self.maybe_get_kv_connector_output(
+                scheduler_output,
+                # The vLLM base this branch targets expects `clear_metadata`
+                # here, which has the inverse meaning of `defer_finalize`, so
+                # the flag is negated.  NOTE(review): confirm which vLLM
+                # revision uses which kwarg name before rebasing.
+                clear_metadata=not defer_kv_connector_finalize,
+            ) as kv_connector_output,
         ):
             model_output = self._model_forward(
                 input_ids=input_ids,
@@ -623,37 +644,41 @@ def propose_draft_token_ids(sampled_token_ids):
                 )
 
         spec_config = self.speculative_config
-        use_padded_batch_for_eagle = (
-            spec_config is not None and spec_config.use_eagle() and not spec_config.disable_padded_drafter_batch
-        )
-        effective_drafter_max_model_len = self.max_model_len
-        if effective_drafter_max_model_len is None:
-            effective_drafter_max_model_len = self.model_config.max_model_len
-        if (
-            spec_config is not None
-            and spec_config.draft_model_config is not None
-            and spec_config.draft_model_config.max_model_len is not None
-        ):
-            effective_drafter_max_model_len = spec_config.draft_model_config.max_model_len
-        input_fits_in_drafter = spec_decode_common_attn_metadata and (
-            spec_decode_common_attn_metadata.max_seq_len + self.num_spec_tokens <= effective_drafter_max_model_len
-        )
-        if use_padded_batch_for_eagle:
-            assert self.speculative_config is not None
-            assert isinstance(self.drafter, EagleProposer)
-            sampled_token_ids = sampler_output.sampled_token_ids
-            if input_fits_in_drafter:
-                propose_draft_token_ids(sampled_token_ids)
-            elif self.valid_sampled_token_count_event is not None:
-                assert spec_decode_common_attn_metadata is not None
-                next_token_ids, valid_sampled_tokens_count = self.drafter.prepare_next_token_ids_padded(
-                    spec_decode_common_attn_metadata,
-                    sampled_token_ids,
-                    self.requests,
-                    self.input_batch,
-                    self.discard_request_mask.gpu,
+        propose_drafts_after_bookkeeping = False
+        if spec_config is not None:
+            input_fits_in_drafter = spec_decode_common_attn_metadata is not None and (
+                spec_decode_common_attn_metadata.max_seq_len + self.num_spec_tokens
+                <= self.effective_drafter_max_model_len
+            )
+            use_gpu_toks = (
+                spec_config.use_eagle() or spec_config.uses_draft_model() or spec_config.uses_extract_hidden_states()
+            ) and not spec_config.disable_padded_drafter_batch
+            if use_gpu_toks:
+                _valid_proposers = (EagleProposer, DraftModelProposer) + (
+                    (ExtractHiddenStatesProposer,) if ExtractHiddenStatesProposer is not None else ()
                 )
-                self._copy_valid_sampled_token_count(next_token_ids, valid_sampled_tokens_count)
+                assert isinstance(self.drafter, _valid_proposers)
+                sampled_token_ids = sampler_output.sampled_token_ids
+                if input_fits_in_drafter:
+                    propose_draft_token_ids(sampled_token_ids)
+                elif self.valid_sampled_token_count_event is not None:
+                    assert spec_decode_common_attn_metadata is not None
+                    next_token_ids, valid_sampled_tokens_count = self.drafter.prepare_next_token_ids_padded(
+                        spec_decode_common_attn_metadata,
+                        sampled_token_ids,
+                        self.requests,
+                        self.input_batch,
+                        self.discard_request_mask.gpu,
+                    )
+                    self._copy_valid_sampled_token_count(next_token_ids, valid_sampled_tokens_count)
+                    # Since we couldn't run the drafter,
+                    # just use zeros for the draft tokens.
+                    self._draft_token_ids = torch.zeros(1, device=self.device, dtype=torch.int32).expand(
+                        len(self.input_batch.req_ids), self.num_spec_tokens
+                    )
+                    self._copy_draft_token_ids_to_cpu(scheduler_output, zeros_only=True)
+            else:
+                propose_drafts_after_bookkeeping = input_fits_in_drafter
 
         with record_function_or_nullcontext("gpu_model_runner: bookkeep"):
             (
diff --git a/vllm_omni/worker/gpu_generation_model_runner.py b/vllm_omni/worker/gpu_generation_model_runner.py
index f10115c8e90..32ba4692671 100644
--- a/vllm_omni/worker/gpu_generation_model_runner.py
+++ b/vllm_omni/worker/gpu_generation_model_runner.py
@@ -25,7 +25,12 @@
 from vllm.v1.outputs import AsyncModelRunnerOutput, make_empty_encoder_model_runner_output
 from vllm.v1.spec_decode.draft_model import DraftModelProposer
 from vllm.v1.spec_decode.eagle import EagleProposer
-from vllm.v1.spec_decode.extract_hidden_states import ExtractHiddenStatesProposer
+# ExtractHiddenStatesProposer was introduced after vLLM v0.18.0; guard the
+# import so this runner works on the v0.18.0 base without the symbol.
+try:
+    from vllm.v1.spec_decode.extract_hidden_states import ExtractHiddenStatesProposer
+except ImportError:
+    ExtractHiddenStatesProposer = None  # type: ignore[assignment,misc]
 from vllm.v1.utils import record_function_or_nullcontext
 from vllm.v1.worker.gpu_model_runner import (
     EMPTY_MODEL_RUNNER_OUTPUT,
@@ -285,7 +290,9 @@ def execute_model(
             record_function_or_nullcontext("Forward"),
             self.maybe_get_kv_connector_output(
                 scheduler_output,
-                defer_finalize=defer_kv_connector_finalize,
+                # vLLM renamed `defer_finalize` → `clear_metadata` (inverted semantics) after
+                # v0.18.0; use the new name unconditionally.  NOTE(review): confirm v0.18.0 accepts it.
+                clear_metadata=not defer_kv_connector_finalize,
             ) as kv_connector_output,
         ):
             outputs = self._run_generation_model(
@@ -759,10 +766,10 @@ def _dummy_run(
                 or self.speculative_config.uses_draft_model()
                 or self.speculative_config.uses_extract_hidden_states()
             ):
-                assert isinstance(
-                    self.drafter,
-                    EagleProposer | DraftModelProposer | ExtractHiddenStatesProposer,
+                _valid_proposers = (EagleProposer, DraftModelProposer) + (
+                    (ExtractHiddenStatesProposer,) if ExtractHiddenStatesProposer is not None else ()
                 )
+                assert isinstance(self.drafter, _valid_proposers)
                 assert self.speculative_config is not None
                 # Eagle currently only supports PIECEWISE cudagraphs.
                 # Therefore only use cudagraphs if the main model uses PIECEWISE
diff --git a/vllm_omni/worker/gpu_model_runner.py b/vllm_omni/worker/gpu_model_runner.py
index de78011c75a..96031533f2b 100644
--- a/vllm_omni/worker/gpu_model_runner.py
+++ b/vllm_omni/worker/gpu_model_runner.py
@@ -15,7 +15,12 @@
 from vllm.utils.math_utils import cdiv
 from vllm.v1.spec_decode.draft_model import DraftModelProposer
 from vllm.v1.spec_decode.eagle import EagleProposer
-from vllm.v1.spec_decode.extract_hidden_states import ExtractHiddenStatesProposer
+# ExtractHiddenStatesProposer was introduced after vLLM v0.18.0; guard the
+# import so this runner works on the v0.18.0 base without the symbol.
+try:
+    from vllm.v1.spec_decode.extract_hidden_states import ExtractHiddenStatesProposer
+except ImportError:
+    ExtractHiddenStatesProposer = None  # type: ignore[assignment,misc]
 from vllm.v1.worker.gpu_input_batch import CachedRequestState
 from vllm.v1.worker.gpu_model_runner import GPUModelRunner, IntermediateTensors, PerLayerAttnMetadata
 from vllm.v1.worker.ubatch_utils import maybe_create_ubatch_slices
@@ -868,9 +873,12 @@ def _dummy_run(
                 or self.speculative_config.uses_draft_model()
                 or self.speculative_config.uses_extract_hidden_states()
             ):
+                _valid_proposers = (EagleProposer, DraftModelProposer) + (
+                    (ExtractHiddenStatesProposer,) if ExtractHiddenStatesProposer is not None else ()
+                )
                 assert isinstance(
                     self.drafter,
-                    EagleProposer | DraftModelProposer | ExtractHiddenStatesProposer,
+                    _valid_proposers,
                 )
                 assert self.speculative_config is not None
                 # Eagle currently only supports PIECEWISE cudagraphs.

From 09ff661ffbe9e6bfc076b93f979ba72d7d861144 Mon Sep 17 00:00:00 2001
From: kje <rha3122@naver.com>
Date: Tue, 7 Apr 2026 17:37:56 +0900
Subject: [PATCH 4/4] test: add unit tests for HCX-Omni fan-out routing and
 diffusion bridge
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Cover the three bugs fixed for HyperCLOVAX-SEED-Omni-8B e2e inference:

1. Fan-out routing (omni.py):
   Verify that downstream_stage_ids is computed from connector keys so
   stage-0 fans out to stage-1 AND stage-2 independently, leaf stages
   return no downstream IDs, and linear topologies still work.

2. additional_information → extra bridge (omni_diffusion.py):
   Verify that vision_tokens / audio_tokens stored in
   OmniTokensPrompt.additional_information are propagated to
   OmniDiffusionRequest.extra, that the first occurrence wins for batched
   prompts, and that non-dict values are skipped gracefully.

3. Empty batch guard:
   Verify that text-only thinker output results in any_forwarded=False
   (no diffusion stages invoked), and that per-modality skipping works
   correctly when only one of vision/audio tokens is present.
---
 tests/unit/entrypoints/__init__.py            |   0
 .../unit/entrypoints/test_hcx_omni_routing.py | 321 ++++++++++++++++++
 2 files changed, 321 insertions(+)
 create mode 100644 tests/unit/entrypoints/__init__.py
 create mode 100644 tests/unit/entrypoints/test_hcx_omni_routing.py

diff --git a/tests/unit/entrypoints/__init__.py b/tests/unit/entrypoints/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tests/unit/entrypoints/test_hcx_omni_routing.py b/tests/unit/entrypoints/test_hcx_omni_routing.py
new file mode 100644
index 00000000000..75a03951e21
--- /dev/null
+++ b/tests/unit/entrypoints/test_hcx_omni_routing.py
@@ -0,0 +1,321 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Unit tests for HyperCLOVAX-SEED-Omni-8B routing and diffusion bridge.
+
+These tests target the specific bugs fixed for HCX-Omni e2e inference:
+
+  1. Fan-out routing (omni.py):
+     HyperCLOVAX-SEED-Omni uses a fan-out topology where the thinker
+     (stage 0) sends vision tokens to stage 1 AND audio tokens to stage 2
+     independently.  The former linear `next_stage_id = stage_id + 1`
+     assumption caused a RuntimeError when stage-1 tried to forward to
+     stage-2 via a non-existent connector.  The fix computes downstream
+     stage IDs from connector keys instead.
+
+  2. additional_information → extra bridge (omni_diffusion.py):
+     Stage input processors (e.g. thinker2vision_decoder) store model-
+     specific data such as `vision_tokens` and `audio_tokens` in
+     OmniTokensPrompt.additional_information.  The diffusion pipelines
+     read these values from OmniDiffusionRequest.extra, so the generate()
+     method must bridge the two.
+
+  3. Empty batch guard (omni_stage.py / omni.py):
+     When the thinker produces a text-only reply (no vision/audio tokens),
+     the downstream diffusion stages must be skipped gracefully rather than
+     raising "Cannot execute model with empty request list".
+"""
+
+import pytest
+
+
+# ---------------------------------------------------------------------------
+# 1. Fan-out routing — downstream_stage_ids computation
+# ---------------------------------------------------------------------------
+
+class TestFanoutRouting:
+    """Verify connector-key-based downstream stage discovery.
+
+    The core fix in omni.py:
+        downstream_stage_ids = sorted([
+            int(to) for (frm, to) in self.connectors.keys()
+            if frm == str(stage_id)
+        ])
+    """
+
+    def _compute_downstream(self, connectors: dict, stage_id: int) -> list[int]:
+        """Replicate the fixed routing logic from omni.py."""
+        return sorted([
+            int(to) for (frm, to) in connectors.keys()
+            if frm == str(stage_id)
+        ])
+
+    def test_fanout_topology_stage0(self):
+        """Stage-0 (thinker) fans out to stage-1 AND stage-2 independently."""
+        # HCX-Omni YAML: edges: [{from:0,to:1}, {from:0,to:2}]
+        connectors = {
+            ("0", "1"): object(),
+            ("0", "2"): object(),
+        }
+        downstream = self._compute_downstream(connectors, stage_id=0)
+        assert downstream == [1, 2], (
+            "Thinker must forward to both vision decoder (1) and audio decoder (2)"
+        )
+
+    def test_fanout_topology_leaf_stages(self):
+        """Stage-1 and stage-2 are leaf nodes with no outgoing edges."""
+        connectors = {
+            ("0", "1"): object(),
+            ("0", "2"): object(),
+        }
+        assert self._compute_downstream(connectors, stage_id=1) == [], (
+            "Vision decoder (stage-1) must not forward to any further stage"
+        )
+        assert self._compute_downstream(connectors, stage_id=2) == [], (
+            "Audio decoder (stage-2) must not forward to any further stage"
+        )
+
+    def test_linear_topology_still_works(self):
+        """Linear pipelines (0→1→2) continue to work correctly."""
+        connectors = {
+            ("0", "1"): object(),
+            ("1", "2"): object(),
+        }
+        assert self._compute_downstream(connectors, stage_id=0) == [1]
+        assert self._compute_downstream(connectors, stage_id=1) == [2]
+        assert self._compute_downstream(connectors, stage_id=2) == []
+
+    def test_no_connectors_from_stage(self):
+        """Stage with no outgoing connectors returns empty list (terminal)."""
+        connectors = {("0", "1"): object()}
+        assert self._compute_downstream(connectors, stage_id=1) == []
+
+    def test_downstream_ids_are_sorted(self):
+        """Downstream stage IDs are returned in ascending order."""
+        # Insert out of order (3, 1, 2) to verify sorting
+        connectors = {
+            ("0", "3"): object(),
+            ("0", "1"): object(),
+            ("0", "2"): object(),
+        }
+        downstream = self._compute_downstream(connectors, stage_id=0)
+        assert downstream == [1, 2, 3]
+
+    def test_fanout_with_multiple_source_stages(self):
+        """Connector map with multiple source stages: each sees only its own edges."""
+        connectors = {
+            ("0", "1"): object(),
+            ("0", "2"): object(),
+            ("1", "3"): object(),
+            ("2", "3"): object(),
+        }
+        assert self._compute_downstream(connectors, stage_id=0) == [1, 2]
+        assert self._compute_downstream(connectors, stage_id=1) == [3]
+        assert self._compute_downstream(connectors, stage_id=2) == [3]
+        assert self._compute_downstream(connectors, stage_id=3) == []
+
+
+# ---------------------------------------------------------------------------
+# 2. additional_information → extra bridge (omni_diffusion.py)
+# ---------------------------------------------------------------------------
+
+class TestAdditionalInfoBridge:
+    """Verify that OmniTokensPrompt.additional_information is merged into
+    OmniDiffusionRequest.extra.
+
+    The fixed generate() in OmniDiffusion:
+        extra: dict = {}
+        for prompt in prompts:
+            if isinstance(prompt, dict):
+                ai = prompt.get("additional_information")
+                if isinstance(ai, dict):
+                    for k, v in ai.items():
+                        if k not in extra:
+                            extra[k] = v
+        request = OmniDiffusionRequest(prompts, ..., extra=extra)
+    """
+
+    def _build_extra(self, prompts: list) -> dict:
+        """Replicate the bridge logic from omni_diffusion.py."""
+        extra: dict = {}
+        for prompt in prompts:
+            if isinstance(prompt, dict):
+                ai = prompt.get("additional_information")
+                if isinstance(ai, dict):
+                    for k, v in ai.items():
+                        if k not in extra:
+                            extra[k] = v
+        return extra
+
+    def test_vision_tokens_propagated(self):
+        """vision_tokens from additional_information must reach extra."""
+        vision_tokens = list(range(729))
+        prompts = [
+            {
+                "prompt_token_ids": [1, 2, 3],
+                "additional_information": {"vision_tokens": vision_tokens},
+            }
+        ]
+        extra = self._build_extra(prompts)
+        assert "vision_tokens" in extra
+        assert extra["vision_tokens"] == vision_tokens
+
+    def test_audio_tokens_propagated(self):
+        """audio_tokens and speakers from additional_information must reach extra."""
+        audio_tokens = [[10, 20, 30]]
+        prompts = [
+            {
+                "prompt_token_ids": [1, 2, 3],
+                "additional_information": {
+                    "audio_tokens": audio_tokens,
+                    "speakers": ["fkms"],
+                },
+            }
+        ]
+        extra = self._build_extra(prompts)
+        assert extra["audio_tokens"] == audio_tokens
+        assert extra["speakers"] == ["fkms"]
+
+    def test_no_additional_information_gives_empty_extra(self):
+        """Prompts without additional_information produce an empty extra dict."""
+        prompts = [{"prompt_token_ids": [1, 2, 3]}]
+        extra = self._build_extra(prompts)
+        assert extra == {}
+
+    def test_string_prompt_ignored(self):
+        """Plain string prompts are skipped (only dict prompts are inspected)."""
+        prompts = ["hello world"]
+        extra = self._build_extra(prompts)
+        assert extra == {}
+
+    def test_first_occurrence_wins_for_batched_prompts(self):
+        """When multiple prompts carry the same key, the first value is used.
+
+        This mirrors the single-request behaviour and prevents later prompts
+        from silently overwriting earlier ones in a batch.
+        """
+        prompts = [
+            {"additional_information": {"vision_tokens": [1, 2, 3]}},
+            {"additional_information": {"vision_tokens": [4, 5, 6]}},  # must be ignored
+        ]
+        extra = self._build_extra(prompts)
+        assert extra["vision_tokens"] == [1, 2, 3]
+
+    def test_different_keys_merged_across_prompts(self):
+        """Different keys from different prompts are all collected."""
+        prompts = [
+            {"additional_information": {"vision_tokens": [1, 2, 3]}},
+            {"additional_information": {"audio_tokens": [[10, 20]]}},
+        ]
+        extra = self._build_extra(prompts)
+        assert "vision_tokens" in extra
+        assert "audio_tokens" in extra
+
+    def test_non_dict_additional_information_ignored(self):
+        """Non-dict additional_information values are skipped gracefully."""
+        prompts = [
+            {"additional_information": "invalid_value"},
+            {"additional_information": None},
+            {"additional_information": 42},
+        ]
+        extra = self._build_extra(prompts)
+        assert extra == {}
+
+
+# ---------------------------------------------------------------------------
+# 3. Empty batch guard — text-only reply skips diffusion stages
+# ---------------------------------------------------------------------------
+
+class TestEmptyBatchGuard:
+    """Verify that the empty batch guard in the routing loop works correctly.
+
+    When the thinker produces a text-only reply (no image or audio tokens),
+    thinker2vision_decoder / thinker2audio_decoder return an empty list.
+    The routing loop must detect this and skip that diffusion stage (leaving
+    `any_forwarded` unchanged for that modality) rather than forwarding an
+    empty request list to the diffusion engine.
+
+    The property being tested: if ALL downstream stages return empty inputs,
+    `any_forwarded` must remain False and the request is counted as complete.
+    """
+
+    def _simulate_routing(
+        self,
+        connectors: dict,
+        stage_id: int,
+        stage_inputs: dict[int, list],
+    ) -> tuple[bool, list[int]]:
+        """Simulate the routing loop body for one completed request.
+
+        Returns:
+            (any_forwarded, forwarded_stage_ids)
+        """
+        downstream_stage_ids = sorted([
+            int(to) for (frm, to) in connectors.keys()
+            if frm == str(stage_id)
+        ])
+        any_forwarded = False
+        forwarded_to = []
+        for next_stage_id in downstream_stage_ids:
+            next_inputs = stage_inputs.get(next_stage_id, [])
+            if not next_inputs:
+                # Replicate: "No inputs for this modality, skip"
+                continue
+            # Would send here; record as forwarded
+            any_forwarded = True
+            forwarded_to.append(next_stage_id)
+        return any_forwarded, forwarded_to
+
+    def test_text_only_reply_skips_all_diffusion_stages(self):
+        """Text-only thinker output → no forwarding to vision or audio decoders."""
+        connectors = {("0", "1"): object(), ("0", "2"): object()}
+        # Both downstream processors return empty lists (no tokens for either)
+        stage_inputs = {1: [], 2: []}
+        any_forwarded, forwarded_to = self._simulate_routing(connectors, 0, stage_inputs)
+        assert not any_forwarded, (
+            "Text-only reply must not forward to any diffusion stage"
+        )
+        assert forwarded_to == []
+
+    def test_image_only_output_skips_audio_stage(self):
+        """Vision tokens present but no audio → only vision decoder receives request."""
+        connectors = {("0", "1"): object(), ("0", "2"): object()}
+        stage_inputs = {
+            1: [{"prompt_token_ids": list(range(729))}],  # vision tokens present
+            2: [],  # no audio tokens
+        }
+        any_forwarded, forwarded_to = self._simulate_routing(connectors, 0, stage_inputs)
+        assert any_forwarded
+        assert forwarded_to == [1]
+        assert 2 not in forwarded_to
+
+    def test_audio_only_output_skips_vision_stage(self):
+        """Audio tokens present but no vision → only audio decoder receives request."""
+        connectors = {("0", "1"): object(), ("0", "2"): object()}
+        stage_inputs = {
+            1: [],  # no vision tokens
+            2: [{"additional_information": {"audio_tokens": [[1, 2, 3]]}}],
+        }
+        any_forwarded, forwarded_to = self._simulate_routing(connectors, 0, stage_inputs)
+        assert any_forwarded
+        assert forwarded_to == [2]
+        assert 1 not in forwarded_to
+
+    def test_both_modalities_forward_to_both_stages(self):
+        """Both vision and audio tokens → both decoders receive requests."""
+        connectors = {("0", "1"): object(), ("0", "2"): object()}
+        stage_inputs = {
+            1: [{"prompt_token_ids": list(range(729))}],
+            2: [{"additional_information": {"audio_tokens": [[1, 2, 3]]}}],
+        }
+        any_forwarded, forwarded_to = self._simulate_routing(connectors, 0, stage_inputs)
+        assert any_forwarded
+        assert sorted(forwarded_to) == [1, 2]
+
+    def test_leaf_stage_never_forwards(self):
+        """Leaf stages (no outgoing connectors) always result in any_forwarded=False."""
+        connectors = {("0", "1"): object(), ("0", "2"): object()}
+        # stage_id=1 (vision decoder) has no outgoing edges
+        stage_inputs = {}  # irrelevant since no downstream stages exist
+        any_forwarded, forwarded_to = self._simulate_routing(connectors, 1, stage_inputs)
+        assert not any_forwarded
+        assert forwarded_to == []