From 487345c6bc8b6517ee47bcd8faafd31ef8626e6c Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Sat, 18 Apr 2026 22:40:42 +0530
Subject: [PATCH 01/54] feat(ming-tts): add dense omni pipeline

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 docs/models/supported_models.md               |   1 +
 .../examples/offline_inference/ming_tts.md    | 131 +++
 .../examples/online_serving/ming_tts.md       | 163 ++++
 examples/offline_inference/ming_tts/README.md | 224 +++++
 .../offline_inference/ming_tts/end2end.py     | 654 +++++++++++++
 examples/online_serving/ming_tts/README.md    | 312 +++++++
 .../ming_tts/openai_speech_client.py          | 223 +++++
 examples/online_serving/ming_tts/run_curl.sh  | 217 +++++
 .../online_serving/ming_tts/run_server.sh     |  22 +
 .../test_chunk_transfer_adapter.py            |  25 +
 tests/e2e/offline_inference/test_ming_tts.py  | 231 +++++
 tests/e2e/online_serving/test_ming_tts.py     |  95 ++
 tests/engine/test_async_omni_engine_input.py  |  54 ++
 .../openai_api/test_serving_speech.py         | 378 ++++++++
 .../ming_tts/test_ming_tts_components.py      | 505 ++++++++++
 .../ming_tts/test_ming_tts_config_shim.py     |  51 ++
 .../models/ming_tts/test_ming_tts_loaders.py  | 524 +++++++++++
 .../ming_tts/test_ming_tts_prompt_builder.py  | 375 ++++++++
 .../test_ming_tts_async_chunk.py              | 421 +++++++++
 tests/worker/test_ming_tts_runner.py          | 674 ++++++++++++++
 tests/worker/test_omni_gpu_model_runner.py    |  22 +
 vllm_omni/engine/arg_utils.py                 |   4 +
 vllm_omni/engine/async_omni_engine.py         |  39 +-
 vllm_omni/engine/stage_init_utils.py          |   8 +
 .../entrypoints/openai/protocol/audio.py      |  66 +-
 .../entrypoints/openai/serving_speech.py      | 287 +++++-
 vllm_omni/inputs/preprocess.py                |  26 +
 .../models/ming_tts/__init__.py               |  13 +
 .../ming_tts/audio_tokenizer/__init__.py      |   2 +
 .../ming_tts/audio_tokenizer/audio_encoder.py | 135 +++
 .../configuration_audio_vae.py                |  40 +
 .../models/ming_tts/audio_tokenizer/istft.py  | 188 ++++
 .../audio_tokenizer/modeling_audio_vae.py     | 178 ++++
 .../ming_tts/audio_tokenizer/vae_modules.py   | 208 +++++
 .../models/ming_tts/config_ming_tts.py        | 364 ++++++++
 .../ming_tts/configuration_ming_dense.py      |  57 ++
 .../models/ming_tts/fm/__init__.py            |   2 +
 .../model_executor/models/ming_tts/fm/cfm.py  | 207 +++++
 .../model_executor/models/ming_tts/fm/dit.py  | 216 +++++
 .../models/ming_tts/fm/flowloss.py            |  54 ++
 .../models/ming_tts/fm/modules.py             | 147 +++
 .../model_executor/models/ming_tts/ingress.py | 177 ++++
 .../models/ming_tts/ming_tts.py               | 581 ++++++++++++
 .../models/ming_tts/ming_tts_audio_vae.py     | 318 +++++++
 .../models/ming_tts/ming_tts_llm.py           | 864 ++++++++++++++++++
 .../models/ming_tts/prompt_builder.py         | 429 +++++++++
 .../models/ming_tts/speaker_extractor.py      |  66 ++
 vllm_omni/model_executor/models/registry.py   |  16 +
 .../stage_configs/ming_tts.yaml               |  65 ++
 .../stage_configs/ming_tts_async_chunk.yaml   |  86 ++
 .../stage_input_processors/ming_tts.py        | 278 ++++++
 vllm_omni/worker/gpu_model_runner.py          |   2 +
 52 files changed, 10402 insertions(+), 23 deletions(-)
 create mode 100644 docs/user_guide/examples/offline_inference/ming_tts.md
 create mode 100644 docs/user_guide/examples/online_serving/ming_tts.md
 create mode 100644 examples/offline_inference/ming_tts/README.md
 create mode 100644 examples/offline_inference/ming_tts/end2end.py
 create mode 100644 examples/online_serving/ming_tts/README.md
 create mode 100644 examples/online_serving/ming_tts/openai_speech_client.py
 create mode 100755 examples/online_serving/ming_tts/run_curl.sh
 create mode 100755 examples/online_serving/ming_tts/run_server.sh
 create mode 100644 tests/e2e/offline_inference/test_ming_tts.py
 create mode 100644 tests/e2e/online_serving/test_ming_tts.py
 create mode 100644 tests/model_executor/models/ming_tts/test_ming_tts_components.py
 create mode 100644 tests/model_executor/models/ming_tts/test_ming_tts_config_shim.py
 create mode 100644 tests/model_executor/models/ming_tts/test_ming_tts_loaders.py
 create mode 100644 tests/model_executor/models/ming_tts/test_ming_tts_prompt_builder.py
 create mode 100644 tests/model_executor/stage_input_processors/test_ming_tts_async_chunk.py
 create mode 100644 tests/worker/test_ming_tts_runner.py
 create mode 100644 vllm_omni/model_executor/models/ming_tts/__init__.py
 create mode 100644 vllm_omni/model_executor/models/ming_tts/audio_tokenizer/__init__.py
 create mode 100644 vllm_omni/model_executor/models/ming_tts/audio_tokenizer/audio_encoder.py
 create mode 100644 vllm_omni/model_executor/models/ming_tts/audio_tokenizer/configuration_audio_vae.py
 create mode 100644 vllm_omni/model_executor/models/ming_tts/audio_tokenizer/istft.py
 create mode 100644 vllm_omni/model_executor/models/ming_tts/audio_tokenizer/modeling_audio_vae.py
 create mode 100644 vllm_omni/model_executor/models/ming_tts/audio_tokenizer/vae_modules.py
 create mode 100644 vllm_omni/model_executor/models/ming_tts/config_ming_tts.py
 create mode 100644 vllm_omni/model_executor/models/ming_tts/configuration_ming_dense.py
 create mode 100644 vllm_omni/model_executor/models/ming_tts/fm/__init__.py
 create mode 100644 vllm_omni/model_executor/models/ming_tts/fm/cfm.py
 create mode 100644 vllm_omni/model_executor/models/ming_tts/fm/dit.py
 create mode 100644 vllm_omni/model_executor/models/ming_tts/fm/flowloss.py
 create mode 100644 vllm_omni/model_executor/models/ming_tts/fm/modules.py
 create mode 100644 vllm_omni/model_executor/models/ming_tts/ingress.py
 create mode 100644 vllm_omni/model_executor/models/ming_tts/ming_tts.py
 create mode 100644 vllm_omni/model_executor/models/ming_tts/ming_tts_audio_vae.py
 create mode 100644 vllm_omni/model_executor/models/ming_tts/ming_tts_llm.py
 create mode 100644 vllm_omni/model_executor/models/ming_tts/prompt_builder.py
 create mode 100644 vllm_omni/model_executor/models/ming_tts/speaker_extractor.py
 create mode 100644 vllm_omni/model_executor/stage_configs/ming_tts.yaml
 create mode 100644 vllm_omni/model_executor/stage_configs/ming_tts_async_chunk.yaml
 create mode 100644 vllm_omni/model_executor/stage_input_processors/ming_tts.py

diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 8ece14f9c00..b298415e177 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -55,6 +55,7 @@ th {
 | `Qwen3TTSForConditionalGeneration` | Qwen3-TTS-12Hz-1.7B-CustomVoice | `Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
 | `Qwen3TTSForConditionalGeneration` | Qwen3-TTS-12Hz-1.7B-VoiceDesign | `Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
 | `Qwen3TTSForConditionalGeneration` | Qwen3-TTS-12Hz-1.7B-Base | `Qwen/Qwen3-TTS-12Hz-0.6B-Base` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
+| `MingTTSForConditionalGeneration` | Ming-omni-tts-0.5B | `inclusionAI/Ming-omni-tts-0.5B` | ✅︎ | | | |
 | `NextStep11Pipeline` | NextStep-1.1 | `stepfun-ai/NextStep-1.1` | ✅︎ | ✅︎ | | ✅︎ |
 | `MiMoAudioForConditionalGeneration` | MiMo-Audio-7B-Instruct | `XiaomiMiMo/MiMo-Audio-7B-Instruct` | ✅︎ | ✅︎ | | |
 | `Flux2Pipeline` | FLUX.2-dev | `black-forest-labs/FLUX.2-dev` | ✅︎ | ✅︎ | | |
diff --git a/docs/user_guide/examples/offline_inference/ming_tts.md b/docs/user_guide/examples/offline_inference/ming_tts.md
new file mode 100644
index 00000000000..7a8cd65ed32
--- /dev/null
+++ b/docs/user_guide/examples/offline_inference/ming_tts.md
@@ -0,0 +1,131 @@
+# Ming-omni-tts
+
+Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/ming_tts>.
+
+This directory contains an offline Ming example that uses the in-repo Ming prompt builder directly. It covers the upstream dense TTS cookbook cases: style, IP, music-only generation (bgm), emotion, basic, dialect, zero-shot cloning, podcast, speech+bgm, and speech+sound.
+
+## Quick Start
+
+Run a zero-speaker style case:
+
+```bash
+python examples/offline_inference/ming_tts/end2end.py \
+    --case style \
+    --stage-configs-path vllm_omni/model_executor/stage_configs/ming_tts.yaml \
+    --enforce-eager
+```
+
+Run emotion-controlled speech:
+
+```bash
+python examples/offline_inference/ming_tts/end2end.py \
+    --case emotion \
+    --ref-audio /path/to/emotion_prompt.wav \
+    --stage-configs-path vllm_omni/model_executor/stage_configs/ming_tts.yaml \
+    --enforce-eager
+```
+
+Run zero-shot cloning with a transcript:
+
+```bash
+python examples/offline_inference/ming_tts/end2end.py \
+    --case zero_shot \
+    --ref-audio /path/to/reference.wav \
+    --ref-text "在此奉劝大家别乱打美白针。" \
+    --stage-configs-path vllm_omni/model_executor/stage_configs/ming_tts.yaml \
+    --enforce-eager
+```
+
+Run podcast generation:
+
+```bash
+python examples/offline_inference/ming_tts/end2end.py \
+    --case podcast \
+    --ref-audio-paths /path/to/CTS-CN-F2F-2019-11-11-423-012-A.wav /path/to/CTS-CN-F2F-2019-11-11-423-012-B.wav \
+    --stage-configs-path vllm_omni/model_executor/stage_configs/ming_tts.yaml \
+    --enforce-eager
+```
+
+Run with stats and a manifest:
+
+```bash
+python examples/offline_inference/ming_tts/end2end.py \
+    --case style \
+    --stage-configs-path vllm_omni/model_executor/stage_configs/ming_tts.yaml \
+    --enforce-eager \
+    --enable-stats \
+    --stats-log-file output_audio/ming_style_pipeline.log \
+    --metadata-json output_audio/ming_style_manifest.json
+```
+
+## Built-in Cases
+
+- `style`: zero-speaker style-conditioned speech
+- `ip`: zero-speaker IP voice generation
+- `bgm`: music generation
+- `emotion`: reference-audio speech with emotion control
+- `basic`: reference-audio cloning with speed / pitch / volume control
+- `dialect`: reference-audio cloning with dialect control
+- `zero_shot`: reference-audio cloning with explicit transcript
+- `podcast`: multi-reference dialogue generation with automatic speaker embedding extraction
+- `speech_bgm`: speech with background music conditioning
+- `speech_sound`: speech with environment sound conditioning
+
+`TTA` from the upstream Ming notebook is not included here because it uses `inclusionAI/Ming-omni-tta-0.5B`, not the dense TTS model covered by this example.
+
+## Streaming
+
+Use async_chunk streaming with `AsyncOmni`:
+
+```bash
+python examples/offline_inference/ming_tts/end2end.py \
+    --case basic \
+    --ref-audio /path/to/10002287-00000095.wav \
+    --streaming \
+    --stage-configs-path vllm_omni/model_executor/stage_configs/ming_tts_async_chunk.yaml \
+    --enforce-eager
+```
+
+`--streaming` currently supports one prompt per process invocation. Use
+blocking mode for `--num-prompts > 1`.
+
+## Validation matrix
+
+The example is intended to cover the dense TTS workflows used by the Ming
+validation helper:
+
+| Case | Blocking | Async chunk | Extra inputs |
+|---|---:|---:|---|
+| `style` | Yes | Optional smoke test | none |
+| `ip` | Yes | Optional smoke test | none |
+| `bgm` | Yes | Optional smoke test | none |
+| `emotion` | Yes | Yes | reference WAV |
+| `basic` | Yes | Yes | reference WAV |
+| `dialect` | Yes | Yes | reference WAV |
+| `zero_shot` | Yes | Yes | reference WAV and transcript |
+| `podcast` | Yes | Yes | two reference WAVs |
+| `speech_bgm` | Yes | Yes | reference WAV |
+| `speech_sound` | Yes | Yes | reference WAV |
+
+The offline example also exposes vLLM-Omni runtime/reporting controls such as:
+
+- `--num-prompts`
+- `--enable-stats`
+- `--stats-log-file`
+- `--metadata-json`
+- `--stage-init-timeout`
+- `--init-timeout`
+- `--batch-timeout`
+- `--worker-backend`
+- `--ray-address`
+
+## Example materials
+
+??? abstract "README.md"
+    ``````md
+    --8<-- "examples/offline_inference/ming_tts/README.md"
+    ``````
+??? abstract "end2end.py"
+    ``````py
+    --8<-- "examples/offline_inference/ming_tts/end2end.py"
+    ``````
diff --git a/docs/user_guide/examples/online_serving/ming_tts.md b/docs/user_guide/examples/online_serving/ming_tts.md
new file mode 100644
index 00000000000..e5bc5144dda
--- /dev/null
+++ b/docs/user_guide/examples/online_serving/ming_tts.md
@@ -0,0 +1,163 @@
+# Ming-omni-tts
+
+Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/online_serving/ming_tts>.
+
+This example shows how to serve Ming through the OpenAI-compatible `/v1/audio/speech` endpoint. The server builds Ming prompts directly with the in-repo prompt builder, so online requests support Ming-specific structured controls instead of the Qwen placeholder path.
+
+## Installation
+
+Please refer to the [README.md](https://github.com/vllm-project/vllm-omni/tree/main/README.md) for installation instructions.
+
+## Launch the Server
+
+```bash
+vllm-omni serve inclusionAI/Ming-omni-tts-0.5B \
+    --stage-configs-path vllm_omni/model_executor/stage_configs/ming_tts_async_chunk.yaml \
+    --omni \
+    --port 8091 \
+    --enforce-eager
+```
+
+Or:
+
+```bash
+cd examples/online_serving/ming_tts
+./run_server.sh
+```
+
+The canonical Ming online client is `openai_speech_client.py`. It targets the
+local vLLM-Omni server, not OpenAI's cloud API, so `api_key=EMPTY` is enough
+for local testing.
+
+## Example Requests
+
+Basic TTS:
+
+```bash
+python openai_speech_client.py \
+    --text "你好，这是 Ming 在线语音合成测试。"
+```
+
+Style-conditioned speech:
+
+```bash
+python openai_speech_client.py \
+    --text "我会一直在这里陪着你。" \
+    --instructions "轻柔的ASMR耳语，慢速，贴近麦克风"
+```
+
+Structured Ming control:
+
+```bash
+python openai_speech_client.py \
+    --text "我觉得社会企业同个人都有责任" \
+    --instruction-json '{"方言":"广粤话"}'
+```
+
+IP voice generation:
+
+```bash
+python openai_speech_client.py \
+    --text "这款产品的名字，叫变态坑爹牛肉丸。" \
+    --voice 灵小甄
+```
+
+Reference-audio cloning:
+
+Use `ref_audio` by itself for Ming prompt-waveform conditioning. Add
+`ref_text` when the request is transcript cloning, such as zero-shot or
+podcast-style prompts.
+
+```bash
+python openai_speech_client.py \
+    --task-type Base \
+    --text "我们的愿景是构建未来服务业的数字化基础设施。" \
+    --ref-audio /path/to/reference.wav \
+    --ref-text "在此奉劝大家别乱打美白针。"
+```
+
+Speaker-embedding cloning:
+
+```bash
+python openai_speech_client.py \
+    --task-type Base \
+    --text "你好，这是一段使用说话人向量的合成语音。" \
+    --speaker-embedding /path/to/ming_speaker_embedding.json
+```
+
+Streaming PCM:
+
+```bash
+python openai_speech_client.py \
+    --text "你好，这是流式输出测试。" \
+    --instructions "平静，普通话" \
+    --stream \
+    --output ming_output.pcm
+```
+
+## Curl Helper
+
+Use the bundled helper for common request types:
+
+```bash
+./run_curl.sh basic
+./run_curl.sh style
+./run_curl.sh ip
+REF_AUDIO=/path/to/emotion_prompt.wav ./run_curl.sh emotion
+REF_AUDIO=/path/to/yue_prompt.wav ./run_curl.sh dialect
+REF_AUDIO=/path/to/reference.wav REF_TEXT="在此奉劝大家别乱打美白针。" ./run_curl.sh zero_shot
+REF_AUDIO=/path/to/speaker_1.wav REF_AUDIO_2=/path/to/speaker_2.wav REF_TEXT="speaker_1:你好。 speaker_2:你好。" ./run_curl.sh podcast
+REF_AUDIO=/path/to/00000309-00000300.wav ./run_curl.sh speech_bgm
+REF_AUDIO=/path/to/00000309-00000300.wav ./run_curl.sh speech_sound
+REF_AUDIO=/path/to/reference.wav REF_TEXT="在此奉劝大家别乱打美白针。" ./run_curl.sh clone_ref_audio
+SPEAKER_EMBEDDING=/path/to/ming_speaker_embedding.json ./run_curl.sh clone_embedding
+./run_curl.sh stream
+```
+
+## Audio Inputs
+
+- `ref_audio` accepts a local path, remote URL, or `data:` URL
+- The Python client converts local files into a base64 `data:` URL
+- `speaker_embedding` must be a JSON file with exactly 192 numeric values
+- Ming prompt-waveform cases can use `ref_audio` without `ref_text`
+- Zero-shot and podcast-style transcript cloning should include `ref_text`
+
+The bundled `run_curl.sh basic` mode is plain/default TTS and does not require
+`REF_AUDIO`. The upstream cookbook-style `basic` case uses `ref_audio` plus
+structured speed / pitch / volume instructions.
+
+## Field Mapping
+
+For Ming, the generic OpenAI request fields map to Ming controls like this:
+
+- `input` -> target text
+- `instructions` -> Ming instruction string, or a JSON string for the structured Ming control object
+- `voice` -> Ming `IP`
+- `language` -> Ming `方言`
+- `ref_audio` -> Ming prompt waveform
+- `ref_text` -> optional transcript for zero-shot and podcast-style cloning
+- `speaker_embedding` -> 192-d Ming speaker embedding
+
+## Voice Listing
+
+- `/v1/audio/voices` lists uploaded voices for Ming.
+- Built-in Ming IP labels can still be used as `voice`, but they are not enumerated by the API.
+
+## Example materials
+
+??? abstract "README.md"
+    ``````md
+    --8<-- "examples/online_serving/ming_tts/README.md"
+    ``````
+??? abstract "run_server.sh"
+    ``````sh
+    --8<-- "examples/online_serving/ming_tts/run_server.sh"
+    ``````
+??? abstract "openai_speech_client.py"
+    ``````py
+    --8<-- "examples/online_serving/ming_tts/openai_speech_client.py"
+    ``````
+??? abstract "run_curl.sh"
+    ``````sh
+    --8<-- "examples/online_serving/ming_tts/run_curl.sh"
+    ``````
diff --git a/examples/offline_inference/ming_tts/README.md b/examples/offline_inference/ming_tts/README.md
new file mode 100644
index 00000000000..c67772c43c1
--- /dev/null
+++ b/examples/offline_inference/ming_tts/README.md
@@ -0,0 +1,224 @@
+# Ming-omni-tts Offline Inference
+
+`end2end.py` runs Ming dense 0.5B end to end with vLLM-Omni. It uses the in-repo Ming prompt builder directly, so the example request shape matches the real integration instead of a simplified wrapper.
+
+## Model Overview
+
+Ming dense 0.5B is exposed here as a two-stage offline pipeline:
+
+- **Stage 0**: Qwen2-based AR generation with Ming prompt formatting and inline flow controls
+- **Stage 1**: audio VAE decode to mono 44.1 kHz waveform
+
+The example supports both:
+
+- **Sequential eager** via `ming_tts.yaml`
+- **Async chunk eager** via `ming_tts_async_chunk.yaml`
+
+## Setup
+
+Install vLLM-Omni with the platform requirements for your accelerator:
+
+```bash
+uv pip install -e .
+```
+
+The Ming offline example does not require a separate upstream Ming package.
+Reference-audio cases use the repo dependencies for audio loading,
+resampling, and CampPlus speaker extraction, including `soundfile`,
+`torchaudio`, and `onnxruntime`.
+
+## Supported Cases
+
+These cases cover the upstream dense TTS cookbook surface that maps cleanly onto the current vLLM-Omni example:
+
+- `style`: zero-speaker style-conditioned speech
+- `ip`: zero-speaker IP voice generation
+- `bgm`: music-only generation
+- `emotion`: reference-audio speech with emotion control
+- `basic`: reference-audio speech with speed / pitch / volume control
+- `dialect`: reference-audio speech with dialect control
+- `zero_shot`: reference-audio cloning with explicit transcript
+- `podcast`: multi-reference dialogue generation with automatic speaker embedding extraction
+- `speech_bgm`: speech with background music conditioning
+- `speech_sound`: speech with environmental sound conditioning
+
+Not included:
+
+- `TTA` from the upstream cookbook. That notebook switches to `inclusionAI/Ming-omni-tta-0.5B`, which is a different model family and is out of scope for this dense TTS example.
+
+## Quick Start
+
+Run the zero-speaker style example:
+
+```bash
+python examples/offline_inference/ming_tts/end2end.py \
+    --case style \
+    --stage-configs-path vllm_omni/model_executor/stage_configs/ming_tts.yaml \
+    --enforce-eager
+```
+
+Run zero-shot cloning with a transcript:
+
+```bash
+python examples/offline_inference/ming_tts/end2end.py \
+    --case zero_shot \
+    --ref-audio /path/to/10002287-00000094.wav \
+    --ref-text "在此奉劝大家别乱打美白针。" \
+    --stage-configs-path vllm_omni/model_executor/stage_configs/ming_tts.yaml \
+    --enforce-eager
+```
+
+Run emotion-controlled speech:
+
+```bash
+python examples/offline_inference/ming_tts/end2end.py \
+    --case emotion \
+    --ref-audio /path/to/emotion_prompt.wav \
+    --stage-configs-path vllm_omni/model_executor/stage_configs/ming_tts.yaml \
+    --enforce-eager
+```
+
+Run podcast generation with two reference clips:
+
+```bash
+python examples/offline_inference/ming_tts/end2end.py \
+    --case podcast \
+    --ref-audio-paths /path/to/CTS-CN-F2F-2019-11-11-423-012-A.wav /path/to/CTS-CN-F2F-2019-11-11-423-012-B.wav \
+    --stage-configs-path vllm_omni/model_executor/stage_configs/ming_tts.yaml \
+    --enforce-eager
+```
+
+The script automatically extracts one 192-d speaker embedding per reference WAV using the Ming model's `campplus.onnx`.
+
+If you already have precomputed multi-speaker embeddings, you can override extraction with:
+
+```bash
+--speaker-embedding /path/to/podcast_speaker_embeddings.json
+```
+
+where the JSON is a list of speaker embeddings, one 192-d vector per speaker.
+
+Use async_chunk streaming:
+
+```bash
+python examples/offline_inference/ming_tts/end2end.py \
+    --case basic \
+    --ref-audio /path/to/10002287-00000095.wav \
+    --streaming \
+    --stage-configs-path vllm_omni/model_executor/stage_configs/ming_tts_async_chunk.yaml \
+    --enforce-eager
+```
+
+`--streaming` uses `AsyncOmni` and the async_chunk stage config. It currently
+supports one prompt per process invocation; use blocking mode for
+`--num-prompts > 1`.
+
+Collect runtime stats and a manifest:
+
+```bash
+python examples/offline_inference/ming_tts/end2end.py \
+    --case style \
+    --stage-configs-path vllm_omni/model_executor/stage_configs/ming_tts.yaml \
+    --enforce-eager \
+    --enable-stats \
+    --stats-log-file output_audio/ming_style_pipeline.log \
+    --metadata-json output_audio/ming_style_manifest.json
+```
+
+## Reference Fixtures
+
+The upstream Ming cookbook uses these public audio fixtures from `inclusionAI/Ming-omni-tts/data/wavs`:
+
+- `10002287-00000094.wav` for zero-shot cloning
+- `10002287-00000095.wav` for `basic`
+- `emotion_prompt.wav` for `emotion`
+- `yue_prompt.wav` for `dialect`
+- `00000309-00000300.wav` for `speech_bgm` and `speech_sound`
+- `CTS-CN-F2F-2019-11-11-423-012-A.wav` and `CTS-CN-F2F-2019-11-11-423-012-B.wav` for `podcast`
+
+## Validation Matrix
+
+The repo-facing example is intended to cover the same dense TTS workflows used
+by the local Ming validation script:
+
+| Case | Blocking `ming_tts.yaml` | Async chunk `ming_tts_async_chunk.yaml` | Extra inputs |
+|---|---:|---:|---|
+| `style` | Yes | Optional smoke test | none |
+| `ip` | Yes | Optional smoke test | none |
+| `bgm` | Yes | Optional smoke test | none |
+| `emotion` | Yes | Yes | `--ref-audio emotion_prompt.wav` |
+| `basic` | Yes | Yes | `--ref-audio 10002287-00000095.wav` |
+| `dialect` | Yes | Yes | `--ref-audio yue_prompt.wav` |
+| `zero_shot` | Yes | Yes | `--ref-audio 10002287-00000094.wav --ref-text ...` |
+| `podcast` | Yes | Yes | two `--ref-audio-paths` |
+| `speech_bgm` | Yes | Yes | `--ref-audio 00000309-00000300.wav` |
+| `speech_sound` | Yes | Yes | `--ref-audio 00000309-00000300.wav` |
+
+## Validated Outputs
+
+Validation on an L4 GPU completed the full blocking matrix and the default
+async_chunk matrix. Default async_chunk matched blocking output frame counts
+and Stage-1 patch counts for every case:
+
+| Case | Blocking frames / patches / sec | Async chunk frames / patches / sec |
+|---|---:|---:|
+| `style` | 409248 / 29 / 9.28 | 409248 / 29 / 9.28 |
+| `ip` | 183456 / 13 / 4.16 | 183456 / 13 / 4.16 |
+| `bgm` | 1326528 / 94 / 30.08 | 1326528 / 94 / 30.08 |
+| `emotion` | 324576 / 23 / 7.36 | 324576 / 23 / 7.36 |
+| `basic` | 211680 / 15 / 4.80 | 211680 / 15 / 4.80 |
+| `dialect` | 239904 / 17 / 5.44 | 239904 / 17 / 5.44 |
+| `zero_shot` | 409248 / 29 / 9.28 | 409248 / 29 / 9.28 |
+| `podcast` | 437472 / 31 / 9.92 | 437472 / 31 / 9.92 |
+| `speech_bgm` | 296352 / 21 / 6.72 | 296352 / 21 / 6.72 |
+| `speech_sound` | 352800 / 25 / 8.00 | 352800 / 25 / 8.00 |
+
+## Key Arguments
+
+| Argument | Description |
+|---|---|
+| `--model` | Hugging Face repo or local Ming checkpoint path |
+| `--stage-configs-path` | Stage config YAML. Use `ming_tts.yaml` for blocking generation or `ming_tts_async_chunk.yaml` for streaming |
+| `--case` | Built-in demo case |
+| `--ref-audio` | Single reference wav path for cloning-style cases |
+| `--ref-audio-paths` | Multiple reference wav paths, used by `podcast` |
+| `--ref-text` | Reference transcript. Required for `zero_shot` |
+| `--instructions` | Free-form Ming instruction string |
+| `--instruction-json` | Structured Ming instruction JSON |
+| `--speaker-embedding` | JSON file containing one 192-d speaker embedding, or a list of 192-d embeddings (one per speaker) |
+| `--extract-speaker-embeddings` | Force CampPlus speaker extraction from the provided reference audio paths |
+| `--max-decode-steps` | Override `ming_max_decode_steps` |
+| `--num-prompts` | Repeat the same case N times. Outputs are indexed when `N > 1` |
+| `--streaming` | Use `AsyncOmni` and async_chunk transport |
+| `--enforce-eager` | Recommended for Ming dense; non-eager is out of scope |
+| `--enable-stats` / `--log-stats` | Enable vLLM-Omni per-request stats logging |
+| `--stats-log-file` | Optional path for the stats log |
+| `--metadata-json` | Optional path for the run manifest JSON |
+| `--stage-init-timeout` | Per-stage initialization timeout in seconds |
+| `--init-timeout` | Total initialization timeout in seconds |
+| `--batch-timeout` | Batch timeout in seconds |
+| `--worker-backend` | `multi_process` or `ray` |
+| `--ray-address` | Ray cluster address when using `--worker-backend ray` |
+
+## Output
+
+- The script writes one mono 44.1 kHz WAV file per generated prompt
+- Default output directory: `output_audio/`
+- Default filename: `ming_<case>.wav`
+- When `--num-prompts > 1`, outputs are indexed as `ming_<case>_00000.wav`, `..._00001.wav`, etc.
+- When stats are enabled, the script can also write:
+  - a stats log file such as `ming_style_pipeline.log`
+  - a manifest JSON with per-output metadata, stage durations, peak memory info,
+    and streaming client latency metrics when `--streaming` is used
+
+## Notes
+
+- `style`, `ip`, and `bgm` do not require a reference clip (`style` and `ip` are zero-speaker paths)
+- `emotion`, `basic`, `dialect`, `speech_bgm`, and `speech_sound` require one reference clip
+- `zero_shot` requires both `--ref-audio` and `--ref-text`
+- `podcast` requires at least two reference clips via `--ref-audio-paths`
+- `podcast` automatically extracts one speaker embedding per reference clip
+- `--speaker-embedding` may contain either one 192-d vector or a list of 192-d vectors
+- `--enforce-eager` was used for the validated runs
+- Validation on the L4 GPU used SDPA for the Ming audio VAE instead of
+  FlashAttention2; FlashAttention2 remains the preferred default when available.
diff --git a/examples/offline_inference/ming_tts/end2end.py b/examples/offline_inference/ming_tts/end2end.py
new file mode 100644
index 00000000000..9e9742f4e7e
--- /dev/null
+++ b/examples/offline_inference/ming_tts/end2end.py
@@ -0,0 +1,654 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Offline inference demo for Ming-omni-tts via vLLM Omni."""
+
+import asyncio
+import json
+import os
+import time
+import uuid
+import wave
+from pathlib import Path
+
+import soundfile as sf
+import torch
+import torchaudio
+from transformers import AutoTokenizer
+from vllm import SamplingParams
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+
+from vllm_omni import AsyncOmni, Omni
+from vllm_omni.model_executor.models.ming_tts.config_ming_tts import (
+    KEY_CFG,
+    KEY_MAX_DECODE_STEPS,
+    KEY_SIGMA,
+    KEY_SPEAKER_EMBEDDING,
+    KEY_TEMPERATURE,
+    SAMPLE_RATE,
+    TEXT_EOS_TOKEN_ID,
+)
+from vllm_omni.model_executor.models.ming_tts.prompt_builder import build_ming_dense_prompt
+from vllm_omni.model_executor.models.ming_tts.speaker_extractor import MingSpeakerEmbeddingExtractor
+
+DEFAULT_MODEL = "inclusionAI/Ming-omni-tts-0.5B"  # default for --model
+DEFAULT_STAGE_CONFIG = "vllm_omni/model_executor/stage_configs/ming_tts.yaml"  # used without --streaming
+DEFAULT_STREAM_STAGE_CONFIG = "vllm_omni/model_executor/stage_configs/ming_tts_async_chunk.yaml"  # with --streaming
+DEFAULT_OUTPUT_DIR = "output_audio"  # default for --output-dir
+DEFAULT_SPEECH_PROMPT = "Please generate speech based on the following description.\n"
+DEFAULT_MUSIC_PROMPT = "Please generate music based on the following description.\n"
+DEFAULT_PODCAST_TEXT = (  # default "text" of the "podcast" case
+    " speaker_1:你可以说一下，就大概说一下，可能虽然我也不知道，我看过那部电影没有。\n"
+    " speaker_2:就是那个叫什么，变相一节课的嘛。\n"
+    " speaker_1:嗯。\n"
+    " speaker_2:一部搞笑的电影。\n"
+    " speaker_1:一部搞笑的。\n"
+)
+DEFAULT_PODCAST_PROMPT_TEXT = (  # default "prompt_text" of the "podcast" case
+    " speaker_1:并且我们还要进行每个月还要考核 笔试的话还要进行笔试，做个，当服务员还要去笔试了\n"
+    " speaker_2:对啊，这真的很奇怪，就是 单纯的因，单纯自己工资不高，只是因为可能人家那个店比较出名一点，就对你苛刻要求\n"
+)
+
+CASE_DEFAULTS = {  # per-case defaults keyed by the --case choices; read by _build_prompt and main
+    "style": {
+        "prompt": DEFAULT_SPEECH_PROMPT,
+        "text": "我会一直在这里陪着你，直到你慢慢、慢慢地沉入那个最温柔的梦里……好吗？",
+        "instruction": {
+            "风格": (
+                "这是一种ASMR耳语，属于一种旨在引发特殊感官体验的创意风格。"
+                "这个女性使用轻柔的普通话进行耳语，声音气音成分重。"
+                "音量极低，紧贴麦克风，语速极慢，旨在制造触发听者颅内快感的声学刺激。"
+            )
+        },
+        "use_zero_spk_emb": True,  # forwarded by _build_prompt unless a reference waveform/embedding exists
+        "max_decode_steps": 200,  # default decode-step budget; --max-decode-steps overrides it
+    },
+    "ip": {
+        "prompt": DEFAULT_SPEECH_PROMPT,
+        "text": "这款产品的名字，叫变态坑爹牛肉丸。",
+        "instruction": {"IP": "灵小甄"},
+        "use_zero_spk_emb": True,
+        "max_decode_steps": 200,
+    },
+    "bgm": {
+        "prompt": DEFAULT_MUSIC_PROMPT,
+        "text": "Genre: 电子舞曲. Mood: 自信 / 坚定. Instrument: 架子鼓. Theme: 节日. Duration: 30s.",
+        "instruction": None,
+        "use_zero_spk_emb": False,
+        "max_decode_steps": 400,
+    },
+    "tta": {
+        "prompt": "Please generate audio events based on given text.\n",
+        "text": "Thunder and a gentle rain",
+        "instruction": None,
+        "use_zero_spk_emb": False,
+        "max_decode_steps": 200,
+        "cfg": 4.5,  # cfg / sigma / temperature are copied into runtime_controls by _build_prompt
+        "sigma": 0.3,
+        "temperature": 2.5,
+    },
+    "emotion": {
+        "prompt": DEFAULT_SPEECH_PROMPT,
+        "text": "我竟然抢到了陈奕迅的演唱会门票！太棒了！终于可以现场听一听他的歌声了！",
+        "instruction": {"情感": "高兴"},
+        "requires_ref_audio": True,  # _build_prompt raises when no reference clip is given
+        "auto_extract_speaker_embeddings": True,  # speaker embeddings are extracted from the reference clip(s)
+        "max_decode_steps": 200,
+    },
+    "basic": {
+        "prompt": DEFAULT_SPEECH_PROMPT,
+        "text": "简单地说，这相当于惠普把消费领域市场拱手相让了。",
+        "instruction": {"语速": "快速", "基频": "中", "音量": "高"},
+        "requires_ref_audio": True,
+        "auto_extract_speaker_embeddings": True,
+        "max_decode_steps": 200,
+    },
+    "dialect": {
+        "prompt": DEFAULT_SPEECH_PROMPT,
+        "text": "我觉得社会企业同个人都有责任",
+        "instruction": {"方言": "广粤话"},
+        "requires_ref_audio": True,
+        "auto_extract_speaker_embeddings": True,
+        "max_decode_steps": 200,
+    },
+    "zero_shot": {
+        "prompt": DEFAULT_SPEECH_PROMPT,
+        "text": "我们的愿景是构建未来服务业的数字化基础设施，为世界带来更多微小而美好的改变。",
+        "instruction": None,
+        "requires_ref_audio": True,
+        "requires_ref_text": True,  # _build_prompt raises when no reference transcript is available
+        "auto_extract_speaker_embeddings": True,
+        "max_decode_steps": 200,
+    },
+    "podcast": {
+        "prompt": DEFAULT_SPEECH_PROMPT,
+        "text": DEFAULT_PODCAST_TEXT,
+        "instruction": None,
+        "prompt_text": DEFAULT_PODCAST_PROMPT_TEXT,  # used as the reference transcript unless --ref-text is given
+        "requires_ref_audio_count": 2,  # minimum number of reference audio paths
+        "auto_extract_speaker_embeddings": True,
+        "max_decode_steps": 200,
+    },
+    "speech_bgm": {
+        "prompt": DEFAULT_SPEECH_PROMPT,
+        "text": "此次业绩下滑原因，可归结为企业停止服务某些品牌，而带来的负面影响。",
+        "instruction": {
+            "BGM": {
+                "Genre": "当代古典音乐.",
+                "Mood": "温暖 / 友善.",
+                "Instrument": "电吉他",
+                "Theme": "节日.",
+                "SNR": 10.0,
+                "ENV": None,
+            }
+        },
+        "requires_ref_audio": True,
+        "auto_extract_speaker_embeddings": True,
+        "max_decode_steps": 200,
+    },
+    "speech_sound": {
+        "prompt": DEFAULT_SPEECH_PROMPT,
+        "text": "此次业绩下滑原因，可归结为企业停止服务某些品牌，而带来的负面影响。",
+        "instruction": {
+            "BGM": {
+                "ENV": "Birds chirping",
+                "SNR": 10.0,
+                "Genre": None,
+                "Mood": None,
+                "Instrument": None,
+                "Theme": None,
+            }
+        },
+        "requires_ref_audio": True,
+        "auto_extract_speaker_embeddings": True,
+        "max_decode_steps": 200,
+    },
+}
+
+
+def _load_reference_waveform(path: str) -> torch.Tensor:
+    """Read an audio file as a mono ``(1, num_samples)`` float32 tensor at ``SAMPLE_RATE``."""
+    data, source_rate = sf.read(path, dtype="float32")
+    source_rate = int(source_rate)
+    mono = torch.as_tensor(data, dtype=torch.float32)
+    mono = (mono.mean(dim=1) if mono.ndim == 2 else mono).reshape(1, -1)
+    if source_rate == SAMPLE_RATE:
+        return mono
+    return torchaudio.functional.resample(mono, source_rate, SAMPLE_RATE)
+
+
+def _load_speaker_embedding(path: str) -> torch.Tensor:
+    """Parse a UTF-8 JSON file of numbers (flat or nested list) into a float32 tensor."""
+    return torch.as_tensor(json.loads(Path(path).read_text(encoding="utf-8")), dtype=torch.float32)
+
+
+def _resolve_reference_inputs(args, case):
+    """Validate and load the reference clip(s) selected on the command line.
+
+    Returns ``None`` when no clip was given, a single waveform tensor for one clip,
+    or a list of waveform tensors (in argument order) for several clips.
+
+    Raises:
+        RuntimeError: if both ``--ref-audio`` and ``--ref-audio-paths`` are set, or
+            if the case needs more reference clips than were supplied.
+    """
+    # Delegate to the shared resolver so the mutual-exclusion rule lives in one place.
+    ref_audio_paths = _resolve_reference_audio_paths(args)
+    required_count = int(case.get("requires_ref_audio_count", 0))
+    if required_count > 0:
+        if len(ref_audio_paths) < required_count:
+            raise RuntimeError(
+                f"Case '{args.case}' requires at least {required_count} reference audio paths via --ref-audio-paths"
+            )
+    elif case.get("requires_ref_audio") and not ref_audio_paths:
+        raise RuntimeError(f"--ref-audio is required for case '{args.case}'")
+    if not ref_audio_paths:
+        return None
+    if len(ref_audio_paths) == 1:
+        return _load_reference_waveform(ref_audio_paths[0])
+    return [_load_reference_waveform(path) for path in ref_audio_paths]
+
+
+def _resolve_reference_audio_paths(args):
+    """Return the reference audio paths from ``--ref-audio`` / ``--ref-audio-paths`` as a list."""
+    single, many = args.ref_audio, args.ref_audio_paths
+    if single is not None and many is not None:
+        raise RuntimeError("Use either --ref-audio or --ref-audio-paths, not both")
+    if many is None:
+        return [] if single is None else [single]
+    return list(many)
+
+
+def _resolve_speaker_embedding(args, case, ref_audio_paths):
+    """Pick the speaker embedding: an explicit JSON file wins, else extract from the reference clips.
+
+    Returns ``None`` when no file is given and extraction is disabled or has no clips to use.
+    """
+    if args.speaker_embedding:
+        return _load_speaker_embedding(args.speaker_embedding)
+
+    wants_extraction = case.get("auto_extract_speaker_embeddings", False) or args.extract_speaker_embeddings
+    if not (wants_extraction and ref_audio_paths):
+        return None
+    extracted = MingSpeakerEmbeddingExtractor(args.model).extract_many(ref_audio_paths)
+    if not extracted:
+        raise RuntimeError("Speaker extraction produced no embeddings")
+    return extracted[0] if len(extracted) == 1 else torch.stack(extracted, dim=0)
+
+
+def _coerce_audio_tensor(audio, *, async_chunk: bool) -> torch.Tensor:
+    """Flatten an engine audio payload into a 1-D float32 tensor.
+
+    A non-list payload is flattened as-is. For a list, empty items are ignored; the
+    remaining items are concatenated in order when ``async_chunk`` is true, otherwise
+    only the last one is returned. A list with no non-empty item yields an empty tensor.
+    """
+    if not isinstance(audio, list):
+        return torch.as_tensor(audio, dtype=torch.float32).reshape(-1)
+    # Non-async payloads are scanned lazily from the end, stopping at the first non-empty item.
+    ordered = audio if async_chunk else reversed(audio)
+    pieces = (torch.as_tensor(entry, dtype=torch.float32).reshape(-1) for entry in ordered)
+    non_empty = (piece for piece in pieces if piece.numel() > 0)
+    if async_chunk:
+        kept = list(non_empty)
+        if not kept:
+            return torch.zeros((0,), dtype=torch.float32)
+        return torch.cat(kept, dim=0)
+    return next(non_empty, torch.zeros((0,), dtype=torch.float32))
+
+
+def _resolve_sr(sr) -> int:
+    """Convert a sample-rate value (scalar, object with ``.item()``, or list of those) to ``int``."""
+    latest = sr[-1] if isinstance(sr, list) else sr
+    # Values exposing ``.item()`` (e.g. tensors) are unwrapped first; for a list the last entry is used.
+    scalar = latest.item() if hasattr(latest, "item") else latest
+    return int(scalar)
+
+
+def _extract_sample_rate(multimodal_output: dict) -> int:
+    """Return ``multimodal_output["sr"]`` as an ``int``; raise ``RuntimeError`` if it is missing or ``None``."""
+    if (reported_rate := multimodal_output.get("sr")) is not None:
+        return _resolve_sr(reported_rate)
+    raise RuntimeError("Expected multimodal_output['sr']")
+
+
+def _write_wav(path: str, audio: torch.Tensor, sample_rate: int) -> None:
+    """Write ``audio`` (floats, clipped to [-1, 1]) to ``path`` as a mono 16-bit PCM WAV file."""
+    pcm_bytes = torch.round(torch.clamp(audio, -1.0, 1.0) * 32767.0).to(torch.int16).cpu().numpy().tobytes()
+    with wave.open(path, "wb") as writer:
+        writer.setnchannels(1)  # mono
+        writer.setsampwidth(2)  # two bytes per sample (int16)
+        writer.setframerate(int(sample_rate))
+        writer.writeframes(pcm_bytes)
+
+
+def _request_index(request_id: str | None, fallback: int) -> int:
+    """Derive an output index from ``request_id`` (``"3"`` or ``"3_<suffix>"``), else return ``fallback``."""
+    try:
+        return int(request_id)
+    except (TypeError, ValueError):
+        head = request_id.split("_", 1)[0] if isinstance(request_id, str) else ""
+        if head.isdecimal():  # not isdigit(): that also accepts e.g. "²", which int() rejects
+            return int(head)
+    return fallback
+
+
+def _audio_summary(audio: torch.Tensor, sample_rate: int) -> dict:
+    """Return sample rate, sample count, duration in seconds and peak absolute amplitude of ``audio``."""
+    flat = audio.detach().cpu().reshape(-1).to(torch.float32)
+    count = int(flat.numel())
+    peak = float(flat.abs().max().item()) if count else 0.0  # an empty waveform reports a peak of 0.0
+    summary = {"sample_rate": int(sample_rate), "num_samples": count}
+    summary.update(duration_seconds=float(count) / float(sample_rate), max_abs_amplitude=peak)
+    return summary
+
+
+def _resolve_output_name(output_name: str | None, case: str, index: int, total: int) -> str:
+    """Return the WAV filename for output ``index``; a zero-padded index is inserted when ``total`` != 1."""
+    chosen = output_name or f"ming_{case}.wav"
+    stem, suffix = Path(chosen).stem, Path(chosen).suffix
+    return chosen if total == 1 else f"{stem}_{index:05d}{suffix or '.wav'}"
+
+
+def _resolve_stats_log_file(args) -> str | None:
+    """Return the stats log path, or ``None`` when ``args.log_stats`` is false."""
+    if not args.log_stats:
+        return None
+    # An explicit --stats-log-file wins; otherwise derive "<stem>_pipeline.log" inside the output dir.
+    stem = Path(args.output_name or f"ming_{args.case}.wav").stem
+    return args.stats_log_file or str(Path(args.output_dir, f"{stem}_pipeline.log"))
+
+
+def _resolve_metadata_json(args) -> str | None:
+    """Return the manifest path: ``--metadata-json`` if set, a derived default when stats are on, else ``None``."""
+    explicit = args.metadata_json
+    if explicit or not args.log_stats:
+        return explicit or None
+    stem = Path(args.output_name or f"ming_{args.case}.wav").stem
+    return str(Path(args.output_dir, f"{stem}_manifest.json"))
+
+
+def _build_manifest(args, prompt_payload, stats_log_file: str | None, outputs: list[dict]) -> dict:
+    """Assemble the run manifest that ``main`` serializes to JSON.
+
+    Combines CLI settings, prompt metadata, the per-output summaries and a Unix timestamp.
+    """
+    extra = dict(prompt_payload.get("additional_information", {})) if isinstance(prompt_payload, dict) else {}
+    # Only values exposing ``.shape`` are summarized; a missing or shapeless embedding is recorded as None.
+    embedding = extra.get(KEY_SPEAKER_EMBEDDING)
+    embedding_shape = list(embedding.shape) if hasattr(embedding, "shape") else None
+    return {
+        "model": args.model,
+        "case": args.case,
+        "streaming": bool(args.streaming),
+        "stage_configs_path": args.stage_configs_path,
+        "enforce_eager": bool(args.enforce_eager),
+        "num_prompts": int(args.num_prompts),
+        "log_stats": bool(args.log_stats),
+        "stats_log_file": stats_log_file,
+        "prompt_text": extra.get("prompt_text"),
+        "instruction": extra.get("instruction"),
+        "speaker_embedding_shape": embedding_shape,
+        "outputs": outputs,
+        "generated_at_unix": time.time(),
+    }
+
+
+def _build_engine_kwargs(args, stats_log_file: str | None) -> dict:
+    """Collect the constructor keyword arguments shared by ``Omni`` and ``AsyncOmni``."""
+    engine_kwargs = dict(
+        model=args.model,
+        stage_configs_path=args.stage_configs_path,
+        enforce_eager=args.enforce_eager,
+        trust_remote_code=args.trust_remote_code,
+        log_stats=args.log_stats,
+        stage_init_timeout=args.stage_init_timeout,
+        init_timeout=args.init_timeout,
+        batch_timeout=args.batch_timeout,
+        shm_threshold_bytes=args.shm_threshold_bytes,
+        worker_backend=args.worker_backend,
+    )
+    # Optional settings are forwarded only when they are not None.
+    optional = {"log_file": stats_log_file, "ray_address": args.ray_address}
+    engine_kwargs.update({key: value for key, value in optional.items() if value is not None})
+    return engine_kwargs
+
+
+def _extract_audio_output(outputs, *, async_chunk: bool):
+    """Return ``(waveform, sample_rate)`` from the first output whose ``final_output_type`` is ``"audio"``."""
+    for candidate in outputs:
+        if candidate.final_output_type == "audio":
+            payload = candidate.multimodal_output or {}
+            break
+    else:
+        raise RuntimeError("Expected one final output with final_output_type='audio'")
+    audio, sr = payload.get("audio"), payload.get("sr")
+    if audio is None or sr is None:
+        raise RuntimeError("Expected multimodal_output['audio'] and multimodal_output['sr']")
+    waveform = _coerce_audio_tensor(audio, async_chunk=async_chunk)
+    if waveform.numel() == 0:
+        raise RuntimeError("Generated audio waveform is empty")
+    return waveform, _resolve_sr(sr)
+
+
+def _build_instruction(args, case):
+    """Pick the instruction: parsed ``--instruction-json``, else ``--instructions``, else the case default."""
+    raw_json, free_form = args.instruction_json, args.instructions
+    if raw_json is not None:
+        return json.loads(raw_json)
+    return case.get("instruction") if free_form is None else free_form
+
+
+def _build_prompt(tokenizer, args):
+    """Build the Ming prompt payload for ``args.case``, applying any CLI overrides.
+
+    The reference waveform is loaded only when a reference transcript is available
+    (``--ref-text`` or the case's ``prompt_text``).
+
+    Raises:
+        RuntimeError: if the case's required reference audio or reference text is missing.
+    """
+    case = CASE_DEFAULTS[args.case]
+    instruction = _build_instruction(args, case)
+    prompt_text = case.get("prompt_text") if args.ref_text is None else args.ref_text
+    ref_audio_paths = _resolve_reference_audio_paths(args)
+    prompt_waveform = None if prompt_text is None else _resolve_reference_inputs(args, case)
+
+    required_count = int(case.get("requires_ref_audio_count", 0))
+    if required_count > 0:
+        if len(ref_audio_paths) < required_count:
+            raise RuntimeError(
+                f"Case '{args.case}' requires at least {required_count} reference audio paths via --ref-audio-paths"
+            )
+    elif case.get("requires_ref_audio") and not ref_audio_paths:
+        raise RuntimeError(f"--ref-audio is required for case '{args.case}'")
+    if case.get("requires_ref_text") and not prompt_text:
+        raise RuntimeError(f"--ref-text is required for case '{args.case}'")
+
+    speaker_embedding = _resolve_speaker_embedding(args, case, ref_audio_paths)
+    # The zero-speaker flag is only honoured when neither a waveform nor an embedding was resolved.
+    has_reference = prompt_waveform is not None or speaker_embedding is not None
+
+    runtime_controls = {KEY_MAX_DECODE_STEPS: args.max_decode_steps or case["max_decode_steps"]}
+    for case_key, control_key in (("cfg", KEY_CFG), ("sigma", KEY_SIGMA), ("temperature", KEY_TEMPERATURE)):
+        if case_key in case:
+            runtime_controls[control_key] = case[case_key]
+    return build_ming_dense_prompt(
+        tokenizer,
+        prompt=args.prompt or case["prompt"],
+        text=args.text or case["text"],
+        runtime_controls=runtime_controls,
+        instruction=instruction,
+        prompt_text=prompt_text,
+        prompt_waveform=prompt_waveform,
+        speaker_embedding=speaker_embedding,
+        use_zero_spk_emb=bool(case.get("use_zero_spk_emb", False)) and not has_reference,
+    )
+
+
+async def _run_streaming(args, prompt_payload, sampling_params_list, output_dir, stats_log_file) -> list[dict]:
+    engine = AsyncOmni(**_build_engine_kwargs(args, stats_log_file))
+    try:
+        all_audio_chunks = []
+        accumulated_samples = 0  # samples already collected from earlier chunks
+        chunk_idx = 0  # count of audio-bearing outputs seen so far; also indexes list payloads
+        start_time = time.time()
+        chunk_times = []  # arrival time of every non-empty chunk
+        ttfp_seconds = None  # delay from start_time to the first non-empty chunk
+        final_stage_output = None
+        async for stage_output in engine.generate(
+            prompt=prompt_payload,
+            request_id=str(uuid.uuid4()),
+            sampling_params_list=sampling_params_list,
+        ):
+            final_stage_output = stage_output  # the most recent output feeds the summary below
+            multimodal_output = stage_output.multimodal_output or {}
+            audio = multimodal_output.get("audio")
+            if audio is None:
+                continue  # outputs without audio are skipped
+
+            finished = stage_output.finished
+            if isinstance(audio, torch.Tensor):
+                if finished:  # the finished tensor is sliced past the samples already collected
+                    audio_chunk = audio[accumulated_samples:].float().detach().cpu()
+                else:  # intermediate tensors are taken whole
+                    audio_chunk = audio.float().detach().cpu()
+            elif isinstance(audio, list):  # list payload: take the entry at the current chunk index
+                audio_chunk = torch.as_tensor(audio[chunk_idx], dtype=torch.float32).reshape(-1).cpu()
+            else:
+                audio_chunk = torch.as_tensor(audio, dtype=torch.float32).reshape(-1).cpu()
+
+            accumulated_samples += int(audio_chunk.numel())
+            chunk_idx += 1
+            if audio_chunk.numel() > 0:
+                now = time.time()
+                if ttfp_seconds is None:
+                    ttfp_seconds = now - start_time
+                chunk_times.append(now)
+                all_audio_chunks.append(audio_chunk)
+
+        if not all_audio_chunks:
+            raise RuntimeError("Streaming Ming example produced no audio chunks")
+
+        waveform = torch.cat(all_audio_chunks, dim=0)
+        output_name = _resolve_output_name(args.output_name, args.case, 0, 1)
+        output_path = str(Path(output_dir) / output_name)
+        _write_wav(output_path, waveform, SAMPLE_RATE)  # uses the SAMPLE_RATE constant, not multimodal_output["sr"]
+        summary = {
+            "request_id": getattr(final_stage_output, "request_id", None),
+            "stage_id": getattr(final_stage_output, "stage_id", None),
+            "output_path": output_path,
+            "stage_durations": getattr(final_stage_output, "stage_durations", {}),
+            "peak_memory_mb": getattr(final_stage_output, "peak_memory_mb", 0.0),
+            "ttfp_seconds": ttfp_seconds,
+            "mean_inter_chunk_seconds": (  # average gap between consecutive non-empty chunks
+                sum(t1 - t0 for t0, t1 in zip(chunk_times, chunk_times[1:])) / (len(chunk_times) - 1)
+                if len(chunk_times) > 1
+                else None
+            ),
+        }
+        summary.update(_audio_summary(waveform, SAMPLE_RATE))
+        print(f"Saved streaming output to {output_path}")
+        print(json.dumps(summary, ensure_ascii=False, indent=2))
+        return [summary]
+    finally:
+        engine.shutdown()  # runs whether generation succeeded or raised
+
+
+def _run_non_streaming(args, prompt_payload, sampling_params_list, output_dir, stats_log_file) -> list[dict]:
+    engine = Omni(**_build_engine_kwargs(args, stats_log_file))
+    try:
+        outputs = engine.generate(
+            prompts=[prompt_payload for _ in range(args.num_prompts)],  # same prompt repeated --num-prompts times
+            sampling_params_list=sampling_params_list,
+            py_generator=False,
+        )
+        summaries = []
+        for fallback_index, output in enumerate(outputs):
+            if output.final_output_type != "audio":
+                continue  # only audio outputs are saved
+            multimodal_output = output.multimodal_output or {}
+            waveform = _coerce_audio_tensor(multimodal_output.get("audio"), async_chunk=False)
+            sample_rate = _extract_sample_rate(multimodal_output)
+            request_index = _request_index(output.request_id, fallback_index)  # else the enumeration index
+            output_name = _resolve_output_name(args.output_name, args.case, request_index, args.num_prompts)
+            output_path = str(Path(output_dir) / output_name)
+            _write_wav(output_path, waveform, sample_rate)
+            summary = {
+                "request_id": output.request_id,
+                "stage_id": output.stage_id,
+                "output_path": output_path,
+                "stage_durations": output.stage_durations,
+                "peak_memory_mb": output.peak_memory_mb,
+            }
+            summary.update(_audio_summary(waveform, sample_rate))
+            summaries.append(summary)
+            print(f"Saved output to {output_path}")
+            print(json.dumps(summary, ensure_ascii=False, indent=2))
+        if not summaries:
+            raise RuntimeError("Non-streaming Ming example produced no audio outputs")
+        return summaries
+    finally:
+        engine.close()  # runs whether generation succeeded or raised
+
+
+def main() -> None:  # CLI entry point: parse args, build the prompt, generate audio, write the manifest
+    parser = FlexibleArgumentParser(description="Offline Ming-omni-tts example")
+    parser.add_argument("--model", default=DEFAULT_MODEL, help="Model name or local path")
+    parser.add_argument(
+        "--stage-configs-path",
+        default=None,
+        help="Stage config path. Defaults to ming_tts.yaml or ming_tts_async_chunk.yaml when --streaming is set.",
+    )
+    parser.add_argument("--case", choices=sorted(CASE_DEFAULTS), default="style", help="Built-in demo case")
+    parser.add_argument("--text", default=None, help="Override case text")
+    parser.add_argument("--prompt", default=None, help="Override the system prompt prefix")
+    parser.add_argument("--instructions", default=None, help="Free-form Ming instruction string")
+    parser.add_argument(
+        "--instruction-json",
+        default=None,
+        help='Structured Ming instruction JSON, for example \'{"方言":"广粤话"}\'',
+    )
+    parser.add_argument("--ref-audio", default=None, help="Reference audio path for cloning")
+    parser.add_argument(
+        "--ref-audio-paths",
+        nargs="+",
+        default=None,
+        help="Multiple reference audio paths, used by multi-speaker cases like podcast",
+    )
+    parser.add_argument("--ref-text", default=None, help="Reference transcript for cloning")
+    parser.add_argument("--speaker-embedding", default=None, help="Path to a JSON speaker embedding file")
+    parser.add_argument(
+        "--extract-speaker-embeddings",
+        action="store_true",
+        help="Extract 192-d Ming speaker embeddings from --ref-audio or --ref-audio-paths using campplus.onnx",
+    )
+    parser.add_argument("--max-decode-steps", type=int, default=None, help="Override ming_max_decode_steps")
+    parser.add_argument("--output-dir", default=DEFAULT_OUTPUT_DIR, help="Directory for output wav files")
+    parser.add_argument("--output-name", default=None, help="Output wav filename")
+    parser.add_argument("--num-prompts", type=int, default=1, help="Repeat the same prompt N times")
+    parser.add_argument("--streaming", action="store_true", help="Use AsyncOmni with async_chunk streaming")
+    parser.add_argument("--trust-remote-code", action="store_true", help="Pass trust_remote_code to Omni")
+    parser.add_argument("--enforce-eager", action="store_true", help="Pass enforce_eager to Omni")
+    parser.add_argument(
+        "--log-stats", "--enable-stats", dest="log_stats", action="store_true", help="Enable Omni stats logging"
+    )
+    parser.add_argument("--stats-log-file", default=None, help="Optional path for the Omni stats log file")
+    parser.add_argument("--metadata-json", default=None, help="Optional path for a run manifest JSON file")
+    parser.add_argument(
+        "--stage-init-timeout", type=int, default=300, help="Per-stage initialization timeout in seconds"
+    )
+    parser.add_argument("--init-timeout", type=int, default=600, help="Total initialization timeout in seconds")
+    parser.add_argument("--batch-timeout", type=int, default=5, help="Batch timeout in seconds")
+    parser.add_argument("--shm-threshold-bytes", type=int, default=65536, help="Shared memory threshold in bytes")
+    parser.add_argument(
+        "--worker-backend",
+        type=str,
+        default="multi_process",
+        choices=["multi_process", "ray"],
+        help="Worker backend",
+    )
+    parser.add_argument("--ray-address", default=None, help="Ray cluster address when --worker-backend ray is used")
+    args = parser.parse_args()
+
+    if args.instructions is not None and args.instruction_json is not None:
+        raise RuntimeError("Use either --instructions or --instruction-json, not both")
+    if args.num_prompts < 1:
+        raise RuntimeError("--num-prompts must be at least 1")
+    if args.streaming and args.num_prompts != 1:
+        raise RuntimeError("--streaming currently supports exactly one prompt")
+
+    if args.stage_configs_path is None:  # pick the default stage config matching the chosen mode
+        args.stage_configs_path = DEFAULT_STREAM_STAGE_CONFIG if args.streaming else DEFAULT_STAGE_CONFIG
+
+    tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=False)  # ignores --trust-remote-code
+    prompt_payload = _build_prompt(tokenizer, args)
+
+    max_decode_steps = args.max_decode_steps or CASE_DEFAULTS[args.case]["max_decode_steps"]
+    sampling_params_list = [  # NOTE(review): presumably one entry per pipeline stage, in order; confirm
+        SamplingParams(
+            temperature=0.0,
+            max_tokens=max_decode_steps + 1,
+            stop_token_ids=[int(TEXT_EOS_TOKEN_ID)],
+        ),
+        SamplingParams(temperature=0.0, max_tokens=1),
+    ]
+
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    stats_log_file = _resolve_stats_log_file(args)
+
+    if args.streaming:
+        summaries = asyncio.run(_run_streaming(args, prompt_payload, sampling_params_list, output_dir, stats_log_file))
+    else:
+        summaries = _run_non_streaming(args, prompt_payload, sampling_params_list, output_dir, stats_log_file)
+
+    metadata_json = _resolve_metadata_json(args)
+    manifest = _build_manifest(args, prompt_payload, stats_log_file, summaries)
+    if metadata_json is not None:  # the manifest is only written when a path was resolved
+        Path(metadata_json).write_text(json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8")
+        print(f"Saved run manifest to {metadata_json}")
+
+
+if __name__ == "__main__":
+    os.environ.setdefault("VLLM_WORKER_MULTIPROC_METHOD", "spawn")  # keeps any value already set in the environment
+    main()
diff --git a/examples/online_serving/ming_tts/README.md b/examples/online_serving/ming_tts/README.md
new file mode 100644
index 00000000000..76f8521a4fe
--- /dev/null
+++ b/examples/online_serving/ming_tts/README.md
@@ -0,0 +1,312 @@
+# Ming-omni-tts
+
+## Installation
+
+Please refer to the project [README.md](../../../README.md) for installation instructions.
+
+## Model
+
+| Model | Description |
+|-------|-------------|
+| `inclusionAI/Ming-omni-tts-0.5B` | Dense 0.5B Ming two-stage TTS model for speech generation with dialect, style, IP voice, and cloning controls |
+
+## Launch the Server
+
+```bash
+vllm-omni serve inclusionAI/Ming-omni-tts-0.5B \
+    --stage-configs-path vllm_omni/model_executor/stage_configs/ming_tts_async_chunk.yaml \
+    --omni \
+    --port 8091 \
+    --enforce-eager
+```
+
+Or use the convenience script:
+
+```bash
+cd examples/online_serving/ming_tts
+./run_server.sh
+```
+
+The recommended online-serving path is eager async-chunk mode through
+`/v1/audio/speech`. `run_server.sh` defaults to:
+
+- model: `inclusionAI/Ming-omni-tts-0.5B`
+- stage config: `vllm_omni/model_executor/stage_configs/ming_tts_async_chunk.yaml`
+- auth: local testing only, no real OpenAI key required
+
+## Send Requests
+
+The canonical Ming online client is:
+
+```bash
+cd examples/online_serving/ming_tts
+python openai_speech_client.py --text "你好，世界"
+```
+
+This talks to the local vLLM-Omni server at `http://localhost:8091/v1` and
+uses `api_key=EMPTY`. It does not call OpenAI's cloud API.
+
+### Basic TTS
+
+```bash
+python openai_speech_client.py \
+    --text "你好，这是 Ming 在线语音合成测试。" \
+    --max-new-tokens 200
+```
+
+### Style-conditioned speech without a reference clip
+
+```bash
+python openai_speech_client.py \
+    --text "我会一直在这里陪着你。" \
+    --instructions "轻柔的ASMR耳语，慢速，贴近麦克风" \
+    --max-new-tokens 200
+```
+
+### Structured Ming control via JSON
+
+```bash
+python openai_speech_client.py \
+    --text "我觉得社会企业同个人都有责任" \
+    --instruction-json '{"方言":"广粤话"}' \
+    --max-new-tokens 200
+```
+
+### IP voice generation
+
+```bash
+python openai_speech_client.py \
+    --text "这款产品的名字，叫变态坑爹牛肉丸。" \
+    --voice 灵小甄 \
+    --max-new-tokens 200
+```
+
+### Reference-audio cloning
+
+Ming has two reference-audio paths:
+
+- prompt-waveform conditioning, where `ref_audio` steers the voice/style and
+  `ref_text` is not required
+- transcript cloning, where `ref_audio` and `ref_text` are paired
+
+```bash
+python openai_speech_client.py \
+    --task-type Base \
+    --text "我们的愿景是构建未来服务业的数字化基础设施。" \
+    --ref-audio /path/to/reference.wav \
+    --max-new-tokens 200
+```
+
+Pass `--ref-text` when the prompt case needs a transcript, such as zero-shot
+voice cloning:
+
+```bash
+python openai_speech_client.py \
+    --task-type Base \
+    --text "我们的愿景是构建未来服务业的数字化基础设施。" \
+    --ref-audio /path/to/reference.wav \
+    --ref-text "在此奉劝大家别乱打美白针。" \
+    --max-new-tokens 200
+```
+
+### Podcast-style multi-speaker prompt
+
+```bash
+python openai_speech_client.py \
+    --text "speaker_1:你可以说一下。 speaker_2:我也不知道。" \
+    --ref-audio /path/to/speaker_1.wav \
+    --ref-audio /path/to/speaker_2.wav \
+    --ref-text "在此奉劝大家别乱打美白针。"
+```
+
+### x-vector style cloning with a precomputed embedding
+
+```bash
+python openai_speech_client.py \
+    --task-type Base \
+    --text "你好，这是一段使用说话人向量的合成语音。" \
+    --speaker-embedding /path/to/ming_speaker_embedding.json \
+    --max-new-tokens 200
+```
+
+### Curl examples
+
+Use the helper script for the common request types:
+
+```bash
+./run_curl.sh basic
+./run_curl.sh style
+./run_curl.sh ip
+REF_AUDIO=/path/to/emotion_prompt.wav ./run_curl.sh emotion
+REF_AUDIO=/path/to/yue_prompt.wav ./run_curl.sh dialect
+REF_AUDIO=/path/to/reference.wav REF_TEXT="在此奉劝大家别乱打美白针。" ./run_curl.sh zero_shot
+REF_AUDIO=/path/to/speaker_1.wav REF_AUDIO_2=/path/to/speaker_2.wav REF_TEXT="speaker_1:你好。 speaker_2:你好。" ./run_curl.sh podcast
+REF_AUDIO=/path/to/00000309-00000300.wav ./run_curl.sh speech_bgm
+REF_AUDIO=/path/to/00000309-00000300.wav ./run_curl.sh speech_sound
+REF_AUDIO=/path/to/reference.wav REF_TEXT="在此奉劝大家别乱打美白针。" ./run_curl.sh clone_ref_audio
+SPEAKER_EMBEDDING=/path/to/ming_speaker_embedding.json ./run_curl.sh clone_embedding
+./run_curl.sh stream
+```
+
+Or send a direct request:
+
+```bash
+curl -X POST http://localhost:8091/v1/audio/speech \
+    -H "Content-Type: application/json" \
+    -H "Authorization: Bearer EMPTY" \
+    -d '{
+        "model": "inclusionAI/Ming-omni-tts-0.5B",
+        "input": "你好，这是 Ming 在线语音合成测试。",
+        "response_format": "wav"
+    }' \
+    --output ming_output.wav
+```
+
+## Request Types
+
+Ming online serving supports these main request families through
+`/v1/audio/speech`:
+
+| Case | Online support | Required fields |
+|------|----------------|-----------------|
+| default TTS | Supported | `input`, `max_new_tokens=200` |
+| `style` | Supported | `input`, `instructions`, `max_new_tokens=200` |
+| `ip` | Supported | `input`, `voice`, `max_new_tokens=200` |
+| `basic` helper | Supported | `input`, `max_new_tokens=200` |
+| upstream `basic` case | Supported | `input`, `ref_audio`, structured speed / pitch / volume `instructions`, `max_new_tokens=200` |
+| `emotion` | Supported | `input`, `ref_audio`, structured emotion `instructions`, `max_new_tokens=200` |
+| `dialect` | Supported | `input`, `language` or structured `instructions`, `ref_audio`, `max_new_tokens=200` |
+| `zero_shot` | Supported | `input`, `ref_audio`, `ref_text`, `max_new_tokens=200` |
+| `podcast` | Supported | `input`, repeated/list `ref_audio`, `ref_text`, `max_new_tokens=200` |
+| `speech_bgm` | Supported | `input`, `ref_audio`, structured `instructions` with `{"BGM": ...}`, `max_new_tokens=200` |
+| `speech_sound` | Supported | `input`, `ref_audio`, structured `instructions` with `{"BGM": {"ENV": ...}}`, `max_new_tokens=200` |
+| `bgm` | Not supported online | Requires a future `prompt_mode=music` API extension |
+
+This matrix intentionally mirrors the local online validation flow. The
+music-only `bgm` case remains offline-only because `/v1/audio/speech` always
+uses Ming's speech prompt path today.
+
+## Output
+
+- Non-streaming requests return full audio bytes, usually written to `.wav`
+- WAV outputs use a 44.1 kHz sample rate
+- Streaming requests return progressive PCM bytes; wrap or convert them to WAV
+  before browser playback
+- The default Python client outputs:
+  - `ming_output.wav` for non-streaming
+  - `ming_output.pcm` for streaming
+
+## Validated Outputs
+
+Validation on an L4 GPU passed the online async_chunk `/v1/audio/speech` flow
+for every speech-mode case in the local validation script:
+
+| Case | Output | Size bytes | Sample rate | Frames |
+|------|--------|-----------:|------------:|-------:|
+| `style` | WAV | 790316 | 44100 | 395136 |
+| `ip` | WAV | 366956 | 44100 | 183456 |
+| `basic` | WAV | 536300 | 44100 | 268128 |
+| `emotion` | WAV | 649196 | 44100 | 324576 |
+| `dialect` | WAV | 395180 | 44100 | 197568 |
+| `zero_shot` | WAV | 931436 | 44100 | 465696 |
+| `podcast` | WAV | 846764 | 44100 | 423360 |
+| `speech_bgm` | WAV | 677420 | 44100 | 338688 |
+| `speech_sound` | WAV | 649196 | 44100 | 324576 |
+| `streaming` | PCM | 338688 | N/A | N/A |
+
+`bgm` is intentionally not included in the online pass list. It is a
+music-prompt workflow, while `/v1/audio/speech` currently routes Ming through
+the speech prompt path.
+
+## Performance
+
+Benchmark via `/v1/audio/speech`, `inclusionAI/Ming-omni-tts-0.5B`,
+10 prompts, concurrency 1, eager mode:
+
+| Config | Mean TTFP | Mean E2E | Mean RTF |
+|--------|----------:|---------:|---------:|
+| Sequential eager | 3354.83ms | 3357.01ms | 0.561 |
+| Async chunk eager | 3450.28ms | 3452.35ms | 0.577 |
+
+## Audio Inputs
+
+- `ref_audio` accepts:
+  - a local file path
+  - a remote `http://` or `https://` URL
+  - a `data:` URL
+  - repeated values for podcast-style multi-speaker prompts
+- `openai_speech_client.py` converts local reference audio files into a base64
+  `data:` URL before sending them to the server
+- `--speaker-embedding` takes a JSON file holding a flat list of exactly 192 numbers; the client sends it as the `speaker_embedding` field
+- Ming prompt-waveform cases can use `ref_audio` without `ref_text`
+- Zero-shot and podcast-style transcript cloning should include `ref_text`
+
+## API Field Mapping
+
+The OpenAI-compatible `/v1/audio/speech` endpoint stays generic. Ming-specific controls are mapped like this:
+
+- `input` -> target text
+- `instructions` -> Ming instruction string, or a JSON string that becomes the structured Ming control object
+- `voice` -> Ming `IP` field when using built-in character voices
+- `language` -> Ming `方言` field
+- `ref_audio` -> Ming `prompt_waveform`
+- `ref_text` -> Ming `prompt_text`
+- `speaker_embedding` -> 192-d Ming speaker embedding
+- `max_new_tokens` -> Ming `max_decode_steps`
+
+## Voice Listing
+
+- `/v1/audio/voices` reflects uploaded voices for Ming.
+- Built-in Ming IP labels like `灵小甄` are passed through as `voice` values, but they are not enumerated by the API.
+
+## Streaming
+
+Use `stream=true` to get progressive PCM output:
+
+```bash
+python openai_speech_client.py \
+    --text "你好，这是流式输出测试。" \
+    --instructions "平静，普通话" \
+    --stream \
+    --output ming_output.pcm
+```
+
+## Not Supported Online Yet
+
+`bgm` music-prompt generation is not exposed through `/v1/audio/speech` today.
+It needs a future `prompt_mode=music` API extension so the server can select
+Ming's music system prompt instead of the speech system prompt.
+
+## Troubleshooting
+
+### No real OpenAI key
+
+The example targets a local vLLM-Omni server. `api_key=EMPTY` is expected and
+is sufficient for local testing.
+
+### `--ref-audio` fails
+
+- Confirm the local file exists
+- If using zero-shot or podcast transcript cloning, also provide `--ref-text`
+- If passing a URL, make sure the server can fetch it
+
+### `--speaker-embedding` fails
+
+- Make sure the JSON file contains exactly 192 numeric values
+- Do not wrap the list in another object
+
+### Connection refused
+
+- Check that the server is running on `localhost:8091`
+- Confirm the stage config path is correct
+
+### No audio or wrong output file
+
+- Use non-streaming for `.wav`
+- Use `--stream` for `.pcm`
+
+### `bgm` is missing online
+
+Use the offline example for music-only `bgm`. Online support needs an explicit
+Ming prompt-mode API extension so the server can select the music prompt
+instead of the speech prompt.
diff --git a/examples/online_serving/ming_tts/openai_speech_client.py b/examples/online_serving/ming_tts/openai_speech_client.py
new file mode 100644
index 00000000000..af1a70685b6
--- /dev/null
+++ b/examples/online_serving/ming_tts/openai_speech_client.py
@@ -0,0 +1,223 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""OpenAI-compatible client for Ming-omni-tts via /v1/audio/speech.
+
+Examples:
+    python openai_speech_client.py --text "你好，世界"
+    python openai_speech_client.py --text "我会一直在这里陪着你。" \
+        --instructions "轻柔的ASMR耳语，慢速，贴近麦克风" --max-new-tokens 200
+    python openai_speech_client.py --text "你好，这是零样本克隆测试。" \
+        --ref-audio prompt.wav --ref-text "参考音频的转录文本" --max-new-tokens 200
+    python openai_speech_client.py --text "speaker_1:你好。 speaker_2:你好。" \
+        --ref-audio speaker_1.wav --ref-audio speaker_2.wav --ref-text "speaker_1:你好。 speaker_2:你好。"
+    python openai_speech_client.py --text "你好，这是流式输出测试。" \
+        --stream --output ming_output.pcm
+"""
+
+import argparse
+import base64
+import json
+import os
+
+import httpx
+
+DEFAULT_API_BASE = "http://localhost:8091"
+DEFAULT_API_KEY = "EMPTY"
+DEFAULT_MODEL = "inclusionAI/Ming-omni-tts-0.5B"
+EXPECTED_SPEAKER_EMBEDDING_DIM = 192
+
+
+def encode_audio_to_base64(audio_path: str) -> str:
+    """Return the bytes of ``audio_path`` as a ``data:<mime>;base64,...`` URL.
+
+    The MIME type follows the file extension (wav/mp3/flac/ogg/aac) and falls
+    back to ``audio/wav``. Raises ``FileNotFoundError`` when the path is
+    missing or is not a regular file.
+    """
+    # isfile() rather than exists(): a directory must be rejected here, not
+    # surface later as an IsADirectoryError that main() does not catch.
+    if not os.path.isfile(audio_path):
+        raise FileNotFoundError(f"Audio file not found: {audio_path}")
+    mime_map = {"wav": "audio/wav", "mp3": "audio/mpeg", "flac": "audio/flac", "ogg": "audio/ogg", "aac": "audio/aac"}
+    mime_type = mime_map.get(audio_path.lower().rsplit(".", 1)[-1], "audio/wav")
+    with open(audio_path, "rb") as f:
+        audio_b64 = base64.b64encode(f.read()).decode("utf-8")
+    return f"data:{mime_type};base64,{audio_b64}"
+
+
+def load_speaker_embedding(path: str) -> list[float]:
+    """Read a Ming speaker embedding JSON file and return its 192 values as floats."""
+    with open(path, encoding="utf-8") as handle:
+        raw = json.load(handle)
+
+    if not isinstance(raw, list):
+        raise ValueError("speaker_embedding file must contain a JSON list")
+    dim = len(raw)
+    if dim != EXPECTED_SPEAKER_EMBEDDING_DIM:
+        raise ValueError(f"Ming dense speaker_embedding must have {EXPECTED_SPEAKER_EMBEDDING_DIM} values, got {dim}")
+
+    embedding = []
+    for position, item in enumerate(raw):
+        try:
+            number = float(item)
+        except (TypeError, ValueError) as exc:
+            raise ValueError(f"speaker_embedding[{position}] must be a number, got {item!r}") from exc
+        embedding.append(number)
+    return embedding
+
+
+def build_instruction_payload(args) -> str | None:
+    """Resolve --instructions / --instruction-json into the API `instructions` value."""
+    structured, free_form = args.instruction_json, args.instructions
+    if free_form and structured:
+        raise ValueError("Use either --instructions or --instruction-json, not both")
+    if not structured:
+        return free_form
+    return json.dumps(json.loads(structured), ensure_ascii=False)  # validated, non-ASCII kept as-is
+
+
+def validate_args(args) -> None:
+    """Reject unsupported flag combinations up front, before any request is sent."""
+    if not args.ref_audio and args.ref_text:
+        raise ValueError("--ref-audio is required when --ref-text is provided")
+    if len(args.ref_audio or ()) > 1 and args.speaker_embedding:
+        raise ValueError("--speaker-embedding cannot be combined with multiple --ref-audio values")
+
+
+def run_tts(args) -> None:
+    """Generate speech via the OpenAI-compatible /v1/audio/speech API.
+
+    Builds the JSON payload from the parsed CLI ``args``, POSTs it, and writes
+    the audio to ``args.output`` (default ``ming_output.wav``; with ``--stream``
+    the format is forced to PCM and the default is ``ming_output.pcm``).
+
+    Raises:
+        ValueError / FileNotFoundError: invalid flags or unreadable inputs.
+        SystemExit: status 1 when the server reports an error, so shell callers
+            see a failing exit code, as they already do for local errors.
+    """
+    validate_args(args)
+
+    payload = {"model": args.model, "input": args.text, "response_format": args.response_format}
+    if args.voice:
+        payload["voice"] = args.voice
+    if args.task_type:
+        payload["task_type"] = args.task_type
+    if args.dialect:
+        payload["language"] = args.dialect
+
+    instructions = build_instruction_payload(args)
+    if instructions:
+        payload["instructions"] = instructions
+
+    if args.ref_audio:
+        # URLs and data: URLs pass through; local paths are inlined as data: URLs.
+        passthrough = ("http://", "https://", "data:")
+        ref_audio = [a if a.startswith(passthrough) else encode_audio_to_base64(a) for a in args.ref_audio]
+        payload["ref_audio"] = ref_audio[0] if len(ref_audio) == 1 else ref_audio
+    if args.ref_text:
+        payload["ref_text"] = args.ref_text
+    if args.speaker_embedding:
+        payload["speaker_embedding"] = load_speaker_embedding(args.speaker_embedding)
+    if args.max_new_tokens is not None:  # forward explicit values (even 0) instead of dropping them
+        payload["max_new_tokens"] = args.max_new_tokens
+    if args.stream:
+        payload["stream"] = True
+        payload["response_format"] = "pcm"
+
+    api_url = f"{args.api_base}/v1/audio/speech"
+    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {args.api_key}"}
+
+    print(f"Model: {args.model}")
+    print(f"Text: {args.text}")
+    print(f"Payload keys: {sorted(payload)}")
+
+    if args.stream:
+        output_path = args.output or "ming_output.pcm"
+        with httpx.Client(timeout=300.0) as client:
+            with client.stream("POST", api_url, json=payload, headers=headers) as response:
+                if response.status_code != 200:
+                    print(f"Error: {response.status_code}")
+                    print(response.read().decode())
+                    raise SystemExit(1)
+                with open(output_path, "wb") as f:
+                    for chunk in response.iter_bytes():
+                        f.write(chunk)
+        print(f"Streamed PCM audio to: {output_path}")
+        return
+
+    with httpx.Client(timeout=300.0) as client:
+        response = client.post(api_url, json=payload, headers=headers)
+
+    if response.status_code != 200:
+        print(f"Error: {response.status_code}")
+        print(response.text)
+        raise SystemExit(1)
+
+    # Guard against a JSON error body delivered with HTTP 200; bodies that are not UTF-8 are audio.
+    try:
+        text = response.content.decode("utf-8")
+    except UnicodeDecodeError:
+        text = ""
+    if text.startswith('{"error"'):
+        print(f"Error: {text}")
+        raise SystemExit(1)
+
+    output_path = args.output or "ming_output.wav"
+    with open(output_path, "wb") as f:
+        f.write(response.content)
+    print(f"Audio saved to: {output_path}")
+
+
+def main():
+    """Parse the command line, run one TTS request, and turn input errors into a clean exit."""
+    cli = argparse.ArgumentParser(description="OpenAI-compatible client for Ming-omni-tts via /v1/audio/speech")
+    add = cli.add_argument
+
+    # Connection and required input.
+    add("--api-base", default=DEFAULT_API_BASE, help="API base URL")
+    add("--api-key", default=DEFAULT_API_KEY, help="API key")
+    add("--model", "-m", default=DEFAULT_MODEL, help="Model name or path")
+    add("--text", required=True, help="Text to synthesize")
+    add(
+        "--task-type",
+        choices=["CustomVoice", "VoiceDesign", "Base"],
+        default=None,
+        help="Optional compatibility task type. Ming accepts the same field but primarily uses prompt metadata.",
+    )
+    # Voice and style controls.
+    add(
+        "--voice",
+        default=None,
+        help="Maps to Ming `IP` when using built-in character voices, or to an uploaded voice sample name",
+    )
+    add("--dialect", default=None, help="Maps to Ming `方言`")
+    add("--instructions", default=None, help="Free-form Ming instruction string")
+    add("--instruction-json", default=None, help='Structured Ming instruction JSON, for example \'{"情感":"高兴"}\'')
+    # Cloning inputs.
+    add(
+        "--ref-audio",
+        action="append",
+        default=None,
+        help="Reference audio path, URL, or data URL. Repeat for podcast-style multi-speaker prompts.",
+    )
+    add("--ref-text", default=None, help="Reference transcript for cloning")
+    add(
+        "--speaker-embedding", default=None, help="Path to a JSON file containing a 192-d speaker embedding"
+    )
+    # Generation and output options.
+    add("--max-new-tokens", type=int, default=None, help="Override ming_max_decode_steps")
+    add("--stream", action="store_true", help="Enable streaming PCM output")
+    formats = ["wav", "mp3", "flac", "pcm", "aac", "opus"]
+    add("--response-format", default="wav", choices=formats, help="Audio format when not streaming")
+    add("--output", "-o", default=None, help="Output file path")
+
+    parsed = cli.parse_args()
+    try:
+        run_tts(parsed)
+    except (FileNotFoundError, ValueError, json.JSONDecodeError) as exc:
+        raise SystemExit(f"Error: {exc}") from exc
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/online_serving/ming_tts/run_curl.sh b/examples/online_serving/ming_tts/run_curl.sh
new file mode 100755
index 00000000000..92762462e25
--- /dev/null
+++ b/examples/online_serving/ming_tts/run_curl.sh
@@ -0,0 +1,217 @@
+#!/bin/bash
+# Common curl examples for Ming-omni-tts via /v1/audio/speech.
+#
+# Usage:
+#   ./run_curl.sh basic
+#   ./run_curl.sh style
+#   ./run_curl.sh ip
+#   REF_AUDIO=/path/to/ref.wav ./run_curl.sh emotion
+#   REF_AUDIO=/path/to/ref.wav ./run_curl.sh dialect
+#   REF_AUDIO=/path/to/ref.wav REF_TEXT="参考文本" ./run_curl.sh zero_shot
+#   REF_AUDIO=/path/to/speaker1.wav REF_AUDIO_2=/path/to/speaker2.wav REF_TEXT="speaker_1:... speaker_2:..." ./run_curl.sh podcast
+#   REF_AUDIO=/path/to/mix_ref.wav ./run_curl.sh speech_bgm
+#   REF_AUDIO=/path/to/mix_ref.wav ./run_curl.sh speech_sound
+#   REF_AUDIO=/path/to/ref.wav REF_TEXT="参考文本" ./run_curl.sh clone_ref_audio
+#   SPEAKER_EMBEDDING=/path/to/ming_embedding.json ./run_curl.sh clone_embedding
+#   ./run_curl.sh stream
+
+set -euo pipefail
+
+MODE="${1:-basic}"
+HOST="${HOST:-localhost}"
+PORT="${PORT:-8091}"
+MODEL="${MODEL:-inclusionAI/Ming-omni-tts-0.5B}"
+API_URL="http://${HOST}:${PORT}/v1/audio/speech"
+TEXT="${TEXT:-你好，这是 Ming 在线语音合成测试。}"
+OUTPUT="${OUTPUT:-ming_output.wav}"
+STREAM_OUTPUT="${STREAM_OUTPUT:-ming_output.pcm}"
+REF_AUDIO="${REF_AUDIO:-}"
+REF_AUDIO_2="${REF_AUDIO_2:-}"
+REF_TEXT="${REF_TEXT:-}"
+SPEAKER_EMBEDDING="${SPEAKER_EMBEDDING:-}"
+
+build_payload() {
+    # Print the JSON body for a /v1/audio/speech request on stdout.
+    # Args: model, text, voice, instructions, task_type, ref_audio_path,
+    # ref_text, speaker_embedding_path, stream ("true" enables streaming) and an
+    # optional second ref_audio_path. An empty string leaves that optional field out.
+    # Values are handed to the embedded Python through environment variables
+    # set for that single command.
+    MODEL="$1" \
+    TEXT="$2" \
+    VOICE="$3" \
+    INSTRUCTIONS="$4" \
+    TASK_TYPE="$5" \
+    REF_AUDIO_PATH="$6" \
+    REF_TEXT="$7" \
+    SPEAKER_EMBEDDING_PATH="$8" \
+    STREAM="$9" \
+    REF_AUDIO_PATH_2="${10:-}" \
+    python - <<'PY'
+import base64
+import json
+import mimetypes
+import os
+import pathlib
+import sys
+
+env = os.environ
+payload = {
+    "model": env["MODEL"],
+    "input": env["TEXT"],
+}
+
+# Optional string fields are copied only when non-empty.
+for field, variable in (("voice", "VOICE"), ("instructions", "INSTRUCTIONS"), ("task_type", "TASK_TYPE")):
+    if env[variable]:
+        payload[field] = env[variable]
+
+# Reference audio files are inlined as base64 data: URLs.
+ref_audio_items = []
+for raw_path in (env["REF_AUDIO_PATH"], env["REF_AUDIO_PATH_2"]):
+    if not raw_path:
+        continue
+    path = pathlib.Path(raw_path)
+    mime_type = mimetypes.guess_type(path.name)[0] or "audio/wav"
+    data = base64.b64encode(path.read_bytes()).decode("utf-8")
+    ref_audio_items.append(f"data:{mime_type};base64,{data}")
+# A single reference is sent as a string, two as a list.
+if len(ref_audio_items) == 1:
+    payload["ref_audio"] = ref_audio_items[0]
+elif ref_audio_items:
+    payload["ref_audio"] = ref_audio_items
+
+if env["REF_TEXT"]:
+    payload["ref_text"] = env["REF_TEXT"]
+if env["SPEAKER_EMBEDDING_PATH"]:
+    path = pathlib.Path(env["SPEAKER_EMBEDDING_PATH"])
+    embedding = json.loads(path.read_text(encoding="utf-8"))
+    if not isinstance(embedding, list):
+        raise SystemExit("speaker embedding file must contain a JSON list")
+    payload["speaker_embedding"] = embedding
+
+# Streaming requests ask for raw PCM; everything else asks for WAV.
+if env["STREAM"] == "true":
+    payload["stream"] = True
+    payload["response_format"] = "pcm"
+else:
+    payload["response_format"] = "wav"
+
+print(json.dumps(payload, ensure_ascii=False))
+PY
+}
+
+require_file() {
+    # Exit with status 1 unless $1 names an existing regular file; $2 is the
+    # variable name shown in the error message.
+    local candidate="$1" label="$2"
+    if [ -z "$candidate" ]; then
+        echo "Missing ${label}" >&2
+        exit 1
+    elif [ ! -f "$candidate" ]; then
+        echo "File not found for ${label}: $candidate" >&2
+        exit 1
+    fi
+}
+
+base_headers=(
+    -H "Content-Type: application/json"
+    -H "Authorization: Bearer EMPTY"
+)
+
+post_payload() {
+    # POST the JSON payload ($1) and write the response body to $2. --fail makes
+    # HTTP errors return non-zero instead of saving the error JSON as "audio".
+    local payload_file status=0
+    payload_file="$(mktemp)"
+    printf '%s' "$1" > "$payload_file"
+    curl --fail --show-error -X POST "$API_URL" "${base_headers[@]}" \
+        --data-binary "@${payload_file}" --output "$2" || status=$?
+    rm -f "$payload_file"  # runs on failure too, unlike a RETURN trap under `set -e`
+    return "$status"
+}
+
+case "$MODE" in
+    basic)  # text only: no voice, instructions or reference audio
+        PAYLOAD="$(build_payload "$MODEL" "$TEXT" "" "" "" "" "" "" "false")"
+        post_payload "$PAYLOAD" "$OUTPUT"
+        ;;
+    style)  # free-form style instruction string
+        PAYLOAD="$(build_payload "$MODEL" "$TEXT" "" "轻柔的ASMR耳语，慢速，贴近麦克风" "" "" "" "" "false")"
+        post_payload "$PAYLOAD" "$OUTPUT"
+        ;;
+    ip)  # built-in character label sent as `voice`
+        PAYLOAD="$(build_payload "$MODEL" "$TEXT" "" "" "" "" "" "" "false")"
+        post_payload "$PAYLOAD" "$OUTPUT"
+        ;;
+    emotion)  # reference audio plus a structured (JSON) emotion instruction
+        require_file "$REF_AUDIO" "REF_AUDIO"
+        PAYLOAD="$(build_payload "$MODEL" "$TEXT" "" '{"情感":"高兴"}' "" "$REF_AUDIO" "" "" "false")"
+        post_payload "$PAYLOAD" "$OUTPUT"
+        ;;
+    dialect)  # reference audio; `language` is patched into the payload by the second Python step
+        require_file "$REF_AUDIO" "REF_AUDIO"
+        PAYLOAD="$(build_payload "$MODEL" "$TEXT" "" "" "" "$REF_AUDIO" "" "" "false")"
+        PAYLOAD="$(TEXT="$PAYLOAD" python - <<'PY'
+import json
+import os
+payload = json.loads(os.environ["TEXT"])
+payload["language"] = "广粤话"
+print(json.dumps(payload, ensure_ascii=False))
+PY
+)"
+        post_payload "$PAYLOAD" "$OUTPUT"
+        ;;
+    zero_shot)  # reference audio plus its transcript, task_type "Base"
+        require_file "$REF_AUDIO" "REF_AUDIO"
+        if [ -z "$REF_TEXT" ]; then
+            echo "Missing REF_TEXT" >&2
+            exit 1
+        fi
+        PAYLOAD="$(build_payload "$MODEL" "$TEXT" "" "" "Base" "$REF_AUDIO" "$REF_TEXT" "" "false")"
+        post_payload "$PAYLOAD" "$OUTPUT"
+        ;;
+    podcast)  # two reference audios (sent as a list) plus the transcript
+        require_file "$REF_AUDIO" "REF_AUDIO"
+        require_file "$REF_AUDIO_2" "REF_AUDIO_2"
+        if [ -z "$REF_TEXT" ]; then
+            echo "Missing REF_TEXT" >&2
+            exit 1
+        fi
+        PAYLOAD="$(build_payload "$MODEL" "$TEXT" "" "" "Base" "$REF_AUDIO" "$REF_TEXT" "" "false" "$REF_AUDIO_2")"
+        post_payload "$PAYLOAD" "$OUTPUT"
+        ;;
+    speech_bgm)  # reference audio plus a {"BGM": ...} instruction
+        require_file "$REF_AUDIO" "REF_AUDIO"
+        PAYLOAD="$(build_payload "$MODEL" "$TEXT" "" '{"BGM":"舒缓的背景音乐"}' "" "$REF_AUDIO" "" "" "false")"
+        post_payload "$PAYLOAD" "$OUTPUT"
+        ;;
+    speech_sound)  # reference audio plus a {"BGM": {"ENV": ...}} instruction
+        require_file "$REF_AUDIO" "REF_AUDIO"
+        PAYLOAD="$(build_payload "$MODEL" "$TEXT" "" '{"BGM":{"ENV":"轻微的环境声"}}' "" "$REF_AUDIO" "" "" "false")"
+        post_payload "$PAYLOAD" "$OUTPUT"
+        ;;
+    clone_ref_audio)  # builds the same payload as zero_shot
+        require_file "$REF_AUDIO" "REF_AUDIO"
+        if [ -z "$REF_TEXT" ]; then
+            echo "Missing REF_TEXT" >&2
+            exit 1
+        fi
+        PAYLOAD="$(build_payload "$MODEL" "$TEXT" "" "" "Base" "$REF_AUDIO" "$REF_TEXT" "" "false")"
+        post_payload "$PAYLOAD" "$OUTPUT"
+        ;;
+    clone_embedding)  # speaker embedding JSON file instead of reference audio
+        require_file "$SPEAKER_EMBEDDING" "SPEAKER_EMBEDDING"
+        PAYLOAD="$(build_payload "$MODEL" "$TEXT" "" "" "Base" "" "" "$SPEAKER_EMBEDDING" "false")"
+        post_payload "$PAYLOAD" "$OUTPUT"
+        ;;
+    stream)  # stream=true; the PCM bytes go to $STREAM_OUTPUT
+        PAYLOAD="$(build_payload "$MODEL" "$TEXT" "" "平静，普通话" "" "" "" "" "true")"
+        post_payload "$PAYLOAD" "$STREAM_OUTPUT"
+        ;;
+    *)  # anything else: print the supported modes and fail
+        echo "Unknown mode: $MODE" >&2
+        echo "Supported: basic, style, ip, emotion, dialect, zero_shot, podcast, speech_bgm, speech_sound, clone_ref_audio, clone_embedding, stream" >&2
+        exit 1
+        ;;
+esac
diff --git a/examples/online_serving/ming_tts/run_server.sh b/examples/online_serving/ming_tts/run_server.sh
new file mode 100755
index 00000000000..a35d4abe512
--- /dev/null
+++ b/examples/online_serving/ming_tts/run_server.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+# Launch vLLM-Omni server for Ming-omni-tts.
+#
+# Usage:
+#   ./run_server.sh
+#   PORT=8000 ./run_server.sh
+
+set -e
+
+MODEL="${MODEL:-inclusionAI/Ming-omni-tts-0.5B}"
+PORT="${PORT:-8091}"
+STAGE_CONFIG="${STAGE_CONFIG:-vllm_omni/model_executor/stage_configs/ming_tts_async_chunk.yaml}"
+
+echo "Starting Ming-omni-tts server with model: $MODEL"
+echo "Stage config: $STAGE_CONFIG"
+
+vllm-omni serve "$MODEL" \
+    --stage-configs-path "$STAGE_CONFIG" \
+    --host 0.0.0.0 \
+    --port "$PORT" \
+    --enforce-eager \
+    --omni
diff --git a/tests/distributed/omni_connectors/test_chunk_transfer_adapter.py b/tests/distributed/omni_connectors/test_chunk_transfer_adapter.py
index 256e3e0a3f8..03df9ad4eb0 100644
--- a/tests/distributed/omni_connectors/test_chunk_transfer_adapter.py
+++ b/tests/distributed/omni_connectors/test_chunk_transfer_adapter.py
@@ -122,6 +122,31 @@ def test_load_poll(build_adapter):
     assert "req-1" not in adapter._pending_load_reqs
 
 
+def test_generation_load_preserves_payload_metadata(build_adapter):
+    adapter, connector = build_adapter(stage_id=1, model_mode="generation")
+    request = _req("req-1", RequestStatus.WAITING, external_req_id="external-1")
+    payload = {
+        "code_predictor_codes": [0],
+        "left_context_size": 3,
+        "ming_latent_patches": torch.ones((10, 4, 64), dtype=torch.float32),
+        "ming_request_id": "external-1",
+        "ming_chunk_id": 7,
+        "finished": torch.tensor(False),
+    }
+    connector.get.return_value = (payload, 16)  # what the connector hands back when polled
+
+    adapter._poll_single_request(request)
+
+    assert request.prompt_token_ids == [0]  # equals payload["code_predictor_codes"]
+    assert request.additional_information["left_context_size"] == 3
+    assert request.additional_information["ming_request_id"] == "external-1"
+    assert request.additional_information["ming_chunk_id"] == 7
+    assert request.additional_information["ming_latent_patches"].shape == (10, 4, 64)
+    assert "code_predictor_codes" not in request.additional_information  # not copied into the metadata
+    assert "finished" not in request.additional_information  # the finished flag is not forwarded either
+    assert request.num_computed_tokens == 0
+
+
 def test_save_async(build_adapter):
     adapter, _ = build_adapter(stage_id=1)
     request = _req("req-1", RequestStatus.WAITING, external_req_id="external-1")
diff --git a/tests/e2e/offline_inference/test_ming_tts.py b/tests/e2e/offline_inference/test_ming_tts.py
new file mode 100644
index 00000000000..128b84e2896
--- /dev/null
+++ b/tests/e2e/offline_inference/test_ming_tts.py
@@ -0,0 +1,231 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""End-to-end offline inference tests for Ming-omni-tts."""
+
+import asyncio
+import os
+import uuid
+from pathlib import Path
+
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1"
+
+import numpy as np
+import pytest
+import torch
+from transformers import AutoTokenizer
+from vllm import SamplingParams
+
+from tests.utils import hardware_test
+from vllm_omni import AsyncOmni, Omni
+from vllm_omni.model_executor.models.ming_tts.config_ming_tts import (
+    KEY_MAX_DECODE_STEPS,
+    SAMPLE_RATE,
+    TEXT_EOS_TOKEN_ID,
+)
+from vllm_omni.model_executor.models.ming_tts.prompt_builder import build_ming_dense_prompt
+
+MODEL = "inclusionAI/Ming-omni-tts-0.5B"
+STAGE_CONFIG = str(
+    Path(__file__).parent.parent.parent.parent / "vllm_omni" / "model_executor" / "stage_configs" / "ming_tts.yaml"
+)
+STREAM_STAGE_CONFIG = str(
+    Path(__file__).parent.parent.parent.parent
+    / "vllm_omni"
+    / "model_executor"
+    / "stage_configs"
+    / "ming_tts_async_chunk.yaml"
+)
+TEST_TEXT = "我会一直在这里陪着你，直到你慢慢地沉入那个最温柔的梦里。"
+TEST_INSTRUCTION = "轻柔的ASMR耳语，慢速，贴近麦克风"
+MIN_AUDIO_SAMPLES = 1000
+
+
+def _build_prompt(
+    *,
+    text: str = TEST_TEXT,
+    instruction=TEST_INSTRUCTION,
+    use_zero_spk_emb: bool = True,
+) -> dict:
+    """Build a Ming dense prompt for ``text`` with ``KEY_MAX_DECODE_STEPS`` set to 200."""
+    return build_ming_dense_prompt(
+        AutoTokenizer.from_pretrained(MODEL, trust_remote_code=False),
+        prompt="Please generate speech based on the following description.\n",
+        text=text,
+        instruction=instruction,
+        runtime_controls={KEY_MAX_DECODE_STEPS: 200},
+        use_zero_spk_emb=use_zero_spk_emb,
+    )
+
+
+def _sampling_params_list() -> list[SamplingParams]:
+    """Return two greedy (temperature 0) SamplingParams, presumably one per pipeline stage."""
+    first = SamplingParams(
+        temperature=0.0,
+        max_tokens=201,
+        stop_token_ids=[int(TEXT_EOS_TOKEN_ID)],
+    )
+    second = SamplingParams(temperature=0.0, max_tokens=1)
+    return [first, second]
+
+
+def _flatten_audio(audio) -> torch.Tensor:
+    """Collapse a tensor-like, or a list of them, into one 1-D float32 CPU tensor."""
+    items = audio if isinstance(audio, list) else [audio]
+    flat = [torch.as_tensor(item, dtype=torch.float32).reshape(-1).cpu() for item in items]
+    chunks = [piece for piece in flat if piece.numel() > 0]
+    if chunks:
+        return torch.cat(chunks, dim=0)
+    return torch.zeros((0,), dtype=torch.float32)
+
+
+def _extract_audio(multimodal_output: dict) -> torch.Tensor:
+    """Return the flattened ``audio`` entry; raise RuntimeError when it is missing or empty."""
+    if (raw_audio := multimodal_output.get("audio")) is None:
+        raise RuntimeError("Expected multimodal_output['audio']")
+    flat = _flatten_audio(raw_audio)
+    if flat.numel() > 0:
+        return flat
+    raise RuntimeError("Generated audio waveform is empty")
+
+
+def _extract_sample_rate(multimodal_output: dict) -> int:
+    """Return ``sr`` as an int, using the last list entry and unwrapping ``.item()`` if present."""
+    if (rate := multimodal_output.get("sr")) is None:
+        raise RuntimeError("Expected multimodal_output['sr']")
+    if isinstance(rate, list):
+        rate = rate[-1]
+    if hasattr(rate, "item"):
+        rate = rate.item()
+    return int(rate)
+
+
+@pytest.mark.advanced_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "L4"}, num_cards=1)
+def test_ming_tts_offline_basic() -> None:
+    """Blocking ``Omni.generate`` must yield non-silent 1-D audio reported at ``SAMPLE_RATE``."""
+    omni = Omni(
+        model=MODEL,
+        stage_configs_path=STAGE_CONFIG,
+        stage_init_timeout=300,
+        enforce_eager=True,
+    )
+    try:
+        outputs = omni.generate(
+            prompts=[_build_prompt()],
+            sampling_params_list=_sampling_params_list(),
+            py_generator=False,
+        )
+        # Only the output tagged as audio is inspected; other outputs are ignored.
+        final_output = next((item for item in outputs if item.final_output_type == "audio"), None)
+        assert final_output is not None, "No final audio output produced"
+        multimodal_output = final_output.multimodal_output or {}
+        waveform = _extract_audio(multimodal_output)
+        sample_rate = _extract_sample_rate(multimodal_output)
+        assert waveform.ndim == 1
+        assert waveform.shape[0] == waveform.numel()  # implied by ndim == 1; kept as an explicit guard
+        assert waveform.numel() > MIN_AUDIO_SAMPLES
+        assert np.max(np.abs(waveform.numpy())) > 0.01, "Audio appears silent"
+        assert sample_rate == SAMPLE_RATE, f"Expected Ming output sample rate {SAMPLE_RATE}, got {sample_rate}"
+    finally:
+        omni.close()  # always close the engine, even when an assertion fails
+
+
+@pytest.mark.advanced_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "L4"}, num_cards=1)
+def test_ming_tts_speaker_conditioning_differs() -> None:
+    """A style-instruction prompt and an ``IP`` prompt for the same text must give different audio."""
+    omni = Omni(
+        model=MODEL,
+        stage_configs_path=STAGE_CONFIG,
+        stage_init_timeout=300,
+        enforce_eager=True,
+    )
+    try:
+        style_outputs = omni.generate(  # default prompt: free-form TEST_INSTRUCTION
+            prompts=[_build_prompt()],
+            sampling_params_list=_sampling_params_list(),
+            py_generator=False,
+        )
+        ip_outputs = omni.generate(  # same text, structured {"IP": ...} instruction
+            prompts=[_build_prompt(text=TEST_TEXT, instruction={"IP": "灵小甄"}, use_zero_spk_emb=True)],
+            sampling_params_list=_sampling_params_list(),
+            py_generator=False,
+        )
+
+        style_final_output = next((item for item in style_outputs if item.final_output_type == "audio"), None)
+        ip_final_output = next((item for item in ip_outputs if item.final_output_type == "audio"), None)
+        assert style_final_output is not None, "No style audio output produced"
+        assert ip_final_output is not None, "No IP audio output produced"
+
+        style_waveform = _extract_audio(style_final_output.multimodal_output or {})
+        ip_waveform = _extract_audio(ip_final_output.multimodal_output or {})
+        assert style_waveform.numel() > MIN_AUDIO_SAMPLES
+        assert ip_waveform.numel() > MIN_AUDIO_SAMPLES
+        assert np.max(np.abs(style_waveform.numpy())) > 0.01, "Style audio appears silent"
+        assert np.max(np.abs(ip_waveform.numpy())) > 0.01, "IP audio appears silent"
+
+        # Compare only the common prefix: the two waveforms may differ in length.
+        overlap = min(int(style_waveform.numel()), int(ip_waveform.numel()))
+        mean_abs_diff = torch.mean(torch.abs(style_waveform[:overlap] - ip_waveform[:overlap])).item()
+        assert style_waveform.shape != ip_waveform.shape or mean_abs_diff > 1e-4, (
+            "Speaker-conditioned outputs should differ, but style and IP waveforms were effectively identical"
+        )
+    finally:
+        omni.close()  # always close the engine, even when an assertion fails
+
+
+@pytest.mark.advanced_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "L4"}, num_cards=1)
+def test_ming_tts_offline_streaming() -> None:
+    """Stream Ming audio through AsyncOmni (async_chunk config) and check the joined waveform."""
+
+    async def _run() -> None:
+        async_omni = AsyncOmni(
+            model=MODEL,
+            stage_configs_path=STREAM_STAGE_CONFIG,
+            stage_init_timeout=300,
+            enforce_eager=True,
+        )
+        try:
+            all_audio_chunks = []
+            accumulated_samples = 0  # samples collected so far, across all outputs
+            chunk_idx = 0  # number of outputs that carried audio
+            sample_rate = None
+            async for stage_output in async_omni.generate(
+                prompt=_build_prompt(),
+                request_id=str(uuid.uuid4()),
+                sampling_params_list=_sampling_params_list(),
+            ):
+                multimodal_output = stage_output.multimodal_output or {}
+                audio = multimodal_output.get("audio")
+                if "sr" in multimodal_output:
+                    sample_rate = _extract_sample_rate(multimodal_output)
+                if audio is None:
+                    continue
+                finished = stage_output.finished
+                if isinstance(audio, torch.Tensor):
+                    if finished:  # NOTE(review): assumes the final tensor is cumulative; keeps the unseen tail
+                        audio_chunk = audio[accumulated_samples:].float().detach().cpu()
+                    else:
+                        audio_chunk = audio.float().detach().cpu()
+                elif isinstance(audio, list):  # NOTE(review): assumes one list entry per audio output - confirm
+                    audio_chunk = torch.as_tensor(audio[chunk_idx], dtype=torch.float32).reshape(-1).cpu()
+                else:
+                    audio_chunk = torch.as_tensor(audio, dtype=torch.float32).reshape(-1).cpu()
+                accumulated_samples += int(audio_chunk.numel())
+                chunk_idx += 1
+                if audio_chunk.numel() > 0:
+                    all_audio_chunks.append(audio_chunk)
+            assert all_audio_chunks, "No streaming audio chunks received"
+            waveform = torch.cat(all_audio_chunks, dim=0)
+            assert waveform.numel() > MIN_AUDIO_SAMPLES
+            assert np.max(np.abs(waveform.numpy())) > 0.01, "Audio appears silent"
+            assert sample_rate is not None, "Streaming path did not return a sample rate"
+            assert sample_rate == SAMPLE_RATE, f"Expected Ming output sample rate {SAMPLE_RATE}, got {sample_rate}"
+        finally:
+            async_omni.shutdown()  # always shut the engine down, even when an assertion fails
+
+    asyncio.run(_run())
diff --git a/tests/e2e/online_serving/test_ming_tts.py b/tests/e2e/online_serving/test_ming_tts.py
new file mode 100644
index 00000000000..6b3e21c09bd
--- /dev/null
+++ b/tests/e2e/online_serving/test_ming_tts.py
@@ -0,0 +1,95 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""E2E online-serving tests for Ming-omni-tts."""
+
+import concurrent.futures
+import io
+import os
+import wave
+from pathlib import Path
+
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0"
+
+import pytest
+
+from tests.conftest import OmniServerParams
+from tests.utils import hardware_test
+from vllm_omni.model_executor.models.ming_tts.config_ming_tts import SAMPLE_RATE
+
+MODEL = "inclusionAI/Ming-omni-tts-0.5B"
+STAGE_CONFIG = str(  # absolute path of the async-chunk stage config
+    Path(__file__).parent.parent.parent.parent  # online_serving -> e2e -> tests -> repository root
+    / "vllm_omni"
+    / "model_executor"
+    / "stage_configs"
+    / "ming_tts_async_chunk.yaml"
+)
+
+SERVER_PARAMS = [  # fed to the ``omni_server`` fixture through indirect parametrization
+    pytest.param(
+        OmniServerParams(
+            model=MODEL,
+            stage_config_path=STAGE_CONFIG,
+            server_args=["--enforce-eager", "--disable-log-stats"],
+        ),
+        id="async_chunk",
+    )
+]
+
+
+def _wav_sample_rate(audio_bytes: bytes) -> int:  # frame rate stored in the WAV header
+    with wave.open(io.BytesIO(audio_bytes), mode="rb") as reader:
+        return int(reader.getframerate())
+
+
+@pytest.mark.advanced_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "L4"}, num_cards=1)
+@pytest.mark.parametrize("omni_server", SERVER_PARAMS, indirect=True)
+def test_ming_tts_audio_speech_non_streaming(omni_server, openai_client) -> None:
+    """Send two concurrent non-streaming /v1/audio/speech requests and check each WAV's sample rate."""
+    request_config = {
+        "model": omni_server.model,
+        "input": "我会一直在这里陪着你，直到你慢慢地沉入那个最温柔的梦里。",  # overridden per request in _send_one
+        "stream": False,
+        "response_format": "wav",
+    }
+    request_inputs = [
+        "我会一直在这里陪着你，直到你慢慢地沉入那个最温柔的梦里。",
+        "这款产品的名字，叫变态坑爹牛肉丸。",
+    ]
+
+    def _send_one(text):  # returns (text, response) so results can be matched back to their input
+        per_request_config = {**request_config, "input": text}
+        responses = openai_client.send_audio_speech_request(per_request_config)
+        assert len(responses) == 1
+        return text, responses[0]
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=len(request_inputs)) as executor:
+        futures = [executor.submit(_send_one, text) for text in request_inputs]
+        results = [future.result() for future in concurrent.futures.as_completed(futures)]
+
+    assert {text for text, _ in results} == set(request_inputs)  # as_completed order is not submit order
+    assert len(results) == len(request_inputs)
+    for _, response in results:
+        assert response.audio_bytes is not None, "Expected WAV bytes from /v1/audio/speech"
+        sample_rate = _wav_sample_rate(response.audio_bytes)
+        assert sample_rate == SAMPLE_RATE, f"Expected Ming output sample rate {SAMPLE_RATE}, got {sample_rate}"
+
+
+@pytest.mark.advanced_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "L4"}, num_cards=1)
+@pytest.mark.parametrize("omni_server", SERVER_PARAMS, indirect=True)
+def test_ming_tts_audio_speech_streaming(omni_server, openai_client) -> None:
+    """Send one streaming /v1/audio/speech request with a named voice."""
+    request_config = {
+        "model": omni_server.model,
+        "input": "这款产品的名字，叫变态坑爹牛肉丸。",
+        "voice": "灵小甄",
+        "stream": True,
+        "response_format": "wav",
+    }
+    openai_client.send_audio_speech_request(request_config)  # NOTE(review): response is not asserted on
diff --git a/tests/engine/test_async_omni_engine_input.py b/tests/engine/test_async_omni_engine_input.py
index 3700e426d42..f5433443a09 100644
--- a/tests/engine/test_async_omni_engine_input.py
+++ b/tests/engine/test_async_omni_engine_input.py
@@ -1,4 +1,7 @@
+from unittest.mock import Mock
+
 import pytest
+import torch
 from pytest_mock import MockerFixture
 from vllm.sampling_params import SamplingParams
 from vllm.v1.engine import EngineCoreRequest
@@ -88,3 +91,54 @@ def test_build_add_request_message_with_resumable_streaming(mocker: MockerFixtur
     assert msg["type"] == "streaming_update"
     input_processor.process_inputs.assert_called_once()
     assert input_processor.process_inputs.call_args.kwargs["resumable"] is True
+
+
+def test_build_add_request_message_uses_ingress_processed_prompt_for_additional_information():
+    engine = object.__new__(AsyncOmniEngine)  # skip __init__; only the attributes assigned below are set
+    params = SamplingParams(max_tokens=8)
+    engine.default_sampling_params_list = [params]
+    engine.stage_metadata = [{"stage_type": "llm"}]
+    engine.supported_tasks = ("speech",)
+
+    input_processor = Mock()
+    input_processor.process_inputs.return_value = _make_engine_core_request()
+    input_processor.input_preprocessor = Mock()
+    prompt_latents = torch.ones((4, 64), dtype=torch.float32)
+    processed_prompt = {  # returned by the mocked preprocessor; carries latents the raw prompt lacks
+        "prompt_token_ids": [1, 2, 3, 4],
+        "additional_information": {
+            "ming_prompt_latents": prompt_latents,
+            "global_request_id": ["req-1"],
+        },
+    }
+    input_processor.input_preprocessor.consume_last_processed_prompt.return_value = processed_prompt
+    engine.input_processor = input_processor
+
+    output_processor = Mock()
+    engine.output_processors = [output_processor]
+
+    raw_prompt = {  # additional_information is empty here on purpose
+        "prompt_token_ids": [1, 2, 3],
+        "additional_information": {},
+    }
+
+    msg = engine._build_add_request_message(
+        request_id="req-1",
+        prompt=raw_prompt,
+        sampling_params_list=[params],
+        final_stage_id=0,
+        arrival_time=0.0,
+    )
+
+    request = msg["prompt"]
+    assert isinstance(request, OmniEngineCoreRequest)
+    assert request.additional_information is not None
+    assert request.additional_information.entries["ming_prompt_latents"].tensor_shape == [4, 64]
+    input_processor.input_preprocessor.consume_last_processed_prompt.assert_called_once()
+    output_processor.add_request.assert_called_once()
+    call_kwargs = output_processor.add_request.call_args.kwargs
+    assert call_kwargs["request"] is request  # the same object that is placed in the message
+    assert call_kwargs["prompt"] is None
+    assert call_kwargs["parent_req"] is None
+    assert call_kwargs["request_index"] == 0
+    assert call_kwargs["queue"] is None
diff --git a/tests/entrypoints/openai_api/test_serving_speech.py b/tests/entrypoints/openai_api/test_serving_speech.py
index b78d62d9eda..816d87e592a 100644
--- a/tests/entrypoints/openai_api/test_serving_speech.py
+++ b/tests/entrypoints/openai_api/test_serving_speech.py
@@ -6,6 +6,7 @@
 from inspect import Signature, signature
 from pathlib import Path
 from types import SimpleNamespace
+from unittest.mock import AsyncMock, MagicMock
 
 import numpy as np
 import pytest
@@ -1929,6 +1930,48 @@ def fish_speech_server(mocker: MockerFixture):
     server.shutdown()
 
 
+@pytest.fixture
+def ming_speech_server(mocker: MockerFixture):
+    """Yield a speech server wired to a mocked Ming engine client.
+
+    Speaker/codec loading and ``_build_ming_prompt`` are stubbed; the server is shut down on teardown.
+    """
+    mocker.patch.object(OmniOpenAIServingSpeech, "_load_supported_speakers", return_value={"灵小甄"})
+    mocker.patch.object(OmniOpenAIServingSpeech, "_load_codec_frame_rate", return_value=None)
+
+    mock_engine_client = mocker.MagicMock()
+    mock_engine_client.errored = False
+    mock_engine_client.model_config = mocker.MagicMock(model="inclusionAI/Ming-omni-tts-0.5B")
+    mock_engine_client.default_sampling_params_list = [
+        SimpleNamespace(max_tokens=512, stop_token_ids=[]),
+        SimpleNamespace(max_tokens=1, stop_token_ids=[]),
+    ]
+    mock_engine_client.tts_batch_max_items = 32
+    mock_engine_client.generate = mocker.MagicMock(return_value="generator")
+    mock_engine_client.stage_configs = [
+        SimpleNamespace(
+            engine_args=SimpleNamespace(
+                model_stage="llm",
+                model_arch="MingTTSForConditionalGeneration",
+                worker_type="ar",
+            ),
+            tts_args={},
+        )
+    ]
+
+    mock_models = mocker.MagicMock()
+    mock_models.is_base_model.return_value = True
+
+    server = OmniOpenAIServingSpeech(
+        engine_client=mock_engine_client,
+        models=mock_models,
+        request_logger=mocker.MagicMock(),
+    )
+    server._build_ming_prompt = MagicMock(return_value={"prompt_token_ids": [1, 2, 3], "additional_information": {}})
+    yield server
+    server.shutdown()  # same teardown as the fish_speech_server fixture in this file
+
+
 class TestFishSpeechServing:
     def test_build_fish_prompt_normalizes_legacy_speaker_tags(self, fish_speech_server):
         tokenizer = _FakeFishTokenizer()
@@ -2065,6 +2108,341 @@ def test_create_speech_batch_allows_fish_text_only_items(self, fish_speech_serve
         fish_speech_server._generate_audio_bytes.assert_awaited_once()
 
 
+class TestMingSpeechServing:  # Ming TTS request validation, reference-audio handling and prompt building
+    class _FakeMingTokenizer:  # stub tokenizer: every distinct string maps to exactly one stable id
+        def __init__(self):
+            self._token_to_id = {
+                "<audioPatch>": 9001,
+                "<|vision_start|>": 9002,
+                "<|vision_pad|>": 9003,
+                "<|vision_end|>\n": 9004,
+            }
+            self._next = 100  # ids for previously unseen strings are handed out from here
+
+        def encode(self, text):
+            if text not in self._token_to_id:
+                self._token_to_id[text] = self._next
+                self._next += 1
+            return [self._token_to_id[text]]  # the whole text becomes a single token
+
+        def convert_tokens_to_ids(self, token):
+            if token not in self._token_to_id:
+                self._token_to_id[token] = self._next
+                self._next += 1
+            return self._token_to_id[token]
+
+    def test_protocol_accepts_ming_podcast_ref_audio_and_nested_embeddings(self):
+        request = OpenAICreateSpeechRequest(
+            input=" speaker_1:你好。\n speaker_2:你好。\n",
+            ref_audio=["data:audio/wav;base64,aaa", "data:audio/wav;base64,bbb"],
+            ref_text=" speaker_1:参考一。\n speaker_2:参考二。\n",
+            speaker_embedding=[[0.1] * 192, [0.2] * 192],
+        )
+
+        assert request.ref_audio == ["data:audio/wav;base64,aaa", "data:audio/wav;base64,bbb"]
+        assert request.speaker_embedding == [[0.1] * 192, [0.2] * 192]
+
+    def test_protocol_preserves_single_ming_ref_audio_and_flat_embedding(self):
+        single_ref = OpenAICreateSpeechRequest(
+            input="Hello",
+            ref_audio="data:audio/wav;base64,aaa",
+            ref_text="reference",
+        )
+        single_embedding = OpenAICreateSpeechRequest(
+            input="Hello",
+            speaker_embedding=[0.1] * 192,
+        )
+
+        assert single_ref.ref_audio == "data:audio/wav;base64,aaa"  # stays a plain string, not a list
+        assert single_embedding.speaker_embedding == [0.1] * 192  # stays flat, not nested
+
+    def test_validate_ming_podcast_rules(self, ming_speech_server):
+        valid = OpenAICreateSpeechRequest(
+            input=" speaker_1:你好。\n speaker_2:你好。\n",
+            ref_audio=["data:audio/wav;base64,aaa", "data:audio/wav;base64,bbb"],
+            ref_text=" speaker_1:参考一。\n speaker_2:参考二。\n",
+        )
+        one_clip = OpenAICreateSpeechRequest(
+            input=" speaker_1:你好。\n",
+            ref_audio=["data:audio/wav;base64,aaa"],
+            ref_text=" speaker_1:参考一。\n",
+        )
+        missing_ref_text = OpenAICreateSpeechRequest(
+            input=" speaker_1:你好。\n speaker_2:你好。\n",
+            ref_audio=["data:audio/wav;base64,aaa", "data:audio/wav;base64,bbb"],
+        )
+        mismatched_embeddings = OpenAICreateSpeechRequest(
+            input=" speaker_1:你好。\n speaker_2:你好。\n",
+            ref_audio=["data:audio/wav;base64,aaa", "data:audio/wav;base64,bbb"],
+            ref_text=" speaker_1:参考一。\n speaker_2:参考二。\n",
+            speaker_embedding=[[0.1] * 192],
+        )
+
+        assert ming_speech_server._validate_ming_tts_request(valid) is None  # None means "no validation error"
+        assert "at least two" in ming_speech_server._validate_ming_tts_request(one_clip)
+        assert "ref_text" in ming_speech_server._validate_ming_tts_request(missing_ref_text)
+        assert "one speaker embedding per ref_audio" in ming_speech_server._validate_ming_tts_request(
+            mismatched_embeddings
+        )
+
+    def test_validate_ming_single_speaker_clone_still_accepts_existing_shape(self, ming_speech_server):
+        request = OpenAICreateSpeechRequest(
+            input="Hello",
+            ref_audio="data:audio/wav;base64,aaa",
+            ref_text="reference text",
+        )
+
+        assert ming_speech_server._validate_ming_tts_request(request) is None
+
+    def test_resolve_ref_audio_many_preserves_order(self, ming_speech_server):
+        ming_speech_server._resolve_ref_audio = AsyncMock(
+            side_effect=[  # consumed one value per await, in await order
+                ([0.1, 0.2], 24000),
+                ([0.3, 0.4], 44100),
+            ]
+        )
+
+        resolved = asyncio.run(
+            ming_speech_server._resolve_ref_audio_many(["data:audio/wav;base64,aaa", "data:audio/wav;base64,bbb"])
+        )
+
+        assert resolved == [([0.1, 0.2], 24000), ([0.3, 0.4], 44100)]
+        ming_speech_server._resolve_ref_audio.assert_any_await("data:audio/wav;base64,aaa")
+        ming_speech_server._resolve_ref_audio.assert_any_await("data:audio/wav;base64,bbb")
+
+    def test_extract_ming_speaker_embeddings_uses_one_call_per_wav(self, ming_speech_server, mocker: MockerFixture):
+        calls = []
+
+        class _FakeExtractor:  # records every extract call instead of running a real model
+            def __init__(self, model, target_sr=16000):
+                self.model = model
+                self.target_sr = target_sr
+
+            def extract_from_waveform(self, waveform, sample_rate):
+                calls.append(
+                    {
+                        "model": self.model,
+                        "target_sr": self.target_sr,
+                        "shape": tuple(waveform.shape),
+                        "sample_rate": int(sample_rate),
+                    }
+                )
+                return torch.full((192,), float(len(calls)), dtype=torch.float32)  # value = call ordinal
+
+        mocker.patch(
+            "vllm_omni.model_executor.models.ming_tts.speaker_extractor.MingSpeakerEmbeddingExtractor",
+            _FakeExtractor,
+        )
+
+        embeddings = ming_speech_server._extract_ming_speaker_embeddings_from_ref_audio(
+            [
+                ([0.1, 0.2], 22050),
+                ([0.3, 0.4, 0.5], 44100),
+            ]
+        )
+
+        assert len(embeddings) == 2
+        assert embeddings[0] == [1.0] * 192
+        assert embeddings[1] == [2.0] * 192
+        assert calls == [
+            {
+                "model": "inclusionAI/Ming-omni-tts-0.5B",
+                "target_sr": 16000,
+                "shape": (1, 2),
+                "sample_rate": 22050,
+            },
+            {
+                "model": "inclusionAI/Ming-omni-tts-0.5B",
+                "target_sr": 16000,
+                "shape": (1, 3),
+                "sample_rate": 44100,
+            },
+        ]
+
+    def test_build_ming_prompt_handles_multi_speaker_podcast_inputs(self, ming_speech_server):
+        from vllm_omni.model_executor.models.ming_tts.config_ming_tts import KEY_SPEAKER_EMBEDDING
+
+        ming_speech_server._tts_tokenizer = self._FakeMingTokenizer()
+        request = OpenAICreateSpeechRequest(
+            input=" speaker_1:你好。\n speaker_2:你好。\n",
+            ref_audio=["data:audio/wav;base64,aaa", "data:audio/wav;base64,bbb"],
+            ref_text=" speaker_1:参考一。\n speaker_2:参考二。\n",
+            speaker_embedding=[[0.1] * 192, [0.2] * 192],
+        )
+
+        prompt = OmniOpenAIServingSpeech._build_ming_prompt(  # unbound call bypasses the fixture's instance-level stub
+            ming_speech_server,
+            request,
+            ref_audio_data=[
+                ([0.1] * 10, 44100),
+                ([0.2] * 20, 44100),
+            ],
+        )
+
+        info = prompt["additional_information"]
+        assert tuple(info[KEY_SPEAKER_EMBEDDING].shape) == (2, 192)
+        assert int(info["prompt_waveform_length"].item()) >= 30  # 10 + 20 input samples
+        assert info["prompt_text"] == " speaker_1:参考一。\n speaker_2:参考二。\n"
+        assert (
+            prompt["prompt_token_ids"].count(
+                ming_speech_server._tts_tokenizer.convert_tokens_to_ids("<|vision_start|>")
+            )
+            == 2
+        )
+
+    def test_build_ming_prompt_concatenates_podcast_waveforms_before_builder(
+        self, ming_speech_server, mocker: MockerFixture
+    ):
+        captured = {}
+
+        def _fake_build_ming_dense_prompt(*args, **kwargs):  # records the keyword arguments it receives
+            captured.update(kwargs)
+            return {"prompt_token_ids": [1], "additional_information": {}}
+
+        mocker.patch(
+            "vllm_omni.model_executor.models.ming_tts.prompt_builder.build_ming_dense_prompt",
+            side_effect=_fake_build_ming_dense_prompt,
+        )
+        ming_speech_server._tts_tokenizer = object()
+        request = OpenAICreateSpeechRequest(
+            input=" speaker_1:你好。\n speaker_2:你好。\n",
+            ref_audio=["data:audio/wav;base64,aaa", "data:audio/wav;base64,bbb"],
+            ref_text=" speaker_1:参考一。\n speaker_2:参考二。\n",
+            speaker_embedding=[[0.1] * 192, [0.2] * 192],
+        )
+
+        OmniOpenAIServingSpeech._build_ming_prompt(
+            ming_speech_server,
+            request,
+            ref_audio_data=[
+                ([0.1] * 10, 44100),
+                ([0.2] * 20, 44100),
+            ],
+        )
+
+        assert tuple(captured["prompt_waveform"].shape) == (1, 30)  # 10 + 20 samples in one row
+        assert captured["speaker_embedding"] == [[0.1] * 192, [0.2] * 192]
+        assert captured["prompt_text"] == " speaker_1:参考一。\n speaker_2:参考二。\n"
+
+    def test_build_ming_prompt_uses_single_ref_audio_as_speaker_only_without_ref_text(
+        self, ming_speech_server, mocker: MockerFixture
+    ):
+        captured = {}
+
+        def _fake_build_ming_dense_prompt(*args, **kwargs):  # records the keyword arguments it receives
+            captured.update(kwargs)
+            return {"prompt_token_ids": [1], "additional_information": {}}
+
+        mocker.patch(
+            "vllm_omni.model_executor.models.ming_tts.prompt_builder.build_ming_dense_prompt",
+            side_effect=_fake_build_ming_dense_prompt,
+        )
+        ming_speech_server._tts_tokenizer = object()
+        request = OpenAICreateSpeechRequest(
+            input="我竟然抢到了陈奕迅的演唱会门票！",
+            ref_audio="data:audio/wav;base64,aaa",
+            speaker_embedding=[0.1] * 192,
+            instructions='{"情感":"高兴"}',
+        )
+
+        OmniOpenAIServingSpeech._build_ming_prompt(
+            ming_speech_server,
+            request,
+            ref_audio_data=([0.1] * 10, 44100),
+        )
+
+        assert captured["prompt_waveform"] is None  # request has no ref_text, so no prompt waveform is passed
+        assert captured["prompt_text"] is None
+        assert captured["speaker_embedding"] == [0.1] * 192
+
+    def test_build_ming_prompt_keeps_single_ref_audio_waveform_with_ref_text(
+        self, ming_speech_server, mocker: MockerFixture
+    ):
+        captured = {}
+
+        def _fake_build_ming_dense_prompt(*args, **kwargs):  # records the keyword arguments it receives
+            captured.update(kwargs)
+            return {"prompt_token_ids": [1], "additional_information": {}}
+
+        mocker.patch(
+            "vllm_omni.model_executor.models.ming_tts.prompt_builder.build_ming_dense_prompt",
+            side_effect=_fake_build_ming_dense_prompt,
+        )
+        ming_speech_server._tts_tokenizer = object()
+        request = OpenAICreateSpeechRequest(
+            input="我们的愿景是构建未来服务业的数字化基础设施。",
+            ref_audio="data:audio/wav;base64,aaa",
+            ref_text="在此奉劝大家别乱打美白针。",
+            speaker_embedding=[0.1] * 192,
+        )
+
+        OmniOpenAIServingSpeech._build_ming_prompt(
+            ming_speech_server,
+            request,
+            ref_audio_data=([0.1] * 10, 44100),
+        )
+
+        assert tuple(captured["prompt_waveform"].shape) == (1, 10)
+        assert captured["prompt_text"] == "在此奉劝大家别乱打美白针。"
+        assert captured["speaker_embedding"] == [0.1] * 192
+
+    def test_prepare_speech_generation_sets_ming_stop_token(self, ming_speech_server):
+        from vllm_omni.model_executor.models.ming_tts.config_ming_tts import TEXT_EOS_TOKEN_ID
+
+        request = OpenAICreateSpeechRequest(
+            input="这款产品的名字，叫变态坑爹牛肉丸。",
+            voice="灵小甄",
+        )
+
+        request_id, generator, _ = asyncio.run(ming_speech_server._prepare_speech_generation(request))
+
+        assert request_id.startswith("speech-")
+        assert generator == "generator"  # the sentinel returned by the mocked engine_client.generate
+        sampling_params_list = ming_speech_server.engine_client.generate.call_args.kwargs["sampling_params_list"]
+        assert sampling_params_list[0].stop_token_ids == [int(TEXT_EOS_TOKEN_ID)]
+        assert sampling_params_list[0].max_tokens == 512
+        assert ming_speech_server.engine_client.default_sampling_params_list[0].stop_token_ids == []  # defaults intact
+        assert ming_speech_server.engine_client.default_sampling_params_list[0].max_tokens == 512
+
+    def test_prepare_speech_generation_overrides_ming_stage_max_tokens(self, ming_speech_server):
+        from vllm_omni.model_executor.models.ming_tts.config_ming_tts import TEXT_EOS_TOKEN_ID
+
+        request = OpenAICreateSpeechRequest(
+            input="这款产品的名字，叫变态坑爹牛肉丸。",
+            voice="灵小甄",
+            max_new_tokens=16,
+        )
+
+        request_id, generator, _ = asyncio.run(ming_speech_server._prepare_speech_generation(request))
+
+        assert request_id.startswith("speech-")
+        assert generator == "generator"
+        sampling_params_list = ming_speech_server.engine_client.generate.call_args.kwargs["sampling_params_list"]
+        assert sampling_params_list[0].stop_token_ids == [int(TEXT_EOS_TOKEN_ID)]
+        assert sampling_params_list[0].max_tokens == 17  # max_new_tokens + 1; presumably room for the stop token
+        assert ming_speech_server.engine_client.default_sampling_params_list[0].max_tokens == 512
+
+    def test_prepare_speech_generation_extracts_ming_single_ref_audio_speaker_embedding(
+        self, ming_speech_server, mocker: MockerFixture
+    ):
+        request = OpenAICreateSpeechRequest(
+            input="我竟然抢到了陈奕迅的演唱会门票！",
+            ref_audio="data:audio/wav;base64,aaa",
+            instructions='{"情感":"高兴"}',
+        )
+        ming_speech_server._resolve_ref_audio = AsyncMock(return_value=([0.1, 0.2], 44100))
+        ming_speech_server._extract_ming_speaker_embeddings_from_ref_audio = mocker.MagicMock(
+            return_value=[[0.3] * 192]
+        )
+
+        asyncio.run(ming_speech_server._prepare_speech_generation(request))
+
+        ming_speech_server._extract_ming_speaker_embeddings_from_ref_audio.assert_called_once_with(
+            [([0.1, 0.2], 44100)]
+        )
+        assert request.speaker_embedding == [0.3] * 192  # the request object itself is updated in place
+
+
 class TestWAVStreaming:
     """Integration tests for WAV format streaming."""
 
diff --git a/tests/model_executor/models/ming_tts/test_ming_tts_components.py b/tests/model_executor/models/ming_tts/test_ming_tts_components.py
new file mode 100644
index 00000000000..14c4c02db05
--- /dev/null
+++ b/tests/model_executor/models/ming_tts/test_ming_tts_components.py
@@ -0,0 +1,505 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from types import SimpleNamespace
+
+import pytest
+import torch
+import torch.nn as nn
+
+from vllm_omni.model_executor.models.ming_tts.audio_tokenizer.configuration_audio_vae import AudioVAEconfig
+from vllm_omni.model_executor.models.ming_tts.audio_tokenizer.istft import ISTFT, ISTFTHead
+from vllm_omni.model_executor.models.ming_tts.audio_tokenizer.modeling_audio_vae import AudioVAE
+from vllm_omni.model_executor.models.ming_tts.audio_tokenizer.vae_modules import StreamingLinearUpsample
+from vllm_omni.model_executor.models.ming_tts.fm.cfm import CFM, Solver, get_epss_timesteps
+from vllm_omni.model_executor.models.ming_tts.fm.dit import (
+    Aggregator,
+    CondEmbedder,
+    DiT,
+    SinusPositionEmbedding,
+    TimestepEmbedder,
+)
+from vllm_omni.model_executor.models.ming_tts.fm.flowloss import FlowLoss
+from vllm_omni.model_executor.models.ming_tts.fm.modules import Attention, DiTBlock, RMSNorm
+from vllm_omni.model_executor.models.ming_tts.ming_tts import (
+    _coerce_prompt_latents,
+    _find_audio_placeholder_positions,
+    _initial_history,
+)
+from vllm_omni.model_executor.models.ming_tts.ming_tts_audio_vae import _coerce_finished, _coerce_latent_chunk
+from vllm_omni.model_executor.models.ming_tts.ming_tts_llm import _coerce_latent_history
+from vllm_omni.model_executor.stage_input_processors.ming_tts import llm2audio_vae_async_chunk
+
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
+
+def _tiny_qwen_config(hidden_size=8):  # tiny backbone hyper-parameters scaled from hidden_size
+    return dict(
+        hidden_size=hidden_size,
+        intermediate_size=2 * hidden_size,
+        num_hidden_layers=1,
+        num_attention_heads=2,
+        num_key_value_heads=2,
+        vocab_size=32,
+        max_position_embeddings=64,
+    )
+
+
+def _tiny_audio_vae_config():  # small AudioVAEconfig with tiny encoder/decoder backbones, no semantic module
+    return AudioVAEconfig(
+        sample_rate=16000,
+        patch_size=2,
+        enc_kwargs={
+            "backbone": _tiny_qwen_config(),
+            "input_dim": 4,
+            "hop_size": 4,
+            "latent_dim": 2,
+        },
+        dec_kwargs={
+            "backbone": _tiny_qwen_config(),
+            "output_dim": 4,
+            "latent_dim": 2,  # same latent_dim as the encoder above
+        },
+        semantic_module_kwargs=None,
+    )
+
+
+class _DummyCFMModel(nn.Module):  # stand-in model handed to CFM; ignores t, c and latent_history
+    def __init__(self):
+        super().__init__()
+        self.anchor = nn.Parameter(torch.zeros(()))  # NOTE(review): presumably gives the module a parameter; confirm
+
+    def forward(self, x, t, c, latent_history, mask=None):
+        del t, c, latent_history
+        if mask is not None:
+            x = x.masked_fill(~mask.unsqueeze(-1), 0.0)  # zero every position where mask is False
+        return x
+
+    def forward_with_cfg(self, x, t, c, cfg_scale, latent_history, patch_size):
+        del t, c, cfg_scale, latent_history
+        cond = x[:, -patch_size:, :] + 1.0  # last patch_size steps, shifted so the two halves differ
+        uncond = x[:, -patch_size:, :]
+        return torch.cat([cond, uncond], dim=0)  # cond half first, uncond half second, along dim 0
+
+
+def test_rmsnorm_preserves_shape_and_dtype():  # RMSNorm output keeps the input's shape and dtype
+    norm = RMSNorm(dim=8, eps=1e-6)
+    x = torch.randn(2, 3, 8, dtype=torch.float32)
+
+    out = norm(x)
+
+    assert out.shape == x.shape
+    assert out.dtype == x.dtype
+
+
+def test_attention_forward_shape_and_mask():  # shape is preserved and the masked-out position is zero
+    attn = Attention(dim=8, heads=2, dim_head=4, dropout=0.0)
+    x = torch.randn(1, 5, 8)
+    mask = torch.tensor([[True, True, True, True, False]])  # only the last position is masked out
+
+    out = attn(x, mask=mask)
+
+    assert out.shape == x.shape
+    assert torch.allclose(out[:, -1], torch.zeros_like(out[:, -1]))
+
+
+def test_attention_rejects_bad_mask_shape():  # a mask of length 4 for a length-5 input must raise ValueError
+    attn = Attention(dim=8, heads=2, dim_head=4, dropout=0.0)
+    x = torch.randn(1, 5, 8)
+
+    with pytest.raises(ValueError, match="Mask shape mismatch"):
+        attn(x, mask=torch.ones(1, 4, dtype=torch.bool))
+
+
+def test_dit_block_forward_shape():  # DiTBlock keeps the input shape when called with rope=None
+    block = DiTBlock(hidden_size=8, num_heads=2, mlp_ratio=2.0, dropout=0.0)
+    x = torch.randn(1, 5, 8)
+    mask = torch.ones(1, 5, dtype=torch.bool)
+
+    out = block(x, mask, rope=None)
+
+    assert out.shape == x.shape
+
+
+def test_sinus_position_embedding_shape():  # two scalar timesteps embed to a (2, dim) tensor
+    embed = SinusPositionEmbedding(dim=8)
+    t = torch.tensor([0.0, 1.0], dtype=torch.float32)
+
+    out = embed(t)
+
+    assert out.shape == (2, 8)
+
+
+def test_timestep_embedder_distinguishes_steps():  # different timesteps must give different embeddings
+    embedder = TimestepEmbedder(dim=8, freq_embed_dim=8)
+
+    out_a = embedder(torch.tensor([0.0], dtype=torch.float32))
+    out_b = embedder(torch.tensor([1.0], dtype=torch.float32))
+
+    assert out_a.shape == (1, 8)
+    assert not torch.allclose(out_a, out_b)
+
+
+def test_cond_embedder_rejects_bad_rank():  # a rank-2 input must raise ValueError mentioning "rank-3"
+    embedder = CondEmbedder(input_feature_size=4, hidden_size=8, dropout_prob=0.0)
+
+    with pytest.raises(ValueError, match="rank-3"):
+        embedder(torch.randn(1, 4), train=False)
+
+
+def test_cond_drop_preserves_conditioning_dtype():  # cond_drop must return the dtype it was given (float16)
+    embedder = CondEmbedder(input_feature_size=4, hidden_size=8, dropout_prob=1.0)
+    llm_cond = torch.randn(1, 1, 4, dtype=torch.float16)
+
+    out = embedder.cond_drop(llm_cond)
+
+    assert out.dtype == llm_cond.dtype
+
+
+def test_dit_forward_shape():  # DiT forward output shape for a minimal single-layer model
+    model = DiT(
+        in_channels=2,
+        hidden_size=8,
+        depth=1,
+        num_heads=2,
+        mlp_ratio=2.0,
+        llm_cond_dim=4,
+        cfg_dropout_prob=0.0,
+    )
+    x = torch.randn(1, 2, 2)
+    latent_history = torch.randn(1, 4, 2)
+    c = torch.randn(1, 1, 4)
+    mask = torch.ones(1, 2, dtype=torch.bool)
+
+    out = model(x=x, t=torch.tensor([0.5]), c=c, latent_history=latent_history, mask=mask)
+
+    assert out.shape == (1, 7, 2)  # 7 = 2 (x) + 4 (latent_history) + 1 (c), presumably concatenated; confirm
+
+
+def test_dit_forward_with_cfg_preserves_conditioning_dtype(monkeypatch):  # c must reach forward() still in float16
+    model = DiT(
+        in_channels=2,
+        hidden_size=8,
+        depth=1,
+        num_heads=2,
+        mlp_ratio=2.0,
+        llm_cond_dim=4,
+        cfg_dropout_prob=0.0,
+    )
+    seen = {}
+
+    def _fake_forward(x, t, c, latent_history, mask=None):  # records the dtype of the conditioning it receives
+        del x, t, latent_history, mask
+        seen["dtype"] = c.dtype
+        return torch.zeros((c.shape[0], 7, 2), dtype=torch.float32)
+
+    monkeypatch.setattr(model, "forward", _fake_forward)
+    x = torch.randn(1, 2, 2, dtype=torch.float16)
+    latent_history = torch.randn(1, 4, 2, dtype=torch.float16)
+    c = torch.randn(1, 1, 4, dtype=torch.float16)
+
+    model.forward_with_cfg(
+        x=x,
+        t=torch.tensor([0.5], dtype=torch.float16),
+        c=c,
+        cfg_scale=2.0,
+        latent_history=latent_history,
+        patch_size=2,
+    )
+
+    assert seen["dtype"] == c.dtype
+
+
+def test_aggregator_forward_shape():  # a (2, 3, 2) input aggregates to (2, 1, llm_input_dim)
+    agg = Aggregator(
+        in_channels=2,
+        hidden_size=8,
+        depth=1,
+        num_heads=2,
+        mlp_ratio=2.0,
+        llm_input_dim=4,
+    )
+    x = torch.randn(2, 3, 2)
+    mask = torch.ones(2, 3, dtype=torch.bool)
+
+    out = agg(x, mask=mask)
+
+    assert out.shape == (2, 1, 4)
+
+
+def test_get_epss_timesteps_predefined_and_fallback():  # 10 steps: predefined schedule; 9 steps: linspace
+    predefined = get_epss_timesteps(10, device=torch.device("cpu"), dtype=torch.float32)
+    fallback = get_epss_timesteps(9, device=torch.device("cpu"), dtype=torch.float32)
+
+    assert predefined.shape == (11,)  # steps + 1 points
+    assert torch.allclose(predefined[-1], torch.tensor(1.0))
+    assert fallback.shape == (10,)  # steps + 1 points
+    assert torch.allclose(fallback, torch.linspace(0, 1, 10))
+
+
+def test_solver_integrate_zero_function_is_stable():  # a zero derivative must leave y0 unchanged at every step
+    y0 = torch.ones(1, 2, 2)
+    solver = Solver(lambda t, y: torch.zeros_like(y), y0=y0, sigma=0.0, temperature=0.0)
+    t = torch.linspace(0, 1, 4)
+
+    out = solver.integrate(t)
+
+    assert out.shape == (4, 1, 2, 2)  # one state per entry of t
+    assert torch.allclose(out[0], y0)
+    assert torch.allclose(out[-1], y0)
+
+
+def test_cfm_forward_returns_scalar_loss():  # CFM forward yields a finite 0-dim loss
+    torch.manual_seed(0)
+    cfm = CFM(model=_DummyCFMModel())
+    cond = torch.randn(1, 1, 4)
+    target = torch.randn(1, 2, 2)
+    latent_history = torch.randn(1, 4, 2)
+    mask = torch.ones(1, 2, dtype=torch.bool)
+
+    loss = cfm(cond=cond, target=target, latent_history=latent_history, mask=mask, patch_size=2)
+
+    assert loss.ndim == 0
+    assert torch.isfinite(loss)
+
+
+def test_cfm_sample_returns_sample_and_trajectory():  # sample() returns the final sample and the full trajectory
+    torch.manual_seed(0)
+    cfm = CFM(model=_DummyCFMModel())
+    noise = torch.randn(1, 2, 2)
+    cond = torch.randn(1, 1, 4)
+    latent_history = torch.randn(1, 4, 2)
+
+    out, trajectory = cfm.sample(noise=noise, c=cond, latent_history=latent_history, steps=4, patch_size=2)
+
+    assert out.shape == (1, 2, 2)
+    assert trajectory.shape == (5, 1, 2, 2)  # steps + 1 states for steps=4
+
+
+def test_cfm_sample_rejects_low_cfg_scale():  # runs sample() with cfg_scale=0.0 and checks the output shapes
+    cfm = CFM(model=_DummyCFMModel())
+    noise = torch.randn(1, 2, 2)
+    cond = torch.randn(1, 1, 4)
+    latent_history = torch.randn(1, 4, 2)
+
+    out, trajectory = cfm.sample(
+        noise=noise,
+        c=cond,
+        latent_history=latent_history,
+        cfg_scale=0.0,  # NOTE(review): name says "rejects", yet the asserts below expect normal output
+        patch_size=2,
+    )
+
+    assert out.shape == (1, 2, 2)
+    assert trajectory.ndim == 4
+
+
+def test_flowloss_sample_returns_tensor_shape_and_dtype(monkeypatch):  # output is (1, patch_size, z_channels)
+    flow = FlowLoss(
+        z_channels=2,
+        llm_cond_dim=4,
+        hidden_size=8,
+        depth=1,
+        num_heads=2,
+        mlp_ratio=2.0,
+        cfg_dropout_prob=0.0,
+    )
+
+    def _fake_sample(**kwargs):  # replaces cfm.sample; echoes the noise with dims 1 and 2 swapped
+        noise = kwargs["noise"]
+        return noise.transpose(1, 2), torch.zeros(1)
+
+    monkeypatch.setattr(flow.cfm, "sample", _fake_sample)
+    z = torch.randn(1, 1, 4, dtype=torch.float32)
+    latent_history = torch.randn(1, 4, 2, dtype=torch.float32)
+
+    out = flow.sample(z=z, latent_history=latent_history, patch_size=3)
+
+    assert out.shape == (1, 3, 2)
+    assert out.dtype == z.dtype
+
+
+def test_streaming_linear_upsample_rejects_empty_final_flush():
+    """A final flush with neither an input chunk nor a carried state must raise ``ValueError``."""
+    layer = StreamingLinearUpsample(scale_factor=2)
+    with pytest.raises(ValueError, match="end-of-stream"):
+        layer(None, state=None, is_last=True)
+
+
+def test_streaming_linear_upsample_streams_and_flushes():
+    """The first non-final chunk yields no output; the final chunk yields output and a ``None`` state."""
+    layer = StreamingLinearUpsample(scale_factor=2)
+    first_chunk = torch.randn(1, 2, 3)
+    last_chunk = torch.randn(1, 2, 3)
+
+    first_out, carried = layer(first_chunk, state=None, is_last=False)
+    last_out, carried = layer(last_chunk, state=carried, is_last=True)
+    assert first_out is None
+    assert last_out is not None
+    assert last_out.shape[0] == 1
+    assert last_out.shape[-1] == 3
+    assert carried is None
+
+
+def test_istft_rejects_bad_rank():
+    """A rank-2 input to ``ISTFT`` must raise a ``ValueError`` mentioning "rank-3"."""
+    transform = ISTFT(n_fft=16, hop_length=4, win_length=16, padding="same")
+    with pytest.raises(ValueError, match="rank-3"):
+        transform(torch.randn(1, 9))
+
+
+def test_istft_head_output_shape():
+    """``ISTFTHead`` should return audio with leading dims (1, 1), a (1, 18, 3) spectrum, and no buffers."""
+    head = ISTFTHead(dim=8, n_fft=16, hop_length=4, padding="same")
+    features = torch.randn(1, 3, 8)
+
+    audio, spec, audio_buffer, window_buffer = head(features)
+
+    assert audio.shape[:2] == (1, 1)
+    assert spec.shape == (1, 18, 3)
+    assert audio_buffer is None
+    assert window_buffer is None
+
+
+def test_audio_vae_encode_and_decode_shapes():
+    """Encode a 12-sample waveform with the tiny VAE, decode it again, and check the shapes."""
+    torch.manual_seed(0)
+    vae = AudioVAE(_tiny_audio_vae_config())
+    samples = torch.randn(1, 12)
+    lengths = torch.tensor([12], dtype=torch.int32)
+
+    latent, frame_num = vae.encode_latent(samples, lengths)
+    assert latent.ndim == 3
+    assert (latent.shape[0], latent.shape[-1]) == (1, 2)
+    assert frame_num.tolist() == [2]
+
+    # With ``use_cache=False`` the decoder is expected to return no stream state and no KV cache.
+    audio, stream_state, past_key_values = vae.decode(latent, use_cache=False)
+    assert audio.ndim == 3
+    assert audio.shape[:2] == (1, 1)
+    assert stream_state == (None, None, None)
+    assert past_key_values is None
+
+
+def test_audio_vae_rejects_invalid_inputs():
+    """A rank-1 waveform and a latent with the wrong last dimension must both raise ``ValueError``."""
+    vae = AudioVAE(_tiny_audio_vae_config())
+    flat_waveform = torch.randn(12)
+    with pytest.raises(ValueError, match="waveform rank-2"):
+        vae.encode_latent(flat_waveform, torch.tensor([12], dtype=torch.int32))
+    with pytest.raises(ValueError, match="Latent dim mismatch"):
+        vae.decode(torch.randn(1, 2, 3))
+
+
+def test_coerce_prompt_latents_supports_frames_and_patch_groups():
+    """Rank-2 frames and rank-3 patch groups should both yield ``frames`` and ``patches`` views."""
+    flat = torch.arange(8, dtype=torch.float32).reshape(4, 2)
+    from_flat = _coerce_prompt_latents(flat, patch_size=2, latent_dim=2)
+    assert from_flat["patches"].shape == (2, 2, 2)
+    assert from_flat["frames"].shape == (4, 2)
+
+    grouped = torch.arange(16, dtype=torch.float32).reshape(2, 2, 4)
+    from_grouped = _coerce_prompt_latents(grouped, patch_size=2, latent_dim=4)
+    assert from_grouped["patches"].shape == (2, 2, 4)
+    assert from_grouped["frames"].shape == (4, 4)
+
+
+def test_initial_history_keeps_tail():
+    """With more frames than ``history_size``, the history is the last ``history_size`` frames."""
+    frames = torch.arange(12, dtype=torch.float32).reshape(6, 2)
+    history_kwargs = {
+        "history_size": 4,
+        "latent_dim": 2,
+        "device": torch.device("cpu"),
+        "dtype": torch.float32,
+    }
+    tail = _initial_history(frames, **history_kwargs)
+
+    assert tail.shape == (4, 2)
+    assert torch.allclose(tail, frames[-4:])
+
+
+def test_find_audio_placeholder_positions_uses_audio_span():
+    """Only dummy tokens between the audio start/end markers should be reported as placeholders."""
+    dummy, start, end = 151705, 151706, 151707
+    cfg = SimpleNamespace(
+        audio_dummy_token_id=dummy,
+        audio_start_token_id=start,
+        audio_end_token_id=end,
+    )
+    # The dummy tokens at indices 0 and 6 sit outside the start/end span and are expected to be ignored.
+    input_ids = torch.tensor([dummy, 1, start, dummy, dummy, end, dummy], dtype=torch.long)
+    assert _find_audio_placeholder_positions(input_ids, cfg).tolist() == [3, 4]
+
+
+def test_helper_coercions_fail_loudly():
+    """Exercise the latent/finished coercion helpers on valid and invalid inputs.
+
+    Well-formed inputs must come back in the expected shapes, while malformed
+    history or chunk tensors must raise instead of being silently accepted.
+    """
+    cpu = torch.device("cpu")
+    cfg = SimpleNamespace(history_patch_size=4, latent_dim=2)
+    chunk_kwargs = {
+        "device": cpu,
+        "dtype": torch.float32,
+        "latent_dim": 2,
+        "patch_size": 4,
+    }
+
+    # A one-element boolean tensor must collapse to the Python singleton ``True``.
+    assert _coerce_finished(torch.tensor([1], dtype=torch.bool)) is True
+
+    # A rank-2 (4, 2) chunk gains a leading dimension of size 1.
+    single_patch = _coerce_latent_chunk(torch.ones(4, 2), **chunk_kwargs)
+    assert single_patch.shape == (1, 4, 2)
+
+    # A rank-3 chunk that already has a leading dimension keeps its shape.
+    patch_stack = _coerce_latent_chunk(torch.ones(2, 4, 2), **chunk_kwargs)
+    assert patch_stack.shape == (2, 4, 2)
+
+    # Three history rows where ``history_patch_size`` is 4.
+    with pytest.raises(RuntimeError, match="latent_history shape mismatch"):
+        _coerce_latent_history(
+            torch.ones(3, 2),
+            device=cpu,
+            dtype=torch.float32,
+            cfg=cfg,
+        )
+
+    # Three frames per patch where ``patch_size`` is 4.
+    with pytest.raises(ValueError, match="Latent patch size mismatch"):
+        _coerce_latent_chunk(torch.ones(1, 3, 2), **chunk_kwargs)
+
+    # Trailing dimension 3 where ``latent_dim`` is 2.
+    with pytest.raises(ValueError, match="Latent dim mismatch"):
+        _coerce_latent_chunk(torch.ones(4, 3), **chunk_kwargs)
+
+
+def test_ming_async_chunk_rejects_left_context_replay():
+    """Reject connectors that request latent left-context replay.
+
+    The connector config sets ``latent_left_context`` to 1, and the call must raise ``ValueError``.
+    """
+    connector = SimpleNamespace(config={"extra": {"latent_chunk_size": 10, "latent_left_context": 1}})
+    manager = SimpleNamespace(connector=connector, put_req_chunk={"req-1": 0}, request_payload={})
+    pending_request = SimpleNamespace(external_req_id="req-1", is_finished=lambda: False)
+    with pytest.raises(ValueError, match="latent_left_context replay"):
+        llm2audio_vae_async_chunk(
+            transfer_manager=manager,
+            pooling_output=None,
+            request=pending_request,
+            is_finished=False,
+        )
+
+
+def test_coerce_latent_history_casts_to_requested_dtype():
+    """A float16 history tensor should come back in the requested float32 dtype."""
+    cfg = SimpleNamespace(history_patch_size=4, latent_dim=2)
+    half_history = torch.ones(1, 4, 2, dtype=torch.float16)
+    coerced = _coerce_latent_history(
+        half_history,
+        device=torch.device("cpu"),
+        dtype=torch.float32,
+        cfg=cfg,
+    )
+    assert coerced.dtype == torch.float32
diff --git a/tests/model_executor/models/ming_tts/test_ming_tts_config_shim.py b/tests/model_executor/models/ming_tts/test_ming_tts_config_shim.py
new file mode 100644
index 00000000000..06cd4a8a787
--- /dev/null
+++ b/tests/model_executor/models/ming_tts/test_ming_tts_config_shim.py
@@ -0,0 +1,51 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from transformers import AutoConfig
+
+from vllm_omni.engine.arg_utils import _register_omni_hf_configs
+from vllm_omni.model_executor.models.ming_tts.configuration_ming_dense import MingDenseConfig
+
+
+def test_ming_dense_autoconfig_registration_uses_local_config(tmp_path):
+    """``AutoConfig`` should resolve a local ``model_type: dense`` checkpoint to ``MingDenseConfig``."""
+    _register_omni_hf_configs()
+    checkpoint_dir = tmp_path / "ming"
+    checkpoint_dir.mkdir()
+    config_json = """
+{
+  "model_type": "dense",
+  "auto_map": {"AutoConfig": "configuration_bailingmm.BailingMMConfig"},
+  "llm_config": {
+    "model_type": "qwen2",
+    "hidden_size": 896,
+    "intermediate_size": 4864,
+    "num_hidden_layers": 24,
+    "num_attention_heads": 14,
+    "num_key_value_heads": 2,
+    "vocab_size": 151936
+  },
+  "audio_tokenizer_config": {
+    "sample_rate": 44100,
+    "patch_size": 4,
+    "enc_kwargs": {
+      "latent_dim": 64,
+      "input_dim": 882,
+      "hop_size": 882,
+      "backbone": {"attn_implementation": "flash_attention_2"}
+    },
+    "dec_kwargs": {
+      "latent_dim": 64,
+      "output_dim": 882,
+      "backbone": {"_attn_implementation": "flash_attention_2"}
+    }
+  }
+}
+""".strip()
+    (checkpoint_dir / "config.json").write_text(config_json)
+
+    loaded = AutoConfig.from_pretrained(checkpoint_dir, trust_remote_code=False, local_files_only=True)
+
+    assert isinstance(loaded, MingDenseConfig)
+    assert loaded.get_text_config().num_attention_heads == 14
+    assert loaded.audio_tokenizer_config.sample_rate == 44100
+    assert loaded.audio_tokenizer_config.patch_size == 4
diff --git a/tests/model_executor/models/ming_tts/test_ming_tts_loaders.py b/tests/model_executor/models/ming_tts/test_ming_tts_loaders.py
new file mode 100644
index 00000000000..b7f95469bf4
--- /dev/null
+++ b/tests/model_executor/models/ming_tts/test_ming_tts_loaders.py
@@ -0,0 +1,524 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from types import SimpleNamespace
+
+import pytest
+import torch
+from vllm.v1.outputs import SamplerOutput
+
+from vllm_omni.model_executor.models.ming_tts.config_ming_tts import KEY_PROMPT_LATENTS, KEY_REQUEST_ID, MingTTSConfig
+from vllm_omni.model_executor.models.ming_tts.ming_tts import MingTTSForConditionalGeneration
+from vllm_omni.model_executor.models.ming_tts.ming_tts_audio_vae import MingAudioVAEModel
+from vllm_omni.model_executor.models.ming_tts.ming_tts_llm import MingLLMModel
+
+
+# Minimal stand-in backbone: one bias-free 2x2 layer, zero embeddings, and a record of the last forward kwargs.
+class _DummyBackbone(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.model = torch.nn.Module()
+        self.model.layers = torch.nn.ModuleList([torch.nn.Linear(2, 2, bias=False)])
+        self.last_forward_kwargs = None
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return torch.zeros(input_ids.shape[0], 2, dtype=torch.float32)
+
+    def forward(self, *_args, **kwargs):
+        self.last_forward_kwargs = {**kwargs}
+        return torch.zeros(1, 2, dtype=torch.float32)
+
+
+# Test double for ``Aggregator``: averages a patch over dim 1, then applies a bias-free linear projection.
+class _DummyAggregator(torch.nn.Module):
+    def __init__(self, in_channels: int, llm_input_dim: int, **_unused):
+        super().__init__()
+        self.proj_in = torch.nn.Linear(in_channels, llm_input_dim, bias=False)
+
+    def forward(self, patch: torch.Tensor) -> torch.Tensor:
+        return torch.unsqueeze(self.proj_in(patch.mean(dim=1)), 1)
+
+
+# Test double for ``FlowLoss``: holds one linear weight and always samples zeros of shape (1, 4, 64).
+class _DummyFlowLoss(torch.nn.Module):
+    def __init__(self, z_channels: int, llm_cond_dim: int, **_unused):
+        super().__init__()
+        # ``z_channels`` is accepted but intentionally unused.
+        self.dummy = torch.nn.Linear(llm_cond_dim, 64, bias=False)
+
+    def sample(self, **_unused):
+        return torch.zeros(1, 4, 64, dtype=torch.float32)
+
+
+class _DummyAudioVAE(torch.nn.Module):
+    """Test double for ``AudioVAE`` with fixed-shape outputs; records each ``last_chunk`` flag it sees."""
+
+    def __init__(self, config):
+        super().__init__()
+        # ``config`` is ignored; the two bias-free linear layers give the module encoder/decoder weights.
+        self.encoder = torch.nn.Linear(2, 2, bias=False)
+        self.decoder = torch.nn.Linear(2, 2, bias=False)
+        self.last_chunk_values = []
+
+    def encode_latent(self, waveform: torch.Tensor, waveform_length: torch.Tensor):
+        """Return zero latents of shape (batch, 8, 64) and ``None`` as the second element."""
+        return torch.zeros((int(waveform.shape[0]), 8, 64), dtype=torch.float32), None
+
+    def decode(
+        self,
+        latent_patch: torch.Tensor,
+        *,
+        past_key_values=None,
+        use_cache=True,
+        stream_state=None,
+        last_chunk=False,
+    ):
+        """Return all-ones audio holding 8 samples per entry along dim 1 of ``latent_patch``."""
+        self.last_chunk_values.append(last_chunk)
+        waveform = torch.ones((1, 1, 8 * int(latent_patch.shape[1])), dtype=torch.float32)
+        return waveform, (None, None, None), None
+
+
+def _make_audio_cfg():
+    """Build a small audio-tokenizer config namespace shared by the loader tests."""
+    encoder_kwargs = {
+        "backbone": {"hidden_size": 2},
+        "input_dim": 882,
+        "hop_size": 882,
+        "latent_dim": 64,
+    }
+    decoder_kwargs = {"backbone": {"hidden_size": 2}, "output_dim": 882, "latent_dim": 64}
+
+    return SimpleNamespace(
+        enc_kwargs=encoder_kwargs,
+        dec_kwargs=decoder_kwargs,
+        patch_size=4,
+        sample_rate=44100,
+        semantic_module_kwargs=None,
+    )
+
+
+def _make_config() -> MingTTSConfig:  # Shared fixture: a ``MingTTSConfig`` built on ``_make_audio_cfg()``.
+    cfg = MingTTSConfig(audio_tokenizer_config=_make_audio_cfg())
+    cfg.validate()  # Run the config's own validation before any test uses it.
+    return cfg
+
+
+def _make_vllm_config(model_stage: str):
+    """Return a CPU-only namespace standing in for the vLLM config, with an empty ``hf_config``."""
+    model_config = SimpleNamespace(hf_config=SimpleNamespace(), model_stage=model_stage)
+    device_config = SimpleNamespace(device=torch.device("cpu"))
+
+    return SimpleNamespace(model_config=model_config, quant_config=None, device_config=device_config)
+
+
+def test_ming_llm_load_weights_maps_and_loads_expected_prefixes(monkeypatch):
+    """Backbone, aggregator, flow-loss, and stop/speaker head tensors should all be reported as loaded."""
+    import vllm_omni.model_executor.models.ming_tts.config_ming_tts as cfg_mod
+    import vllm_omni.model_executor.models.ming_tts.ming_tts_llm as llm_mod
+
+    cfg = _make_config()
+    monkeypatch.setattr(cfg_mod.MingTTSConfig, "from_hf_config", classmethod(lambda cls, hf: cfg))
+    monkeypatch.setattr(llm_mod, "init_vllm_registered_model", lambda **kwargs: _DummyBackbone())
+    monkeypatch.setattr(llm_mod, "Aggregator", _DummyAggregator)
+    monkeypatch.setattr(llm_mod, "FlowLoss", _DummyFlowLoss)
+    model = MingLLMModel(vllm_config=_make_vllm_config("llm"))
+
+    checkpoint = {
+        "model.model.layers.0.weight": torch.full((2, 2), 1.0, dtype=torch.float32),
+        "linear_proj_audio.proj_in.weight": torch.full((896, 64), 2.0, dtype=torch.float32),
+        "flowloss.dummy.weight": torch.full((64, 896), 3.0, dtype=torch.float32),
+        "stop_head.weight": torch.full((2, 896), 4.0, dtype=torch.float32),
+        "stop_head.bias": torch.full((2,), 5.0, dtype=torch.float32),
+        "spk_head.weight": torch.full((896, 192), 6.0, dtype=torch.float32),
+        "spk_head.bias": torch.full((896,), 7.0, dtype=torch.float32),
+    }
+
+    loaded = model.load_weights(list(checkpoint.items()))
+
+    # Every ``*.weight`` entry of the checkpoint must be reported back by name.
+    assert all(name in loaded for name in checkpoint if not name.endswith(".bias"))
+
+    # Spot-check that the distinct fill values landed in the corresponding parameters.
+    assert torch.allclose(model.model.model.layers[0].weight, torch.full((2, 2), 1.0))
+    assert torch.allclose(model.linear_proj_audio.proj_in.weight, torch.full((896, 64), 2.0))
+    assert torch.allclose(model.flowloss.dummy.weight, torch.full((64, 896), 3.0))
+
+
+def test_ming_llm_load_weights_accepts_complete_checkpoint_and_forward_shape(monkeypatch):
+    """A complete checkpoint should load, and a one-token forward should return (1, 2) hidden states."""
+    import vllm_omni.model_executor.models.ming_tts.config_ming_tts as cfg_mod
+    import vllm_omni.model_executor.models.ming_tts.ming_tts_llm as llm_mod
+
+    cfg = _make_config()
+    monkeypatch.setattr(cfg_mod.MingTTSConfig, "from_hf_config", classmethod(lambda cls, hf: cfg))
+    monkeypatch.setattr(llm_mod, "init_vllm_registered_model", lambda **kwargs: _DummyBackbone())
+    monkeypatch.setattr(llm_mod, "Aggregator", _DummyAggregator)
+    monkeypatch.setattr(llm_mod, "FlowLoss", _DummyFlowLoss)
+    model = MingLLMModel(vllm_config=_make_vllm_config("llm"))
+
+    # Checkpoint tensor names and shapes; every tensor is filled with ones.
+    shapes = {
+        "model.layers.0.weight": (2, 2),
+        "linear_proj_audio.proj_in.weight": (896, 64),
+        "flowloss.dummy.weight": (64, 896),
+        "stop_head.weight": (2, 896),
+        "stop_head.bias": (2,),
+        "spk_head.weight": (896, 192),
+        "spk_head.bias": (896,),
+    }
+    model.load_weights([(name, torch.ones(shape, dtype=torch.float32)) for name, shape in shapes.items()])
+
+    token = torch.tensor([1], dtype=torch.long)
+    position = torch.tensor([0], dtype=torch.long)
+    output = model.forward(input_ids=token, positions=position)
+
+    assert output.text_hidden_states.shape == (1, 2)
+    assert output.multimodal_outputs is None
+
+
+def test_ming_llm_load_weights_fails_when_custom_heads_missing(monkeypatch):
+    """Loading must fail when the aggregator and flow-loss tensors are absent from the checkpoint."""
+    import vllm_omni.model_executor.models.ming_tts.config_ming_tts as cfg_mod
+    import vllm_omni.model_executor.models.ming_tts.ming_tts_llm as llm_mod
+
+    cfg = _make_config()
+    monkeypatch.setattr(cfg_mod.MingTTSConfig, "from_hf_config", classmethod(lambda cls, hf: cfg))
+    monkeypatch.setattr(llm_mod, "init_vllm_registered_model", lambda **kwargs: _DummyBackbone())
+    monkeypatch.setattr(llm_mod, "Aggregator", _DummyAggregator)
+    monkeypatch.setattr(llm_mod, "FlowLoss", _DummyFlowLoss)
+    model = MingLLMModel(vllm_config=_make_vllm_config("llm"))
+    # No ``linear_proj_audio.*`` or ``flowloss.*`` entries are provided below.
+    partial_checkpoint = [
+        ("model.layers.0.weight", torch.full((2, 2), 1.0, dtype=torch.float32)),
+        ("stop_head.weight", torch.full((2, 896), 4.0, dtype=torch.float32)),
+        ("stop_head.bias", torch.full((2,), 5.0, dtype=torch.float32)),
+        ("spk_head.weight", torch.full((896, 192), 6.0, dtype=torch.float32)),
+        ("spk_head.bias", torch.full((896,), 7.0, dtype=torch.float32)),
+    ]
+    with pytest.raises(RuntimeError, match="flowloss|linear_proj_audio"):
+        model.load_weights(partial_checkpoint)
+
+
+def test_ming_llm_load_weights_rejects_incomplete_checkpoint(monkeypatch):
+    """A checkpoint holding only the backbone layer and the stop head must be rejected."""
+    import vllm_omni.model_executor.models.ming_tts.config_ming_tts as cfg_mod
+    import vllm_omni.model_executor.models.ming_tts.ming_tts_llm as llm_mod
+
+    cfg = _make_config()
+    monkeypatch.setattr(cfg_mod.MingTTSConfig, "from_hf_config", classmethod(lambda cls, hf: cfg))
+    monkeypatch.setattr(llm_mod, "init_vllm_registered_model", lambda **kwargs: _DummyBackbone())
+    monkeypatch.setattr(llm_mod, "Aggregator", _DummyAggregator)
+    monkeypatch.setattr(llm_mod, "FlowLoss", _DummyFlowLoss)
+    model = MingLLMModel(vllm_config=_make_vllm_config("llm"))
+
+    # The aggregator, flow-loss, and speaker-head tensors are all left out.
+    incomplete = [
+        ("model.layers.0.weight", torch.ones((2, 2), dtype=torch.float32)),
+        ("stop_head.weight", torch.ones((2, 896), dtype=torch.float32)),
+        ("stop_head.bias", torch.ones((2,), dtype=torch.float32)),
+    ]
+    with pytest.raises(RuntimeError, match="flowloss|linear_proj_audio|stop_head|spk_head"):
+        model.load_weights(incomplete)
+
+
+def test_ming_audio_vae_load_weights_fails_when_audio_params_missing(monkeypatch):
+    """Loading only the encoder weight must fail with a "params not loaded" error."""
+    import vllm_omni.model_executor.models.ming_tts.config_ming_tts as cfg_mod
+    import vllm_omni.model_executor.models.ming_tts.ming_tts_audio_vae as vae_mod
+
+    cfg = _make_config()
+    monkeypatch.setattr(cfg_mod.MingTTSConfig, "from_hf_config", classmethod(lambda cls, hf: cfg))
+    monkeypatch.setattr(vae_mod, "AudioVAE", _DummyAudioVAE)
+    model = MingAudioVAEModel(vllm_config=_make_vllm_config("audio_vae"))
+
+    # The dummy VAE also owns a decoder weight, which is deliberately not supplied.
+    encoder_only = [
+        ("audio.encoder.weight", torch.full((2, 2), 1.0, dtype=torch.float32)),
+    ]
+    with pytest.raises(RuntimeError, match="params not loaded"):
+        model.load_weights(encoder_only)
+
+
+def test_ming_audio_vae_load_weights_accepts_complete_checkpoint_and_forward_shape(monkeypatch):
+    """A complete checkpoint should load, and a finished request should decode to a 32-sample waveform."""
+    import vllm_omni.model_executor.models.ming_tts.config_ming_tts as cfg_mod
+    import vllm_omni.model_executor.models.ming_tts.ming_tts_audio_vae as vae_mod
+
+    cfg = _make_config()
+    monkeypatch.setattr(cfg_mod.MingTTSConfig, "from_hf_config", classmethod(lambda cls, hf: cfg))
+    monkeypatch.setattr(vae_mod, "AudioVAE", _DummyAudioVAE)
+    model = MingAudioVAEModel(vllm_config=_make_vllm_config("audio_vae"))
+
+    complete_checkpoint = [
+        (f"audio.{part}.weight", torch.ones((2, 2), dtype=torch.float32)) for part in ("encoder", "decoder")
+    ]
+    model.load_weights(complete_checkpoint)
+
+    # One finished request carrying a single (1, 4, 64) latent patch tensor.
+    request_info = {
+        KEY_REQUEST_ID: "rid-audio",
+        "ming_latent_patches": torch.ones((1, 4, 64), dtype=torch.float32),
+        "stream_finished": torch.tensor(True, dtype=torch.bool),
+    }
+    output = model.forward(runtime_additional_information=[request_info])
+
+    waveform = output.multimodal_outputs["model_outputs"][0]
+    sample_rate = output.multimodal_outputs["sr"][0]
+
+    # The dummy VAE emits 8 samples per entry along dim 1, so 4 entries give 32 samples.
+    assert waveform.ndim == 1
+    assert waveform.dtype == torch.float32
+    assert waveform.shape == (32,)
+    assert int(sample_rate.item()) == 44100
+    # ``stream_finished=True`` is expected to be forwarded to the VAE as ``last_chunk=True``.
+    assert model.audio.last_chunk_values == [True]
+
+
+def test_ming_audio_vae_load_weights_rejects_incomplete_checkpoint(monkeypatch):
+    """An encoder-only checkpoint must be rejected by the audio VAE loader."""
+    import vllm_omni.model_executor.models.ming_tts.config_ming_tts as cfg_mod
+    import vllm_omni.model_executor.models.ming_tts.ming_tts_audio_vae as vae_mod
+
+    cfg = _make_config()
+    monkeypatch.setattr(cfg_mod.MingTTSConfig, "from_hf_config", classmethod(lambda cls, hf: cfg))
+    monkeypatch.setattr(vae_mod, "AudioVAE", _DummyAudioVAE)
+    model = MingAudioVAEModel(vllm_config=_make_vllm_config("audio_vae"))
+
+    # Either error wording is accepted by the ``match`` pattern below.
+    encoder_only = [
+        ("audio.encoder.weight", torch.ones((2, 2), dtype=torch.float32)),
+    ]
+    with pytest.raises(RuntimeError, match="params not loaded|no checkpoint weights"):
+        model.load_weights(encoder_only)
+
+
+def test_ming_audio_vae_load_weights_rejects_empty_input(monkeypatch):
+    """Passing no weights at all must raise a "no checkpoint weights" error."""
+    import vllm_omni.model_executor.models.ming_tts.config_ming_tts as cfg_mod
+    import vllm_omni.model_executor.models.ming_tts.ming_tts_audio_vae as vae_mod
+
+    cfg = _make_config()
+    monkeypatch.setattr(cfg_mod.MingTTSConfig, "from_hf_config", classmethod(lambda cls, hf: cfg))
+    monkeypatch.setattr(vae_mod, "AudioVAE", _DummyAudioVAE)
+    model = MingAudioVAEModel(vllm_config=_make_vllm_config("audio_vae"))
+    empty_checkpoint = []
+    with pytest.raises(RuntimeError, match="no checkpoint weights"):
+        model.load_weights(empty_checkpoint)
+
+
+def test_ming_llm_forward_drops_runner_only_kwargs(monkeypatch):
+    """Runner-only keyword arguments passed to ``forward`` must not be forwarded to the backbone."""
+    import vllm_omni.model_executor.models.ming_tts.config_ming_tts as cfg_mod
+    import vllm_omni.model_executor.models.ming_tts.ming_tts_llm as llm_mod
+
+    cfg = _make_config()
+    backbone = _DummyBackbone()
+    monkeypatch.setattr(cfg_mod.MingTTSConfig, "from_hf_config", classmethod(lambda cls, hf: cfg))
+    monkeypatch.setattr(llm_mod, "init_vllm_registered_model", lambda **kwargs: backbone)
+    monkeypatch.setattr(llm_mod, "Aggregator", _DummyAggregator)
+    monkeypatch.setattr(llm_mod, "FlowLoss", _DummyFlowLoss)
+    model = MingLLMModel(vllm_config=_make_vllm_config("llm"))
+    # Extra keyword arguments that the backbone is expected never to receive.
+    runner_only = {
+        "sampling_metadata": object(),
+        "logits_index": 0,
+        "sampler": object(),
+        "additional_information": {"text": "hello"},
+    }
+    output = model.forward(
+        input_ids=torch.tensor([1], dtype=torch.long),
+        positions=torch.tensor([0], dtype=torch.long),
+        **runner_only,
+    )
+
+    seen = backbone.last_forward_kwargs
+    assert set(seen) == {"input_ids", "positions", "intermediate_tensors", "inputs_embeds"}
+    assert torch.equal(seen["input_ids"], torch.tensor([1], dtype=torch.long))
+    assert torch.equal(seen["positions"], torch.tensor([0], dtype=torch.long))
+    assert seen["intermediate_tensors"] is None
+    assert torch.allclose(seen["inputs_embeds"], torch.zeros((1, 2), dtype=torch.float32))
+    assert output.text_hidden_states.shape == (1, 2)
+    assert output.multimodal_outputs is None
+
+
+def test_ming_llm_forward_normalizes_runtime_additional_information(monkeypatch):
+    """``runtime_additional_information`` must be consumed by the wrapper, not passed to the backbone."""
+    import vllm_omni.model_executor.models.ming_tts.config_ming_tts as cfg_mod
+    import vllm_omni.model_executor.models.ming_tts.ming_tts_llm as llm_mod
+
+    cfg = _make_config()
+    backbone = _DummyBackbone()
+    monkeypatch.setattr(cfg_mod.MingTTSConfig, "from_hf_config", classmethod(lambda cls, hf: cfg))
+    monkeypatch.setattr(llm_mod, "init_vllm_registered_model", lambda **kwargs: backbone)
+    monkeypatch.setattr(llm_mod, "Aggregator", _DummyAggregator)
+    monkeypatch.setattr(llm_mod, "FlowLoss", _DummyFlowLoss)
+    model = MingLLMModel(vllm_config=_make_vllm_config("llm"))
+
+    output = model.forward(
+        input_ids=torch.tensor([1], dtype=torch.long),
+        positions=torch.tensor([0], dtype=torch.long),
+        runtime_additional_information=[{"decode_step": 0}],
+    )
+
+    # The backbone must only ever see its own four keyword arguments.
+    seen = backbone.last_forward_kwargs
+    assert set(seen) == {"input_ids", "positions", "intermediate_tensors", "inputs_embeds"}
+    assert torch.equal(seen["input_ids"], torch.tensor([1], dtype=torch.long))
+    assert torch.equal(seen["positions"], torch.tensor([0], dtype=torch.long))
+    assert seen["intermediate_tensors"] is None
+    assert torch.allclose(seen["inputs_embeds"], torch.zeros((1, 2), dtype=torch.float32))
+
+    # The stage output carries hidden states and no multimodal payload.
+    assert output.text_hidden_states.shape == (1, 2)
+    assert output.multimodal_outputs is None
+
+
+def test_ming_stage0_sampler_uses_model_sample(monkeypatch):
+    """``model.sampler`` should return whatever the wrapped stage-0 model's ``sample`` produces."""
+    import vllm_omni.model_executor.models.ming_tts.config_ming_tts as cfg_mod
+    import vllm_omni.model_executor.models.ming_tts.ming_tts as ming_mod
+
+    class _DummyStage0(torch.nn.Module):
+        def sample(self, logits, sampling_metadata):
+            # Ignore both inputs and always emit the same single token id.
+            token_ids = torch.tensor([[151705]], dtype=torch.int32)
+            return SamplerOutput(sampled_token_ids=token_ids, logprobs_tensors=None)
+
+    cfg = _make_config()
+    monkeypatch.setattr(cfg_mod.MingTTSConfig, "from_hf_config", classmethod(lambda cls, hf: cfg))
+    monkeypatch.setattr(ming_mod, "init_vllm_registered_model", lambda **kwargs: _DummyStage0())
+
+    model = MingTTSForConditionalGeneration(vllm_config=_make_vllm_config("llm"))
+
+    logits = torch.zeros((1, cfg.llm_vocab_size), dtype=torch.float32)
+    sampling_metadata = SimpleNamespace(seq_groups=[])
+    sampler_output = model.sampler(logits, sampling_metadata)
+
+    # The stub's fixed token id must come back unchanged, still as int32.
+    assert isinstance(sampler_output, SamplerOutput)
+    assert sampler_output.sampled_token_ids.dtype == torch.int32
+    assert sampler_output.sampled_token_ids.tolist() == [[151705]]
+
+
+def test_ming_stage0_load_weights_does_not_load_audio_weights(monkeypatch):
+    """The stage-0 wrapper must not forward ``audio.*`` checkpoint tensors to the wrapped model."""
+    import vllm_omni.model_executor.models.ming_tts.config_ming_tts as cfg_mod
+    import vllm_omni.model_executor.models.ming_tts.ming_tts as ming_mod
+
+    class _DummyStage0(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.loaded = None
+
+        def load_weights(self, weights):
+            # Keep exactly what the wrapper forwards so the test can inspect it afterwards.
+            self.loaded = [*weights]
+            return {name for name, _tensor in self.loaded}
+
+    cfg = _make_config()
+    stage0 = _DummyStage0()
+    monkeypatch.setattr(cfg_mod.MingTTSConfig, "from_hf_config", classmethod(lambda cls, hf: cfg))
+    monkeypatch.setattr(ming_mod, "init_vllm_registered_model", lambda **kwargs: stage0)
+    model = MingTTSForConditionalGeneration(vllm_config=_make_vllm_config("llm"))
+    checkpoint_shapes = {
+        "model.layers.0.weight": (2, 2),
+        "linear_proj_audio.proj_in.weight": (896, 64),
+        "flowloss.dummy.weight": (64, 896),
+        "stop_head.weight": (2, 896),
+        "spk_head.weight": (896, 192),
+        "audio.encoder.weight": (2, 2),
+    }
+    loaded = model.load_weights([(n, torch.ones(s, dtype=torch.float32)) for n, s in checkpoint_shapes.items()])
+
+    assert "model.audio.encoder.weight" not in loaded
+    assert all(not name.startswith("audio.") for name, _ in stage0.loaded)
+    assert not hasattr(model, "_prompt_audio_encoder")
+
+
+def test_ming_resolve_prompt_latents_accepts_raw_waveform(monkeypatch):
+    """Prompt latents can be supplied directly or derived from a raw prompt waveform."""
+    import vllm_omni.model_executor.models.ming_tts.config_ming_tts as cfg_mod
+    import vllm_omni.model_executor.models.ming_tts.ming_tts as ming_mod
+
+    class _DummyStage0(torch.nn.Module):
+        def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+            return torch.zeros(input_ids.shape[0], 2, dtype=torch.float32)
+
+    cfg = _make_config()
+    monkeypatch.setattr(cfg_mod.MingTTSConfig, "from_hf_config", classmethod(lambda cls, hf: cfg))
+    monkeypatch.setattr(ming_mod, "init_vllm_registered_model", lambda **kwargs: _DummyStage0())
+    model = MingTTSForConditionalGeneration(vllm_config=_make_vllm_config("llm"))
+
+    # Path 1: latents supplied directly come back unchanged as the ``frames`` view.
+    direct = torch.ones((8, 64), dtype=torch.float32)
+    from_latents = model._resolve_prompt_latents({KEY_PROMPT_LATENTS: direct})
+    assert from_latents is not None
+    assert torch.equal(from_latents["frames"], direct)
+
+    # Path 2: the waveform encoder is stubbed to return 8 frames, expected as two 4-frame patches.
+    stub_latents = torch.ones((8, 64), dtype=torch.float32)
+    model._encode_prompt_waveform_to_latents = lambda waveform, waveform_length=None: stub_latents
+    waveform_request = {
+        "prompt_waveform": torch.ones((1, 1000), dtype=torch.float32),
+        "prompt_waveform_length": torch.tensor([1000], dtype=torch.int32),
+        "prompt_text": "Reference words.",
+    }
+    from_waveform = model._resolve_prompt_latents(waveform_request)
+    assert from_waveform is not None
+    assert from_waveform["patches"].shape == (2, 4, 64)
+
+
+def test_ming_resolve_prompt_latents_rejects_dual_truth_waveform_and_latents(monkeypatch):
+    """Supplying both prompt latents and a prompt waveform in one request must raise ``ValueError``."""
+    import vllm_omni.model_executor.models.ming_tts.config_ming_tts as cfg_mod
+    import vllm_omni.model_executor.models.ming_tts.ming_tts as ming_mod
+
+    class _DummyStage0(torch.nn.Module):
+        def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+            return torch.zeros(input_ids.shape[0], 2, dtype=torch.float32)
+
+    cfg = _make_config()
+    monkeypatch.setattr(cfg_mod.MingTTSConfig, "from_hf_config", classmethod(lambda cls, hf: cfg))
+    monkeypatch.setattr(ming_mod, "init_vllm_registered_model", lambda **kwargs: _DummyStage0())
+    model = MingTTSForConditionalGeneration(vllm_config=_make_vllm_config("llm"))
+
+    # Both sources of prompt audio are present at once, which is the ambiguous case under test.
+    conflicting_request = {
+        KEY_PROMPT_LATENTS: torch.ones((8, 64), dtype=torch.float32),
+        "prompt_waveform": torch.ones((1, 1000), dtype=torch.float32),
+        "prompt_waveform_length": torch.tensor([1000], dtype=torch.int32),
+        "prompt_text": "Reference words.",
+    }
+    with pytest.raises(ValueError, match="Choose exactly one source of truth"):
+        model._resolve_prompt_latents(conflicting_request)
+
+
+def test_ming_prefill_overwrites_speaker_slot_embedding(monkeypatch):
+    """Prefill preprocessing should replace one prompt embedding with the projected speaker embedding."""
+    import vllm_omni.model_executor.models.ming_tts.config_ming_tts as cfg_mod
+    import vllm_omni.model_executor.models.ming_tts.ming_tts as ming_mod
+
+    class _DummyStage0(torch.nn.Module):
+        def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+            num_tokens = int(input_ids.shape[0])
+            return torch.arange(num_tokens * 2, dtype=torch.float32).reshape(num_tokens, 2)
+
+        def project_speaker_embedding(self, spk_emb: torch.Tensor) -> torch.Tensor:
+            # Fixed, easily recognisable projection regardless of the input embedding.
+            return torch.tensor([[101.0, 202.0]], dtype=torch.float32)
+
+    cfg = _make_config()
+    monkeypatch.setattr(cfg_mod.MingTTSConfig, "from_hf_config", classmethod(lambda cls, hf: cfg))
+    monkeypatch.setattr(ming_mod, "init_vllm_registered_model", lambda **kwargs: _DummyStage0())
+
+    vllm_config = _make_vllm_config("llm")
+    vllm_config.model_config.hf_config = SimpleNamespace(vision_start_token_id=10)
+    model = MingTTSForConditionalGeneration(vllm_config=vllm_config)
+
+    # Token 10 is ``vision_start_token_id``; the projection is expected at index 2, right after it.
+    input_ids = torch.tensor([1, 10, 20, 2], dtype=torch.long)
+    input_embeds = model.model.embed_input_ids(input_ids)
+    speaker = torch.ones((192,), dtype=torch.float32)
+    _, updated_embeds, _ = model._prefill_preprocess(input_ids, input_embeds, speaker_embedding=speaker)
+
+    assert torch.allclose(updated_embeds[2], torch.tensor([101.0, 202.0], dtype=torch.float32))
diff --git a/tests/model_executor/models/ming_tts/test_ming_tts_prompt_builder.py b/tests/model_executor/models/ming_tts/test_ming_tts_prompt_builder.py
new file mode 100644
index 00000000000..4381b913021
--- /dev/null
+++ b/tests/model_executor/models/ming_tts/test_ming_tts_prompt_builder.py
@@ -0,0 +1,375 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from types import SimpleNamespace
+
+import pytest
+import torch
+
+from vllm_omni.model_executor.models.ming_tts.config_ming_tts import (
+    AUDIO_FRAME_HOP,
+    KEY_CFG,
+    KEY_MAX_DECODE_STEPS,
+    KEY_MIN_DECODE_STEPS,
+    KEY_PROMPT_LATENTS,
+    KEY_SPEAKER_EMBEDDING,
+    PATCH_SIZE,
+    SAMPLE_RATE,
+)
+from vllm_omni.model_executor.models.ming_tts.ingress import MingIngressProcessor
+from vllm_omni.model_executor.models.ming_tts.prompt_builder import (
+    build_dense_prompt_token_ids,
+    build_ming_dense_prompt,
+    count_prompt_waveform_patches,
+    pad_prompt_waveform,
+)
+
+
+class _DummyTokenizer:
+    """Toy tokenizer: each distinct string is one token whose id is assigned on first use."""
+
+    def __init__(self):
+        self._token_to_id = {"<audioPatch>": 9001, "<|vision_start|>": 9002}
+        self._id_to_token = {token_id: token for token, token_id in self._token_to_id.items()}
+        self._next = 100
+
+    def encode(self, text):
+        """Return a one-element id list; the whole string counts as a single token."""
+        return [self.convert_tokens_to_ids(text)]
+
+    def convert_tokens_to_ids(self, token):
+        """Return the id of ``token``, registering the next free id if it is new."""
+        if token not in self._token_to_id:
+            self._token_to_id[token] = self._next
+            self._id_to_token[self._next] = token
+            self._next += 1
+        return self._token_to_id[token]
+
+    def decode(self, token_ids):
+        return "".join(self._id_to_token[int(token_id)] for token_id in token_ids)
+
+
+def _make_dummy_ingress_processor(tokenizer):
+    processor = MingIngressProcessor.__new__(MingIngressProcessor)  # bypass __init__; fields set by hand
+    processor.tokenizer = tokenizer
+    processor.profile_ingress = False
+    processor.ming_config = SimpleNamespace(patch_size=4, latent_dim=64, vae_patch_size=4, audio_frame_hop=882)
+    return processor
+
+
+def test_build_dense_prompt_token_ids_matches_ming_dense_layout():
+    """Ids must hold one <audioPatch> per prompt patch and one <|vision_start|> per speaker."""
+    tokenizer = _DummyTokenizer()
+    prompt_ids = build_dense_prompt_token_ids(
+        tokenizer,
+        prompt="Prompt text.",
+        text="Target text.",
+        instruction="instruction-json",
+        prompt_text="reference transcript",
+        speaker_count=2,
+        prompt_patch_count=3,
+    )
+
+    assert prompt_ids.count(tokenizer.convert_tokens_to_ids("<audioPatch>")) == 3
+    assert prompt_ids.count(tokenizer.convert_tokens_to_ids("<|vision_start|>")) == 2
+    assert tokenizer.encode("instruction-json")[0] in prompt_ids
+    assert tokenizer.encode("reference transcript")[0] in prompt_ids
+
+
+def test_build_ming_dense_prompt_pads_prompt_waveform_and_zero_speaker():
+    """A 1000-sample waveform is stored padded to (1, 14112); use_zero_spk_emb yields 192 values."""
+    tokenizer = _DummyTokenizer()
+    waveform = torch.ones((1, 1000), dtype=torch.float32)
+    prompt = build_ming_dense_prompt(
+        tokenizer,
+        prompt="Please imitate the reference speech.",
+        text="Hello world.",
+        prompt_text="Reference words.",
+        prompt_waveform=waveform,
+        use_zero_spk_emb=True,
+    )
+
+    info = prompt["additional_information"]
+    padded_waveform = info["prompt_waveform"]
+
+    assert padded_waveform.shape == (1, 14112)
+    assert int(info[KEY_SPEAKER_EMBEDDING].numel()) == 192
+    expected_patch_count = count_prompt_waveform_patches(waveform)
+    assert prompt["prompt_token_ids"].count(tokenizer.convert_tokens_to_ids("<audioPatch>")) == expected_patch_count
+
+
+def test_build_ming_dense_prompt_uses_patch_count_not_frame_count_for_zero_shot_waveform():
+    """<audioPatch> count for a long waveform follows count_prompt_waveform_patches; no latents stored."""
+    tokenizer = _DummyTokenizer()
+    waveform = torch.ones((1, 211680), dtype=torch.float32)
+    prompt = build_ming_dense_prompt(
+        tokenizer,
+        prompt="Please generate speech based on the following description.\n",
+        text="Target text.",
+        prompt_text="Reference words.",
+        prompt_waveform=waveform,
+        speaker_embedding=torch.ones((192,), dtype=torch.float32),
+    )
+
+    expected_patch_count = count_prompt_waveform_patches(waveform)
+    assert prompt["additional_information"].get(KEY_PROMPT_LATENTS) is None
+    assert prompt["prompt_token_ids"].count(tokenizer.convert_tokens_to_ids("<audioPatch>")) == expected_patch_count
+
+
+def test_build_ming_dense_prompt_accepts_flat_speaker_embedding_list():
+    """A flat 192-float Python list is accepted and stored as a (192,) speaker embedding."""
+    tokenizer = _DummyTokenizer()
+    speaker_embedding = [0.1] * 192
+    prompt = build_ming_dense_prompt(
+        tokenizer,
+        prompt="Please imitate the reference speech.",
+        text="Hello world.",
+        speaker_embedding=speaker_embedding,
+    )
+
+    info = prompt["additional_information"]
+    assert tuple(info[KEY_SPEAKER_EMBEDDING].shape) == (192,)
+    assert prompt["prompt_token_ids"].count(tokenizer.convert_tokens_to_ids("<|vision_start|>")) == 1
+
+
+def test_build_ming_dense_prompt_uses_prompt_latents_to_set_patch_count():
+    """Explicit prompt_latents pass through unchanged; their leading dim (15) sets the patch count."""
+    tokenizer = _DummyTokenizer()
+    prompt_latents = torch.ones((15, 4, 64), dtype=torch.float32)
+    prompt = build_ming_dense_prompt(
+        tokenizer,
+        prompt="Please generate speech based on the following description.\n",
+        text="Target text.",
+        prompt_text="Reference words.",
+        prompt_latents=prompt_latents,
+        speaker_embedding=torch.ones((192,), dtype=torch.float32),
+    )
+
+    assert torch.equal(prompt["additional_information"][KEY_PROMPT_LATENTS], prompt_latents)
+    assert prompt["prompt_token_ids"].count(tokenizer.convert_tokens_to_ids("<audioPatch>")) == 15
+
+
+def test_build_ming_dense_prompt_allows_raw_waveform_shell_without_explicit_prompt_latents():
+    """A raw waveform alone suffices: patch tokens are emitted and KEY_PROMPT_LATENTS stays unset."""
+    tokenizer = _DummyTokenizer()
+    waveform = torch.ones((1, 1000), dtype=torch.float32)
+    prompt = build_ming_dense_prompt(
+        tokenizer,
+        prompt="Please imitate the reference speech.",
+        text="Hello world.",
+        prompt_text="Reference words.",
+        prompt_waveform=waveform,
+        speaker_embedding=torch.ones((192,), dtype=torch.float32),
+    )
+
+    expected_patch_count = count_prompt_waveform_patches(waveform)
+    assert prompt["additional_information"].get(KEY_PROMPT_LATENTS) is None
+    assert prompt["prompt_token_ids"].count(tokenizer.convert_tokens_to_ids("<audioPatch>")) == expected_patch_count
+
+
+def test_build_ming_dense_prompt_rejects_dual_truth_waveform_and_prompt_latents():
+    """Supplying both prompt_waveform and prompt_latents must raise ValueError."""
+    tokenizer = _DummyTokenizer()
+    waveform = torch.ones((1, 1000), dtype=torch.float32)
+    prompt_latents = torch.ones((4, 64), dtype=torch.float32)
+    with pytest.raises(ValueError, match="Choose exactly one source of truth"):
+        build_ming_dense_prompt(
+            tokenizer,
+            prompt="Please imitate the reference speech.",
+            text="Hello world.",
+            prompt_text="Reference words.",
+            prompt_waveform=waveform,
+            prompt_latents=prompt_latents,
+        )
+
+
+def test_ming_ingress_processor_preserves_raw_waveform_for_stage0_encoding():
+    """The ingress processor must keep the raw waveform (same object) and add no prompt latents."""
+    tokenizer = _DummyTokenizer()
+    waveform = torch.ones((1, 1000), dtype=torch.float32)
+    prompt_text = "Reference words."
+    prompt = build_ming_dense_prompt(
+        tokenizer,
+        prompt="Please imitate the reference speech.",
+        text="Hello world.",
+        prompt_text=prompt_text,
+        prompt_waveform=waveform,
+        speaker_embedding=torch.ones((192,), dtype=torch.float32),
+    )
+    prompt["prompt"] = "Please imitate the reference speech."
+    prompt["text"] = "Hello world."
+    prompt["prompt_text"] = prompt_text
+    prompt["prompt_waveform"] = waveform
+    prompt["prompt_waveform_length"] = torch.tensor([1000], dtype=torch.int32)
+    processor = _make_dummy_ingress_processor(tokenizer)
+    finalized = processor(prompt)
+
+    assert finalized["prompt_waveform"] is waveform
+    assert torch.equal(finalized["prompt_waveform_length"], torch.tensor([1000], dtype=torch.int32))
+    assert finalized["additional_information"]["prompt_waveform"] is prompt["additional_information"]["prompt_waveform"]
+    assert torch.equal(
+        finalized["additional_information"]["prompt_waveform_length"],
+        prompt["additional_information"]["prompt_waveform_length"],
+    )
+    assert KEY_PROMPT_LATENTS not in finalized["additional_information"]
+    expected_patch_count = count_prompt_waveform_patches(waveform)
+    assert finalized["prompt_token_ids"].count(tokenizer.convert_tokens_to_ids("<audioPatch>")) == expected_patch_count
+
+
+def test_build_ming_dense_prompt_rejects_prompt_waveform_without_prompt_text():
+    """A prompt_waveform given without prompt_text must raise ValueError from the builder."""
+    tokenizer = _DummyTokenizer()
+    waveform = torch.ones((1, 1000), dtype=torch.float32)
+    with pytest.raises(ValueError, match="prompt_waveform requires prompt_text"):
+        build_ming_dense_prompt(
+            tokenizer,
+            prompt="Please generate speech based on the following description.\n",
+            text="我竟然抢到了陈奕迅的演唱会门票！",
+            instruction={"情感": "高兴"},
+            prompt_waveform=waveform,
+        )
+
+
+def test_ming_ingress_processor_rejects_raw_prompt_waveform_without_prompt_text():
+    """The ingress processor raises RuntimeError (not ValueError) for a waveform lacking prompt_text."""
+    tokenizer = _DummyTokenizer()
+    waveform = torch.ones((1, 1000), dtype=torch.float32)
+    prompt = {
+        "prompt": "Please generate speech based on the following description.\n",
+        "text": "我竟然抢到了陈奕迅的演唱会门票！",
+        "prompt_token_ids": [1, 2, 3],
+        "additional_information": {
+            "prompt_waveform": waveform,
+            "prompt_waveform_length": torch.tensor([1000], dtype=torch.int32),
+        },
+    }
+    processor = _make_dummy_ingress_processor(tokenizer)
+
+    with pytest.raises(RuntimeError, match="prompt_waveform requires prompt_text"):
+        processor(prompt)
+
+
+def test_ming_ingress_processor_rebuilds_podcast_prompt_with_prompt_text_before_target_text():
+    """Two-speaker prompt: the reference transcript must precede the target text after ingress."""
+    tokenizer = _DummyTokenizer()
+    prompt_prefix = "Please generate speech based on the following description.\n"
+    prompt_text = " speaker_1:reference one\n speaker_2:reference two\n"
+    target_text = " speaker_1:target one\n speaker_2:target two\n"
+    speaker_embeddings = torch.ones((2, 192), dtype=torch.float32)
+    prompt_waveform = [
+        torch.ones((1, 1000), dtype=torch.float32),
+        torch.ones((1, 2000), dtype=torch.float32),
+    ]
+    prompt = build_ming_dense_prompt(
+        tokenizer,
+        prompt=prompt_prefix,
+        text=target_text,
+        prompt_text=prompt_text,
+        prompt_waveform=prompt_waveform,
+        speaker_embedding=speaker_embeddings,
+    )
+
+    processor = _make_dummy_ingress_processor(tokenizer)
+    finalized = processor(prompt)
+    decoded = tokenizer.decode(finalized["prompt_token_ids"])
+    expected_patch_count = count_prompt_waveform_patches(prompt_waveform)
+
+    assert decoded.index(prompt_text) < decoded.index(target_text)
+    assert finalized["prompt_token_ids"].count(tokenizer.convert_tokens_to_ids("<|vision_start|>")) == 2
+    assert finalized["prompt_token_ids"].count(tokenizer.convert_tokens_to_ids("<audioPatch>")) == expected_patch_count
+    assert "prompt_waveform" in finalized["additional_information"]
+    assert KEY_PROMPT_LATENTS not in finalized["additional_information"]
+
+
+def test_build_ming_dense_prompt_keeps_single_speaker_initial_payload_compatible():
+    """Single-speaker builder output must equal build_dense_prompt_token_ids with speaker_count=1."""
+    tokenizer = _DummyTokenizer()
+    prompt_prefix = "Please imitate the reference speech."
+    target_text = "Hello world."
+    prompt_text = "Reference words."
+    waveform = torch.ones((1, 1000), dtype=torch.float32)
+    prompt = build_ming_dense_prompt(
+        tokenizer,
+        prompt=prompt_prefix,
+        text=target_text,
+        prompt_text=prompt_text,
+        prompt_waveform=waveform,
+        speaker_embedding=torch.ones((192,), dtype=torch.float32),
+    )
+    expected_patch_count = count_prompt_waveform_patches(waveform)
+    expected_prompt_token_ids = build_dense_prompt_token_ids(
+        tokenizer,
+        prompt=prompt_prefix,
+        text=target_text,
+        prompt_text=prompt_text,
+        speaker_count=1,
+        prompt_patch_count=expected_patch_count,
+    )
+
+    assert prompt["prompt"] == prompt_prefix
+    assert prompt["text"] == target_text
+    assert prompt["prompt_token_ids"] == expected_prompt_token_ids
+    assert prompt["prompt_token_ids"].count(tokenizer.convert_tokens_to_ids("<audioPatch>")) == expected_patch_count
+    assert prompt["additional_information"]["prompt_text"] == prompt_text
+
+
+def test_pad_prompt_waveform_matches_upstream_ming_alignment():
+    padded = pad_prompt_waveform(torch.ones((1, 3529), dtype=torch.float32))
+    assert int(padded.shape[-1]) == 14112  # pinned padded length for a 3529-sample input
+    assert int(padded.shape[-1]) % int((float(SAMPLE_RATE) / 12.5) * int(PATCH_SIZE)) == 0
+    assert int(padded.shape[-1]) % int(AUDIO_FRAME_HOP * PATCH_SIZE) == 0
+
+
+def test_build_ming_dense_prompt_injects_duration_window_when_missing():
+    """A 'Duration: 30s.' hint in the text injects min/max decode steps (91/97) when none are given."""
+    tokenizer = _DummyTokenizer()
+    prompt = build_ming_dense_prompt(
+        tokenizer,
+        prompt="Please generate music based on the following description.\n",
+        text=" Genre: electronic. Mood: confident. Instrument: drums. Theme: festival. Duration: 30s.",
+        runtime_controls={KEY_CFG: 2.0},
+    )
+
+    info = prompt["additional_information"]
+    assert float(info[KEY_CFG].item()) == 2.0
+    assert int(info[KEY_MIN_DECODE_STEPS].item()) == 91
+    assert int(info[KEY_MAX_DECODE_STEPS].item()) == 97
+
+
+def test_build_ming_dense_prompt_preserves_explicit_decode_window_overrides():
+    """Caller-supplied min/max decode steps (11/13) are kept even though the text has a Duration hint."""
+    tokenizer = _DummyTokenizer()
+    prompt = build_ming_dense_prompt(
+        tokenizer,
+        prompt="Please generate music based on the following description.\n",
+        text=" Genre: electronic. Mood: confident. Instrument: drums. Theme: festival. Duration: 30s.",
+        runtime_controls={
+            KEY_MIN_DECODE_STEPS: 11,
+            KEY_MAX_DECODE_STEPS: 13,
+        },
+    )
+
+    info = prompt["additional_information"]
+    assert int(info[KEY_MIN_DECODE_STEPS].item()) == 11
+    assert int(info[KEY_MAX_DECODE_STEPS].item()) == 13
+
+
+def test_build_ming_dense_prompt_does_not_inject_duration_window_without_valid_duration():
+    """No decode window is injected when the Duration hint is absent or non-numeric."""
+    tokenizer = _DummyTokenizer()
+    prompt_missing = build_ming_dense_prompt(
+        tokenizer,
+        prompt="Please generate music based on the following description.\n",
+        text=" Genre: electronic. Mood: confident. Instrument: drums. Theme: festival.",
+        runtime_controls={KEY_CFG: 2.0},
+    )
+    prompt_malformed = build_ming_dense_prompt(
+        tokenizer,
+        prompt="Please generate music based on the following description.\n",
+        text=" Genre: electronic. Mood: confident. Instrument: drums. Theme: festival. Duration: nope.",
+        runtime_controls={KEY_CFG: 2.0},
+    )
+
+    for prompt in (prompt_missing, prompt_malformed):
+        info = prompt["additional_information"]
+        assert KEY_MIN_DECODE_STEPS not in info
+        assert KEY_MAX_DECODE_STEPS not in info
diff --git a/tests/model_executor/stage_input_processors/test_ming_tts_async_chunk.py b/tests/model_executor/stage_input_processors/test_ming_tts_async_chunk.py
new file mode 100644
index 00000000000..1a7acd04263
--- /dev/null
+++ b/tests/model_executor/stage_input_processors/test_ming_tts_async_chunk.py
@@ -0,0 +1,421 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections import defaultdict
+from types import SimpleNamespace
+
+import pytest
+import torch
+
+from vllm_omni.model_executor.models.ming_tts.config_ming_tts import (
+    KEY_REQUEST_ID,
+    LATENT_CHUNK_SIZE,
+    LATENT_LEFT_CONTEXT,
+    PATCH_SIZE,
+)
+from vllm_omni.model_executor.stage_input_processors.ming_tts import (
+    MING_EMIT_PATCH_COUNT_KEY,
+    MING_ESTIMATED_BYTES_KEY,
+    MING_FINAL_DECODE_STEP_KEY,
+    MING_FINAL_FLUSH_KEY,
+    MING_LATENT_SHAPE_KEY,
+    MING_STOP_REASON_KEY,
+    _extract_last_patch,
+    llm2audio_vae,
+    llm2audio_vae_async_chunk,
+)
+
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
+_LATENT_D = 64
+
+
+def _req(external_req_id: str, *, finished: bool):
+    """Fake request exposing ``external_req_id`` and an ``is_finished()`` callable."""
+    request = SimpleNamespace(external_req_id=external_req_id)
+    request.is_finished = lambda: finished
+    return request
+
+
+def _manager(*, chunk_size: int | None = 2, left_context: int | None = 0, raw_config=None):
+    """Fake transfer manager; a knob passed as None is left out of the generated connector config."""
+    if raw_config is None:
+        knobs = {
+            "latent_chunk_size": chunk_size,
+            "latent_left_context": left_context,
+        }
+        raw_config = {"extra": {key: value for key, value in knobs.items() if value is not None}}
+    return SimpleNamespace(
+        code_prompt_token_ids=defaultdict(list),
+        put_req_chunk=defaultdict(int),
+        request_payload={},
+        connector=SimpleNamespace(config=raw_config),
+    )
+
+
+def _patch(fill: float) -> torch.Tensor:  # one (PATCH_SIZE, _LATENT_D) float32 patch filled with `fill`
+    return torch.full((PATCH_SIZE, _LATENT_D), fill, dtype=torch.float32)
+
+
+def _payload(fill: float, *, has_patch=True, decode_step=None, stop_reason=None) -> dict[str, object]:
+    payload = {
+        "ming_has_patch": torch.tensor([has_patch]),
+        "ming_latent_patch": _patch(fill).unsqueeze(0),  # adds a leading axis of size 1
+    }
+    if decode_step is not None:  # optional keys are only present when explicitly requested
+        payload["ming_decode_step"] = torch.tensor([decode_step], dtype=torch.int32)
+    if stop_reason is not None:
+        payload[MING_STOP_REASON_KEY] = (stop_reason,)  # one-element tuple, not a bare string
+    return payload
+
+
+def test_extract_last_patch_uses_active_mask():
+    """_extract_last_patch returns the row flagged in ming_has_patch as a float32 CPU tensor."""
+    patch = torch.arange(3 * PATCH_SIZE * _LATENT_D, dtype=torch.float16).reshape(3, PATCH_SIZE, _LATENT_D)
+    payload = {
+        "ming_has_patch": torch.tensor([False, True, False]),
+        "ming_latent_patch": patch,
+    }
+    out = _extract_last_patch(payload)
+
+    assert out is not None
+    assert out.shape == (PATCH_SIZE, _LATENT_D)
+    assert out.dtype == torch.float32
+    assert out.device.type == "cpu"
+    assert torch.allclose(out, patch[1].to(torch.float32).cpu())
+
+
+def test_llm2audio_vae_async_chunk_waits_for_full_chunk():
+    """A single patch with chunk_size=2 is buffered and nothing is emitted."""
+    transfer_manager = _manager(chunk_size=2)
+    request = _req("rid-wait", finished=False)
+    payload = llm2audio_vae_async_chunk(
+        transfer_manager=transfer_manager,
+        pooling_output=_payload(1.0),
+        request=request,
+    )
+
+    assert payload is None
+    assert len(transfer_manager.code_prompt_token_ids["rid-wait"]) == 1
+
+
+def test_llm2audio_vae_async_chunk_partial_chunk_does_not_emit():
+    """Two patches with chunk_size=3 stay buffered; neither call emits a payload."""
+    transfer_manager = _manager(chunk_size=3)
+    request = _req("rid-partial", finished=False)
+    first = llm2audio_vae_async_chunk(
+        transfer_manager=transfer_manager,
+        pooling_output=_payload(1.0),
+        request=request,
+    )
+    second = llm2audio_vae_async_chunk(
+        transfer_manager=transfer_manager,
+        pooling_output=_payload(2.0),
+        request=request,
+    )
+
+    assert first is None
+    assert second is None
+    assert len(transfer_manager.code_prompt_token_ids["rid-partial"]) == 2
+
+
+def test_llm2audio_vae_async_chunk_emits_full_chunk():
+    """Reaching chunk_size emits a non-final payload carrying both buffered patches in order."""
+    transfer_manager = _manager(chunk_size=2)
+    request_id = "rid-full"
+    request = _req(request_id, finished=False)
+    transfer_manager.code_prompt_token_ids[request_id].append(_patch(1.0))
+    payload = llm2audio_vae_async_chunk(
+        transfer_manager=transfer_manager,
+        pooling_output=_payload(2.0),
+        request=request,
+    )
+
+    assert payload is not None
+    assert payload["finished"].item() is False
+    assert payload["stream_finished"].item() is False
+    assert payload[KEY_REQUEST_ID] == request_id
+    assert payload["code_predictor_codes"] == [0]
+    assert payload["ming_latent_patches"].shape == (2, PATCH_SIZE, _LATENT_D)
+    assert payload[MING_EMIT_PATCH_COUNT_KEY] == 2
+    assert payload[MING_LATENT_SHAPE_KEY] == (2, PATCH_SIZE, _LATENT_D)
+    assert payload[MING_ESTIMATED_BYTES_KEY] == int(
+        payload["ming_latent_patches"].numel() * payload["ming_latent_patches"].element_size()
+    )
+    assert payload[MING_ESTIMATED_BYTES_KEY] > 0
+    assert payload[MING_FINAL_FLUSH_KEY] is False
+    assert torch.allclose(payload["ming_latent_patches"][0], _patch(1.0))
+    assert torch.allclose(payload["ming_latent_patches"][1], _patch(2.0))
+    assert transfer_manager.request_payload[request_id]["_ming_async_state"]["seen_patch_len"] == 2
+
+
+def test_llm2audio_vae_async_chunk_multi_request_interleaving_has_no_state_bleed():
+    """Interleaved calls for two request ids must keep separate buffers and emit separately."""
+    transfer_manager = _manager(chunk_size=2)
+    req_a = _req("rid-a", finished=False)
+    req_b = _req("rid-b", finished=False)
+    assert (
+        llm2audio_vae_async_chunk(transfer_manager=transfer_manager, pooling_output=_payload(1.0), request=req_a)
+        is None
+    )
+    assert (
+        llm2audio_vae_async_chunk(transfer_manager=transfer_manager, pooling_output=_payload(10.0), request=req_b)
+        is None
+    )
+
+    payload_a = llm2audio_vae_async_chunk(
+        transfer_manager=transfer_manager,
+        pooling_output=_payload(2.0),
+        request=req_a,
+    )
+    assert payload_a is not None
+    assert payload_a[KEY_REQUEST_ID] == "rid-a"
+    assert torch.allclose(payload_a["ming_latent_patches"][0], _patch(1.0))
+    assert torch.allclose(payload_a["ming_latent_patches"][1], _patch(2.0))
+
+    assert len(transfer_manager.code_prompt_token_ids["rid-b"]) == 1
+
+    payload_b = llm2audio_vae_async_chunk(
+        transfer_manager=transfer_manager,
+        pooling_output=_payload(20.0),
+        request=req_b,
+    )
+    assert payload_b is not None
+    assert payload_b[KEY_REQUEST_ID] == "rid-b"
+    assert torch.allclose(payload_b["ming_latent_patches"][0], _patch(10.0))
+    assert torch.allclose(payload_b["ming_latent_patches"][1], _patch(20.0))
+
+    assert transfer_manager.request_payload["rid-a"]["_ming_async_state"]["seen_patch_len"] == 2
+    assert transfer_manager.request_payload["rid-b"]["_ming_async_state"]["seen_patch_len"] == 2
+
+
+def test_llm2audio_vae_async_chunk_finish_after_full_chunk_only_emits_eof():
+    """After a full chunk was emitted, a finished call with no output yields only an empty EOF payload."""
+    transfer_manager = _manager(chunk_size=2)
+    request_id = "rid-drain"
+    request = _req(request_id, finished=False)
+    transfer_manager.code_prompt_token_ids[request_id].append(_patch(1.0))
+    payload = llm2audio_vae_async_chunk(
+        transfer_manager=transfer_manager,
+        pooling_output=_payload(2.0),
+        request=request,
+    )
+
+    assert payload is not None
+    assert transfer_manager.request_payload[request_id]["_ming_async_state"]["seen_patch_len"] == 2
+
+    finish_payload = llm2audio_vae_async_chunk(
+        transfer_manager=transfer_manager,
+        pooling_output=None,
+        request=_req(request_id, finished=True),
+    )
+
+    assert finish_payload == {
+        "code_predictor_codes": [],
+        "finished": torch.tensor(True, dtype=torch.bool),
+        "stream_finished": torch.tensor(True, dtype=torch.bool),
+        "ming_chunk_id": 0,
+        KEY_REQUEST_ID: request_id,
+        MING_EMIT_PATCH_COUNT_KEY: 0,
+        MING_LATENT_SHAPE_KEY: None,
+        MING_ESTIMATED_BYTES_KEY: 0,
+        MING_FINAL_FLUSH_KEY: True,
+    }
+
+
+def test_llm2audio_vae_async_chunk_flushes_tail_on_finish_without_new_patch():
+    """On finish without new output, buffered patches short of chunk_size are flushed as final."""
+    transfer_manager = _manager(chunk_size=3)
+    request_id = "rid-tail"
+    request = _req(request_id, finished=True)
+    transfer_manager.code_prompt_token_ids[request_id] = [
+        _patch(1.0),
+        _patch(2.0),
+    ]
+    payload = llm2audio_vae_async_chunk(
+        transfer_manager=transfer_manager,
+        pooling_output=None,
+        request=request,
+    )
+
+    assert payload is not None
+    assert payload["finished"].item() is True
+    assert payload["stream_finished"].item() is True
+    assert payload[KEY_REQUEST_ID] == request_id
+    assert payload["ming_latent_patches"].shape == (2, PATCH_SIZE, _LATENT_D)
+    assert payload[MING_EMIT_PATCH_COUNT_KEY] == 2
+    assert payload[MING_LATENT_SHAPE_KEY] == (2, PATCH_SIZE, _LATENT_D)
+    assert payload[MING_ESTIMATED_BYTES_KEY] > 0
+    assert payload[MING_FINAL_FLUSH_KEY] is True
+    assert torch.allclose(payload["ming_latent_patches"][0], _patch(1.0))
+    assert torch.allclose(payload["ming_latent_patches"][1], _patch(2.0))
+
+
+def test_llm2audio_vae_async_chunk_final_flush_emits_partial_chunk_with_new_patch():
+    """A finishing call that also carries a patch flushes the partial chunk with step/stop metadata."""
+    transfer_manager = _manager(chunk_size=3)
+    request_id = "rid-tail-new"
+    transfer_manager.code_prompt_token_ids[request_id].append(_patch(1.0))
+    payload = llm2audio_vae_async_chunk(
+        transfer_manager=transfer_manager,
+        pooling_output=_payload(2.0, decode_step=7, stop_reason="stop_head"),
+        request=_req(request_id, finished=True),
+    )
+
+    assert payload is not None
+    assert payload["finished"].item() is True
+    assert payload["stream_finished"].item() is True
+    assert payload[MING_EMIT_PATCH_COUNT_KEY] == 2
+    assert payload[MING_FINAL_FLUSH_KEY] is True
+    assert payload[MING_FINAL_DECODE_STEP_KEY] == 7
+    assert payload[MING_STOP_REASON_KEY] == "stop_head"
+    assert torch.allclose(payload["ming_latent_patches"][0], _patch(1.0))
+    assert torch.allclose(payload["ming_latent_patches"][1], _patch(2.0))
+
+
+def test_llm2audio_vae_async_chunk_emits_eof_when_finished_without_frames():
+    """A finished request that never produced a patch still gets an empty EOF payload."""
+    transfer_manager = _manager(chunk_size=2)
+    request = _req("rid-eof", finished=True)
+    payload = llm2audio_vae_async_chunk(
+        transfer_manager=transfer_manager,
+        pooling_output=None,
+        request=request,
+    )
+
+    assert payload == {
+        "code_predictor_codes": [],
+        "finished": torch.tensor(True, dtype=torch.bool),
+        "stream_finished": torch.tensor(True, dtype=torch.bool),
+        "ming_chunk_id": 0,
+        KEY_REQUEST_ID: "rid-eof",
+        MING_EMIT_PATCH_COUNT_KEY: 0,
+        MING_LATENT_SHAPE_KEY: None,
+        MING_ESTIMATED_BYTES_KEY: 0,
+        MING_FINAL_FLUSH_KEY: True,
+    }
+
+
+def test_llm2audio_vae_async_chunk_zero_latent_final_flush_returns_empty_payload_not_error():
+    """A final output whose ming_has_patch is False yields the empty EOF payload, not an error."""
+    transfer_manager = _manager(chunk_size=2)
+    payload = llm2audio_vae_async_chunk(
+        transfer_manager=transfer_manager,
+        pooling_output={
+            "ming_has_patch": torch.tensor([False]),
+            "ming_latent_patch": torch.zeros((1, PATCH_SIZE, _LATENT_D), dtype=torch.float32),
+        },
+        request=_req("rid-zero-final", finished=True),
+    )
+
+    assert payload == {
+        "code_predictor_codes": [],
+        "finished": torch.tensor(True, dtype=torch.bool),
+        "stream_finished": torch.tensor(True, dtype=torch.bool),
+        "ming_chunk_id": 0,
+        KEY_REQUEST_ID: "rid-zero-final",
+        MING_EMIT_PATCH_COUNT_KEY: 0,
+        MING_LATENT_SHAPE_KEY: None,
+        MING_ESTIMATED_BYTES_KEY: 0,
+        MING_FINAL_FLUSH_KEY: True,
+    }
+
+
+def test_llm2audio_vae_async_chunk_rejects_left_context_config():
+    """A connector config with latent_left_context=1 must be rejected with ValueError."""
+    transfer_manager = _manager(chunk_size=2, left_context=1)
+    request = _req("rid-bad-cfg", finished=False)
+    with pytest.raises(
+        ValueError,
+        match="does not support latent_left_context replay.*Got latent_left_context=1",
+    ):
+        llm2audio_vae_async_chunk(
+            transfer_manager=transfer_manager,
+            pooling_output=_payload(1.0),
+            request=request,
+        )
+
+
+def test_llm2audio_vae_async_chunk_rejects_non_positive_chunk_size():
+    """A connector config with latent_chunk_size=0 must be rejected with ValueError."""
+    transfer_manager = _manager(chunk_size=0, left_context=0)
+    with pytest.raises(ValueError, match="Invalid Ming latent_chunk_size=0"):
+        llm2audio_vae_async_chunk(
+            transfer_manager=transfer_manager,
+            pooling_output=_payload(1.0),
+            request=_req("rid-bad-chunk", finished=False),
+        )
+
+
+def test_llm2audio_vae_async_chunk_missing_config_uses_fallback_defaults():
+    """With an empty 'extra' config, the first emission happens at LATENT_CHUNK_SIZE patches."""
+    transfer_manager = _manager(raw_config={"extra": {}})
+    request_id = "rid-fallback"
+    for idx in range(LATENT_CHUNK_SIZE - 1):
+        payload = llm2audio_vae_async_chunk(
+            transfer_manager=transfer_manager,
+            pooling_output=_payload(float(idx + 1)),
+            request=_req(request_id, finished=False),
+        )
+        assert payload is None
+
+    payload = llm2audio_vae_async_chunk(
+        transfer_manager=transfer_manager,
+        pooling_output=_payload(float(LATENT_CHUNK_SIZE)),
+        request=_req(request_id, finished=False),
+    )
+
+    assert payload is not None
+    assert payload[MING_EMIT_PATCH_COUNT_KEY] == LATENT_CHUNK_SIZE
+    assert payload[MING_LATENT_SHAPE_KEY] == (LATENT_CHUNK_SIZE, PATCH_SIZE, _LATENT_D)
+    assert LATENT_LEFT_CONTEXT == 0
+
+
+def test_llm2audio_vae_builds_generation_prompt_from_stage_output():
+    """A finished stage output becomes one prompt carrying all patches and the last step's metadata."""
+    patches = torch.arange(2 * PATCH_SIZE * _LATENT_D, dtype=torch.float32).reshape(2, PATCH_SIZE, _LATENT_D)
+    stage_output = SimpleNamespace(
+        request_id="rid-stage",
+        finished=True,
+        outputs=[
+            SimpleNamespace(
+                multimodal_output={
+                    "ming_has_patch": torch.tensor([True, True]),
+                    "ming_latent_patch": patches,
+                    "ming_decode_step": torch.tensor([26, 27], dtype=torch.int32),
+                    "ming_stop_reason": ("continue", "stop_head"),
+                }
+            )
+        ],
+    )
+    stage = SimpleNamespace(engine_outputs=[stage_output])
+    prompts = llm2audio_vae(stage_list=[stage], engine_input_source=[0])
+
+    assert len(prompts) == 1
+    info = prompts[0]["additional_information"]
+    assert info[KEY_REQUEST_ID] == "rid-stage"
+    assert info["finished"].item() is True
+    assert info["ming_latent_patches"].shape == (2, PATCH_SIZE, _LATENT_D)
+    assert torch.allclose(info["ming_latent_patches"], patches)
+    assert info[MING_FINAL_DECODE_STEP_KEY] == 27
+    assert info[MING_STOP_REASON_KEY] == "stop_head"
+
+
+def test_llm2audio_vae_skips_unfinished_stage_output():
+    """A stage output with finished=False produces no prompts."""
+    patch = torch.arange(PATCH_SIZE * _LATENT_D, dtype=torch.float32).reshape(1, PATCH_SIZE, _LATENT_D)
+    stage_output = SimpleNamespace(
+        request_id="rid-unfinished",
+        finished=False,
+        outputs=[
+            SimpleNamespace(
+                multimodal_output={
+                    "ming_has_patch": torch.tensor([True]),
+                    "ming_latent_patch": patch,
+                }
+            )
+        ],
+    )
+    stage = SimpleNamespace(engine_outputs=[stage_output])
+    prompts = llm2audio_vae(stage_list=[stage], engine_input_source=[0])
+
+    assert prompts == []
diff --git a/tests/worker/test_ming_tts_runner.py b/tests/worker/test_ming_tts_runner.py
new file mode 100644
index 00000000000..89deda2ddb1
--- /dev/null
+++ b/tests/worker/test_ming_tts_runner.py
@@ -0,0 +1,674 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from types import SimpleNamespace
+
+import pytest
+import torch
+
+from vllm_omni.model_executor.models.ming_tts.config_ming_tts import (
+    KEY_LATENT_HISTORY,
+    KEY_NEXT_EMBEDS,
+    KEY_PROMPT_LATENTS,
+    KEY_REQUEST_ID,
+    KEY_SPEAKER_EMBEDDING,
+    MingTTSConfig,
+)
+from vllm_omni.model_executor.models.ming_tts.ming_tts import MingTTSForConditionalGeneration
+from vllm_omni.model_executor.models.ming_tts.ming_tts_audio_vae import MingAudioVAEModel
+from vllm_omni.model_executor.models.ming_tts.ming_tts_llm import (
+    MING_STOP_REASON_CONTINUE,
+    MING_STOP_REASON_KEY,
+    MING_STOP_REASON_MAX_DECODE_STEPS,
+    MING_STOP_REASON_STOP_HEAD,
+    MingLLMModel,
+    _resolve_ming_stop_decision,
+)
+from vllm_omni.model_executor.models.output_templates import OmniOutput
+
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
+
+class DummyBackbone(torch.nn.Module):
+    """Parameter-free backbone stub: ids map to ``id / 100`` rows, forward echoes ``inputs_embeds``."""
+
+    def __init__(self, hidden_size: int):
+        super().__init__()
+        self.hidden_size = hidden_size
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return input_ids.to(torch.float32).reshape(-1, 1).repeat(1, self.hidden_size) / 100.0
+
+    def get_input_embeddings(self):
+        return None
+
+    def forward(self, input_ids, positions, intermediate_tensors=None, inputs_embeds=None, **kwargs):
+        return inputs_embeds  # ids, positions and extra kwargs are deliberately unused
+
+
+class NaNOnSecondDecodeBackbone(DummyBackbone):
+    """Backbone stub whose first forward echoes ``inputs_embeds``; later calls return all-NaN."""
+
+    def __init__(self, hidden_size: int):
+        super().__init__(hidden_size)
+        self.decode_calls = 0
+
+    def forward(self, input_ids, positions, intermediate_tensors=None, inputs_embeds=None, **kwargs):
+        self.decode_calls += 1
+        healthy = self.decode_calls < 2
+        return inputs_embeds if healthy else torch.full_like(inputs_embeds, float("nan"))
+
+
+class DummyAggregator(torch.nn.Module):
+    """Aggregator stub: averages a patch over dim 1 and tiles the mean up to ``llm_input_dim``."""
+
+    def __init__(self, in_channels: int, llm_input_dim: int, **kwargs):
+        super().__init__()
+        self.hidden_size = llm_input_dim
+
+    def forward(self, patch: torch.Tensor) -> torch.Tensor:
+        mean = patch.mean(dim=1)
+        return mean.repeat(1, self.hidden_size // mean.shape[-1]).reshape(mean.shape[0], 1, self.hidden_size)
+
+
+class DummyFlowLoss(torch.nn.Module):
+    """FlowLoss stub: frame ``k`` (1-based) of the sampled patch is ``z[:, 0, :64] + k``."""
+
+    def __init__(self, z_channels: int, llm_cond_dim: int, **kwargs):
+        super().__init__()
+
+    def sample(self, z, latent_history, cfg, patch_size, sigma, temperature):
+        seed = z[:, 0, :64]
+        return torch.stack([seed + float(offset) for offset in range(1, patch_size + 1)], dim=1)
+
+
+class DummyAudioVAE(torch.nn.Module):
+    """AudioVAE stub with a 64-wide latent that logs every ``decode`` call in ``decode_calls``."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.weight = torch.nn.Parameter(torch.tensor(1.0))
+        self.decode_calls: list[dict[str, object]] = []
+
+    def encode_latent(self, waveform: torch.Tensor, waveform_length: torch.Tensor):
+        # 2-D input is cut into whole 64-sample frames; any other rank is passed through as-is.
+        latent = waveform
+        if waveform.ndim == 2:
+            batch, n_frames = waveform.shape[0], waveform.shape[-1] // 64
+            latent = waveform[:, : n_frames * 64].reshape(batch, n_frames, 64)
+        counts = torch.full((latent.shape[0],), latent.shape[1], dtype=torch.int32, device=latent.device)
+        return latent.to(torch.float32), counts
+
+    def decode(self, latent, past_key_values=None, use_cache=False, stream_state=(None, None, None), last_chunk=False):
+        seen = int((past_key_values or {}).get("frames", 0))
+        # One output sample per frame: the frame's latent sum plus 10 per previously decoded frame.
+        audio = latent.sum(dim=-1).reshape(latent.shape[0], -1).to(torch.float32) + seen * 10.0
+        total = seen + int(latent.shape[1])
+        call = {
+            "stream_state": stream_state,
+            "past_key_values": past_key_values,
+            "latent_shape": tuple(latent.shape),
+        }
+        self.decode_calls.append(call)
+        return audio, ("stream", total, tuple(latent.shape)), {"frames": total}
+
+
+class _DummySamplingMetadata:
+    def __init__(self, step: int):  # exposes a single row holding ``step`` zero token ids
+        self.output_token_ids = [[0 for _ in range(int(step))]]
+
+
+def _make_config() -> MingTTSConfig:
+    """Build and validate a ``MingTTSConfig`` around a small stand-in audio tokenizer config."""
+    encoder = {"latent_dim": 64, "input_dim": 882, "hop_size": 882}
+    decoder = {"latent_dim": 64, "output_dim": 882}
+    tokenizer_cfg = SimpleNamespace(enc_kwargs=encoder, dec_kwargs=decoder, patch_size=4, sample_rate=44100)
+
+    config = MingTTSConfig(audio_tokenizer_config=tokenizer_cfg)
+    # Validate eagerly so an inconsistent fixture fails here rather than inside a test body.
+    config.validate()
+    return config
+
+
+def _make_vllm_config(model_stage: str, **hf_overrides):
+    """Return a CPU-only namespace stand-in exposing ``model_config``, ``quant_config`` and ``device_config``."""
+    hf_config = SimpleNamespace(**hf_overrides)
+    model_config = SimpleNamespace(hf_config=hf_config, model_stage=model_stage)
+    device_config = SimpleNamespace(device=torch.device("cpu"))
+    return SimpleNamespace(model_config=model_config, quant_config=None, device_config=device_config)
+
+
+def _make_runner_for_ming(monkeypatch):
+    """Assemble both Ming stages on top of the dummy submodules and return them in one namespace."""
+    import vllm_omni.model_executor.models.ming_tts.config_ming_tts as cfg_mod
+    import vllm_omni.model_executor.models.ming_tts.ming_tts as wrapper_mod
+    import vllm_omni.model_executor.models.ming_tts.ming_tts_audio_vae as vae_mod
+    import vllm_omni.model_executor.models.ming_tts.ming_tts_llm as llm_mod
+
+    cfg = _make_config()
+    monkeypatch.setattr(cfg_mod.MingTTSConfig, "from_hf_config", classmethod(lambda cls, hf: cfg))
+
+    # Stubs go in before the models are built, presumably so the constructors pick up the dummies.
+    monkeypatch.setattr(llm_mod, "init_vllm_registered_model", lambda **kwargs: DummyBackbone(cfg.llm_hidden_size))
+    monkeypatch.setattr(llm_mod, "Aggregator", DummyAggregator)
+    monkeypatch.setattr(llm_mod, "FlowLoss", DummyFlowLoss)
+    monkeypatch.setattr(wrapper_mod, "AudioVAE", DummyAudioVAE, raising=False)
+    monkeypatch.setattr(vae_mod, "AudioVAE", DummyAudioVAE)
+
+    llm_model = MingLLMModel(vllm_config=_make_vllm_config("llm"))
+    vae_model = MingAudioVAEModel(vllm_config=_make_vllm_config("audio_vae"))
+    prebuilt = {"MingLLMModel": llm_model, "MingAudioVAEModel": vae_model}
+
+    def _wrapper_loader(*, architectures, **kwargs):
+        arch = architectures[0]
+        if arch not in prebuilt:
+            raise AssertionError(f"unexpected architecture {arch}")
+        return prebuilt[arch]
+
+    monkeypatch.setattr(wrapper_mod, "init_vllm_registered_model", _wrapper_loader)
+    stage1 = MingTTSForConditionalGeneration(vllm_config=_make_vllm_config("llm"))
+    stage2 = MingTTSForConditionalGeneration(vllm_config=_make_vllm_config("audio_vae"))
+
+    return SimpleNamespace(config=cfg, llm=llm_model, vae=vae_model, stage1=stage1, stage2=stage2)
+
+
+def test_ming_llm_step_shapes(monkeypatch):
+    """Run prefill and one decode step through the stubbed LLM stage, checking tensor shapes."""
+    harness = _make_runner_for_ming(monkeypatch)
+    cfg, stage = harness.config, harness.stage1
+
+    # Prefill prompt: two audio_dummy tokens sit between audio_start and audio_end.
+    prompt_tokens = [
+        1,
+        cfg.audio_start_token_id,
+        cfg.audio_dummy_token_id,
+        cfg.audio_dummy_token_id,
+        cfg.audio_end_token_id,
+        2,
+    ]
+    prefill_ids = torch.tensor(prompt_tokens, dtype=torch.long)
+    blank_embeds = torch.zeros((prefill_ids.shape[0], cfg.llm_hidden_size), dtype=torch.float32)
+    prompt_latents = torch.arange(8 * 64, dtype=torch.float32).reshape(1, 8, 64)
+    prefill_kwargs = {KEY_PROMPT_LATENTS: prompt_latents, KEY_REQUEST_ID: "req-1"}
+    _, prefill_out, prefill_info = stage.preprocess_input(prefill_ids, blank_embeds, **prefill_kwargs)
+
+    history = prefill_info[KEY_LATENT_HISTORY]
+    assert history.shape == (32, 64)
+    assert torch.allclose(history[-8:], prompt_latents.reshape(8, 64))
+    for position in (1, 2):
+        assert torch.count_nonzero(prefill_out[position]).item() > 0
+
+    # Decode step: reuse the prefill state for a single audio_dummy token.
+    decode_ids = torch.tensor([cfg.audio_dummy_token_id], dtype=torch.long)
+    zero_embeds = torch.zeros((1, cfg.llm_hidden_size), dtype=torch.float32)
+    _, decode_embeds, decode_info = stage.preprocess_input(decode_ids, zero_embeds, **prefill_info)
+
+    output = harness.llm.forward(
+        decode_ids,
+        positions=torch.tensor([0], dtype=torch.long),
+        inputs_embeds=decode_embeds,
+        model_intermediate_buffer=[decode_info],
+        seq_token_counts=[1],
+    )
+    mm = output.multimodal_outputs
+
+    assert mm["ming_latent_patch"].shape == (1, 4, 64)
+    assert mm["ming_next_embeds"].shape == (1, 1, cfg.llm_hidden_size)
+    assert mm["ming_new_history"].shape == (1, 32, 64)
+
+    update = stage.postprocess(output.text_hidden_states, multimodal_outputs=mm, **decode_info)
+    assert update[KEY_LATENT_HISTORY].shape == (1, 32, 64)
+    assert torch.allclose(update[KEY_LATENT_HISTORY][0, -4:], mm["ming_latent_patch"][0].cpu())
+    assert update[KEY_NEXT_EMBEDS].shape == (1, 1, cfg.llm_hidden_size)
+
+
+def test_ming_prefill_injects_speaker_into_dense_placeholder(monkeypatch):
+    """One speaker embedding changes the slot after vision-start and leaves audio-start untouched."""
+    import vllm_omni.model_executor.models.ming_tts.config_ming_tts as cfg_mod
+    import vllm_omni.model_executor.models.ming_tts.ming_tts as wrapper_mod
+    import vllm_omni.model_executor.models.ming_tts.ming_tts_llm as llm_mod
+
+    cfg = _make_config()
+    vision_start = 32001
+
+    # Stand-in for the wrapper's model loader: hands back a MingLLMModel built on the dummies.
+    def _build_llm(**kwargs):
+        return MingLLMModel(vllm_config=_make_vllm_config("llm"))
+
+    monkeypatch.setattr(cfg_mod.MingTTSConfig, "from_hf_config", classmethod(lambda cls, hf: cfg))
+    monkeypatch.setattr(llm_mod, "init_vllm_registered_model", lambda **kwargs: DummyBackbone(cfg.llm_hidden_size))
+    monkeypatch.setattr(llm_mod, "Aggregator", DummyAggregator)
+    monkeypatch.setattr(llm_mod, "FlowLoss", DummyFlowLoss)
+    monkeypatch.setattr(wrapper_mod, "init_vllm_registered_model", _build_llm)
+
+    stage_cfg = _make_vllm_config("llm", vision_start_token_id=vision_start)
+    stage1 = MingTTSForConditionalGeneration(vllm_config=stage_cfg)
+
+    # Index 1 is vision-start, index 2 the token right after it, index 3 audio-start.
+    token_ids = [
+        1,
+        vision_start,
+        77,
+        cfg.audio_start_token_id,
+        cfg.audio_dummy_token_id,
+        cfg.audio_end_token_id,
+    ]
+    input_ids = torch.tensor(token_ids, dtype=torch.long)
+    zero_embeds = torch.zeros((input_ids.shape[0], cfg.llm_hidden_size), dtype=torch.float32)
+    reference = stage1.model.embed_input_ids(input_ids).clone()
+    speaker = torch.ones((192,), dtype=torch.float32)
+
+    _, injected, _ = stage1.preprocess_input(input_ids, zero_embeds, **{KEY_SPEAKER_EMBEDDING: speaker})
+
+    # The post-vision-start slot must differ from the plain token embedding; audio-start must not.
+    changed, untouched = injected[2], injected[3]
+    assert torch.count_nonzero(changed).item() > 0
+    assert not torch.allclose(changed, reference[2])
+    assert torch.allclose(untouched, reference[3])
+
+
+def test_ming_prefill_injects_multiple_speakers_into_multiple_dense_placeholders(monkeypatch):
+    """Two speaker embeddings change both post-vision-start slots and leave audio-start untouched."""
+    import vllm_omni.model_executor.models.ming_tts.config_ming_tts as cfg_mod
+    import vllm_omni.model_executor.models.ming_tts.ming_tts as wrapper_mod
+    import vllm_omni.model_executor.models.ming_tts.ming_tts_llm as llm_mod
+
+    cfg = _make_config()
+    vision_start = 32001
+
+    # Stand-in for the wrapper's model loader: hands back a MingLLMModel built on the dummies.
+    def _build_llm(**kwargs):
+        return MingLLMModel(vllm_config=_make_vllm_config("llm"))
+
+    monkeypatch.setattr(cfg_mod.MingTTSConfig, "from_hf_config", classmethod(lambda cls, hf: cfg))
+    monkeypatch.setattr(llm_mod, "init_vllm_registered_model", lambda **kwargs: DummyBackbone(cfg.llm_hidden_size))
+    monkeypatch.setattr(llm_mod, "Aggregator", DummyAggregator)
+    monkeypatch.setattr(llm_mod, "FlowLoss", DummyFlowLoss)
+    monkeypatch.setattr(wrapper_mod, "init_vllm_registered_model", _build_llm)
+
+    stage_cfg = _make_vllm_config("llm", vision_start_token_id=vision_start)
+    stage1 = MingTTSForConditionalGeneration(vllm_config=stage_cfg)
+
+    # vision-start appears at indices 1 and 4, so the slots right after it are 2 and 5;
+    # index 6 is the audio-start token.
+    token_ids = [
+        1,
+        vision_start,
+        77,
+        2,
+        vision_start,
+        88,
+        cfg.audio_start_token_id,
+        cfg.audio_dummy_token_id,
+        cfg.audio_end_token_id,
+    ]
+    input_ids = torch.tensor(token_ids, dtype=torch.long)
+    zero_embeds = torch.zeros((input_ids.shape[0], cfg.llm_hidden_size), dtype=torch.float32)
+    reference = stage1.model.embed_input_ids(input_ids).clone()
+    speakers = torch.ones((2, 192), dtype=torch.float32)
+
+    _, injected, _ = stage1.preprocess_input(input_ids, zero_embeds, **{KEY_SPEAKER_EMBEDDING: speakers})
+
+    for slot in (2, 5):
+        assert torch.count_nonzero(injected[slot]).item() > 0
+    for slot in (2, 5):
+        assert not torch.allclose(injected[slot], reference[slot])
+    assert torch.allclose(injected[6], reference[6])
+
+
+def test_ming_stop_logic_no_stop_before_min_required_decode_steps(monkeypatch):
+    """A stop probability of 1.0 must not end decoding while ``min_decode_steps`` (7) is unmet."""
+    harness = _make_runner_for_ming(monkeypatch)
+    cfg, llm = harness.config, harness.llm
+
+    # Stop head stub that always favours index 1 (presumably the "stop" class).
+    def _always_stop(_hidden_states):
+        return torch.tensor([[0.0, 10.0]], dtype=torch.float32)
+
+    monkeypatch.setattr(llm.stop_head, "forward", _always_stop)
+    hidden = torch.zeros((1, cfg.llm_hidden_size), dtype=torch.float32)
+
+    decision = _resolve_ming_stop_decision(
+        step=4,
+        stop_prob=1.0,
+        stop_threshold=float(cfg.stop_head_threshold),
+        min_stop_step=int(cfg.stop_head_min_steps),
+        min_decode_steps=7,
+        max_decode_steps=int(cfg.max_decode_steps),
+        audio_dummy_token_id=int(cfg.audio_dummy_token_id),
+        text_eos_token_id=int(cfg.text_eos_token_id),
+    )
+    stop_reason, stop_now, force_stop, min_required_decode_steps, next_token_id = decision
+    assert stop_reason == MING_STOP_REASON_CONTINUE
+    assert stop_now is False
+    assert force_stop is False
+    assert min_required_decode_steps == 7
+    assert next_token_id == cfg.audio_dummy_token_id
+
+    # Same floor supplied through multimodal_outputs: at step 3 the EOS logit must be non-finite.
+    floor_payload = {"ming_min_decode_steps": torch.tensor([7], dtype=torch.int32)}
+    model_output = OmniOutput(text_hidden_states=hidden, multimodal_outputs=floor_payload)
+    logits = llm.compute_logits(model_output, _DummySamplingMetadata(step=3))
+    sampled = llm.sample(logits, _DummySamplingMetadata(step=3))
+    assert int(sampled.sampled_token_ids[0, 0]) == cfg.audio_dummy_token_id
+    assert torch.isfinite(logits[0, int(cfg.audio_dummy_token_id)])
+    assert not torch.isfinite(logits[0, int(cfg.text_eos_token_id)])
+
+
+def test_ming_stop_logic_stop_head_inside_window(monkeypatch):
+    """With no extra decode floor, a stop probability of 1.0 at step 4 stops via the stop head."""
+    harness = _make_runner_for_ming(monkeypatch)
+    cfg, llm = harness.config, harness.llm
+
+    def _always_stop(_hidden_states):
+        return torch.tensor([[0.0, 10.0]], dtype=torch.float32)
+
+    monkeypatch.setattr(llm.stop_head, "forward", _always_stop)
+    hidden = torch.zeros((1, cfg.llm_hidden_size), dtype=torch.float32)
+    decision = _resolve_ming_stop_decision(
+        step=4,
+        stop_prob=1.0,
+        stop_threshold=float(cfg.stop_head_threshold),
+        min_stop_step=int(cfg.stop_head_min_steps),
+        min_decode_steps=0,
+        max_decode_steps=int(cfg.max_decode_steps),
+        audio_dummy_token_id=int(cfg.audio_dummy_token_id),
+        text_eos_token_id=int(cfg.text_eos_token_id),
+    )
+    stop_reason, stop_now, force_stop, min_required_decode_steps, next_token_id = decision
+    assert stop_reason == MING_STOP_REASON_STOP_HEAD
+    assert stop_now is True
+    assert force_stop is False
+    assert min_required_decode_steps == int(cfg.stop_head_min_steps) + 1
+    assert next_token_id == cfg.text_eos_token_id
+    logits = llm.compute_logits(hidden, _DummySamplingMetadata(step=4))
+    sampled = llm.sample(logits, _DummySamplingMetadata(step=4))
+    assert int(sampled.sampled_token_ids[0, 0]) == cfg.text_eos_token_id
+
+
+def test_ming_stop_logic_rejects_impossible_decode_window(monkeypatch):
+    """``compute_logits`` must raise when the requested minimum exceeds the maximum."""
+    harness = _make_runner_for_ming(monkeypatch)
+    cfg = harness.config
+    hidden = torch.zeros((1, cfg.llm_hidden_size), dtype=torch.float32)
+
+    # The payload is deliberately inconsistent: min (7) > max (5), so no decode step
+    # count can satisfy both bounds.
+    window = {
+        "ming_min_decode_steps": torch.tensor([7], dtype=torch.int32),
+        "ming_max_decode_steps": torch.tensor([5], dtype=torch.int32),
+    }
+
+    with pytest.raises(RuntimeError, match="Invalid Ming decode window"):
+        bad_output = OmniOutput(text_hidden_states=hidden, multimodal_outputs=window)
+        harness.llm.compute_logits(bad_output, _DummySamplingMetadata(step=4))
+
+
+def test_ming_stop_logic_max_decode_guard(monkeypatch):
+    """With ``max_decode_steps`` lowered to 5, step 4 is reported as a forced max-steps stop."""
+    harness = _make_runner_for_ming(monkeypatch)
+    cfg, llm = harness.config, harness.llm
+    cfg.max_decode_steps = 5  # shrink the budget on the config object the harness hands out
+
+    def _always_stop(_hidden_states):
+        return torch.tensor([[0.0, 10.0]], dtype=torch.float32)
+
+    monkeypatch.setattr(llm.stop_head, "forward", _always_stop)
+    hidden = torch.zeros((1, cfg.llm_hidden_size), dtype=torch.float32)
+    decision = _resolve_ming_stop_decision(
+        step=4,
+        stop_prob=1.0,
+        stop_threshold=float(cfg.stop_head_threshold),
+        min_stop_step=int(cfg.stop_head_min_steps),
+        min_decode_steps=0,
+        max_decode_steps=int(cfg.max_decode_steps),
+        audio_dummy_token_id=int(cfg.audio_dummy_token_id),
+        text_eos_token_id=int(cfg.text_eos_token_id),
+    )
+    stop_reason, stop_now, force_stop, min_required_decode_steps, next_token_id = decision
+    assert stop_reason == MING_STOP_REASON_MAX_DECODE_STEPS
+    assert stop_now is True
+    assert force_stop is True
+    assert min_required_decode_steps == int(cfg.stop_head_min_steps) + 1
+    assert next_token_id == cfg.text_eos_token_id
+    logits = llm.compute_logits(hidden, _DummySamplingMetadata(step=4))
+    sampled = llm.sample(logits, _DummySamplingMetadata(step=4))
+    assert int(sampled.sampled_token_ids[0, 0]) == cfg.text_eos_token_id
+
+
+def test_ming_compute_logits_uses_forward_stop_prob_payload(monkeypatch):
+    """``compute_logits`` is expected to honour ``ming_stop_prob`` carried in the model output."""
+    harness = _make_runner_for_ming(monkeypatch)
+    cfg, llm = harness.config, harness.llm
+
+    def _never_stop(_hidden_states):
+        return torch.tensor([[10.0, 0.0]], dtype=torch.float32)
+
+    monkeypatch.setattr(llm.stop_head, "forward", _never_stop)
+    hidden = torch.zeros((1, cfg.llm_hidden_size), dtype=torch.float32)
+
+    # The patched stop head favours index 0, so the EOS sampled below is expected to come
+    # from the stop probability shipped in multimodal_outputs rather than from the head.
+    payload = {
+        "ming_stop_prob": torch.tensor([1.0], dtype=torch.float32),
+        "ming_decode_step": torch.tensor([4], dtype=torch.int32),
+    }
+
+    model_output = OmniOutput(text_hidden_states=hidden, multimodal_outputs=payload)
+    logits = llm.compute_logits(model_output, _DummySamplingMetadata(step=4))
+    sampled = llm.sample(logits, _DummySamplingMetadata(step=4))
+    assert int(sampled.sampled_token_ids[0, 0]) == cfg.text_eos_token_id
+
+
+def test_ming_compute_logits_uses_cached_forward_stop_prob_for_tensor_path(monkeypatch):
+    """Given a bare tensor, ``compute_logits`` is expected to use the stop state cached on the model."""
+    harness = _make_runner_for_ming(monkeypatch)
+    cfg, llm = harness.config, harness.llm
+
+    def _never_stop(_hidden_states):
+        return torch.tensor([[10.0, 0.0]], dtype=torch.float32)
+
+    monkeypatch.setattr(llm.stop_head, "forward", _never_stop)
+    llm._last_sample_stop_probs = torch.tensor([1.0], dtype=torch.float32)
+    llm._last_sample_decode_steps = torch.tensor([4], dtype=torch.int32)
+    hidden = torch.zeros((1, cfg.llm_hidden_size), dtype=torch.float32)
+    logits = llm.compute_logits(hidden, _DummySamplingMetadata(step=4))
+    sampled = llm.sample(logits, _DummySamplingMetadata(step=4))
+    assert int(sampled.sampled_token_ids[0, 0]) == cfg.text_eos_token_id
+
+
+def test_ming_forward_exposes_stop_reason_in_outputs_and_pending_state(monkeypatch):
+    """``forward`` reports the stop reason in ``multimodal_outputs`` and in the pending update."""
+    harness = _make_runner_for_ming(monkeypatch)
+    cfg, llm = harness.config, harness.llm
+
+    def _never_stop(_hidden_states):
+        return torch.tensor([[10.0, 0.0]], dtype=torch.float32)
+
+    monkeypatch.setattr(llm.stop_head, "forward", _never_stop)
+    request_state = {
+        KEY_LATENT_HISTORY: torch.zeros((cfg.history_patch_size, cfg.latent_dim), dtype=torch.float32),
+        KEY_REQUEST_ID: "req-stop-reason",
+    }
+    decode_ids = torch.tensor([cfg.audio_dummy_token_id], dtype=torch.long)
+    zero_embeds = torch.zeros((1, cfg.llm_hidden_size), dtype=torch.float32)
+    output = llm.forward(
+        decode_ids,
+        positions=torch.tensor([0], dtype=torch.long),
+        inputs_embeds=zero_embeds,
+        model_intermediate_buffer=[request_state],
+        seq_token_counts=[1],
+    )
+
+    assert output.multimodal_outputs[MING_STOP_REASON_KEY] == (MING_STOP_REASON_CONTINUE,)
+    pending = llm.pop_postprocess_update("req-stop-reason")
+    assert pending[MING_STOP_REASON_KEY] == MING_STOP_REASON_CONTINUE
+
+
+def test_ming_postprocess_forwards_stop_reason(monkeypatch):
+    """``postprocess`` surfaces the stop reason left behind by the preceding ``forward`` call."""
+    harness = _make_runner_for_ming(monkeypatch)
+    cfg = harness.config
+
+    request_state = {
+        KEY_LATENT_HISTORY: torch.zeros((cfg.history_patch_size, cfg.latent_dim), dtype=torch.float32),
+        KEY_REQUEST_ID: "req-postprocess-stop-reason",
+    }
+    decode_ids = torch.tensor([cfg.audio_dummy_token_id], dtype=torch.long)
+    zero_embeds = torch.zeros((1, cfg.llm_hidden_size), dtype=torch.float32)
+    output = harness.llm.forward(
+        decode_ids,
+        positions=torch.tensor([0], dtype=torch.long),
+        inputs_embeds=zero_embeds,
+        model_intermediate_buffer=[request_state],
+        seq_token_counts=[1],
+    )
+    update = harness.stage1.postprocess(output.text_hidden_states, **request_state)
+
+    assert update[MING_STOP_REASON_KEY] == MING_STOP_REASON_CONTINUE
+
+
+def test_ming_vae_incremental_decode(monkeypatch):
+    """Decode two streamed chunks of one request and check the carried-over decoder state."""
+    harness = _make_runner_for_ming(monkeypatch)
+    vae, stage2 = harness.vae, harness.stage2
+
+    # Chunk A carries two (4, 64) patches for request "r1".
+    chunk_a = torch.stack(
+        [
+            torch.ones((4, 64), dtype=torch.float32),
+            torch.full((4, 64), 2.0, dtype=torch.float32),
+        ],
+        dim=0,
+    )
+    request_a = {
+        "ming_latent_patches": chunk_a,
+        "finished": torch.tensor(False),
+        "stream_finished": torch.tensor(False),
+        KEY_REQUEST_ID: "r1",
+    }
+    wav_a = stage2.forward(model_intermediate_buffer=[request_a]).multimodal_outputs["model_outputs"][0]
+    state_a = vae._stream_state["r1"]
+    past_a = vae._past_key_values["r1"]
+
+    # Chunk B carries a single patch for the same request.
+    chunk_b = torch.full((1, 4, 64), 3.0, dtype=torch.float32)
+    request_b = {
+        "ming_latent_patches": chunk_b,
+        "finished": torch.tensor(False),
+        "stream_finished": torch.tensor(False),
+        KEY_REQUEST_ID: "r1",
+    }
+    wav_b = stage2.forward(model_intermediate_buffer=[request_b]).multimodal_outputs["model_outputs"][0]
+    state_b = vae._stream_state["r1"]
+
+    # Three decode calls were recorded in total; the second and third must have been fed
+    # the state produced by the calls before them.
+    calls = vae.audio.decode_calls
+    assert len(calls) == 3
+    assert calls[1]["latent_shape"] == (1, 4, 64)
+    assert calls[1]["past_key_values"] == {"frames": 4}
+    assert calls[2]["stream_state"] == state_a
+    assert calls[2]["past_key_values"] == past_a
+    assert state_b != state_a
+
+    # DummyAudioVAE.decode adds 10.0 per previously decoded frame (4 frames per patch here).
+    expected_a = torch.cat(
+        [
+            chunk_a[0].sum(dim=-1),
+            chunk_a[1].sum(dim=-1) + 4 * 10.0,
+        ]
+    )
+    expected_b = chunk_b[0].sum(dim=-1) + 8 * 10.0
+    assert torch.allclose(wav_a, expected_a)
+    assert torch.allclose(wav_b, expected_b)
+    assert torch.allclose(torch.cat([wav_a, wav_b]), torch.cat([expected_a, expected_b]))
+
+
+def test_ming_vae_finalizes_when_stream_finished_is_absent(monkeypatch):
+    """A finished request without a ``stream_finished`` flag still decodes and drops its caches."""
+    harness = _make_runner_for_ming(monkeypatch)
+
+    chunk = torch.stack(
+        [
+            torch.ones((4, 64), dtype=torch.float32),
+            torch.full((4, 64), 2.0, dtype=torch.float32),
+        ],
+        dim=0,
+    )
+    request = {
+        "ming_latent_patches": chunk,
+        "finished": torch.tensor(True),
+        KEY_REQUEST_ID: "r-sequential",
+    }
+
+    out = harness.stage2.forward(model_intermediate_buffer=[request])
+    wav = out.multimodal_outputs["model_outputs"][0]
+
+    # Audio must come out, and no per-request streaming state may be left behind.
+    assert wav.numel() > 0
+    assert "r-sequential" not in harness.vae._stream_state
+    assert "r-sequential" not in harness.vae._past_key_values
+
+
+def test_ming_recurrent_backbone_can_poison_hidden_states_before_flowloss(monkeypatch):
+    """The first decode step stays finite; the second, where the stub returns NaN, must raise."""
+    import vllm_omni.model_executor.models.ming_tts.config_ming_tts as cfg_mod
+    import vllm_omni.model_executor.models.ming_tts.ming_tts as wrapper_mod
+    import vllm_omni.model_executor.models.ming_tts.ming_tts_audio_vae as vae_mod
+    import vllm_omni.model_executor.models.ming_tts.ming_tts_llm as llm_mod
+
+    cfg = _make_config()
+
+    def _nan_backbone(**kwargs):
+        return NaNOnSecondDecodeBackbone(cfg.llm_hidden_size)
+
+    monkeypatch.setattr(cfg_mod.MingTTSConfig, "from_hf_config", classmethod(lambda cls, hf: cfg))
+    monkeypatch.setattr(llm_mod, "init_vllm_registered_model", _nan_backbone)
+    monkeypatch.setattr(llm_mod, "Aggregator", DummyAggregator)
+    monkeypatch.setattr(llm_mod, "FlowLoss", DummyFlowLoss)
+    monkeypatch.setattr(vae_mod, "AudioVAE", DummyAudioVAE)
+
+    llm_model = MingLLMModel(vllm_config=_make_vllm_config("llm"))
+
+    def _wrapper_loader(*, architectures, **kwargs):
+        arch = architectures[0]
+        if arch != "MingLLMModel":
+            raise AssertionError(f"unexpected architecture {arch}")
+        return llm_model
+
+    monkeypatch.setattr(wrapper_mod, "init_vllm_registered_model", _wrapper_loader)
+    stage1 = MingTTSForConditionalGeneration(vllm_config=_make_vllm_config("llm"))
+
+    decode_ids = torch.tensor([cfg.audio_dummy_token_id], dtype=torch.long)
+    zero_embeds = torch.zeros((1, cfg.llm_hidden_size), dtype=torch.float32)
+    request_state = {
+        KEY_LATENT_HISTORY: torch.zeros((cfg.history_patch_size, cfg.latent_dim), dtype=torch.float32),
+        KEY_REQUEST_ID: "req-nan",
+    }
+    _, decode_embeds, decode_info = stage1.preprocess_input(decode_ids, zero_embeds, **request_state)
+    output = llm_model.forward(
+        decode_ids,
+        positions=torch.tensor([0], dtype=torch.long),
+        inputs_embeds=decode_embeds,
+        model_intermediate_buffer=[decode_info],
+        seq_token_counts=[1],
+    )
+    mm = output.multimodal_outputs
+    assert torch.isfinite(mm["ming_next_embeds"]).all()
+
+    update = stage1.postprocess(output.text_hidden_states, multimodal_outputs=mm, **decode_info)
+    fresh_embeds = torch.zeros((1, cfg.llm_hidden_size), dtype=torch.float32)
+    _, next_decode_embeds, next_decode_info = stage1.preprocess_input(decode_ids, fresh_embeds, **update)
+    assert torch.isfinite(next_decode_embeds).all()
+
+    # The stub backbone returns NaN from its second forward call onwards.
+    with pytest.raises(RuntimeError, match="Non-finite z_diff_cond before FlowLoss.sample"):
+        llm_model.forward(
+            decode_ids,
+            positions=torch.tensor([1], dtype=torch.long),
+            inputs_embeds=next_decode_embeds,
+            model_intermediate_buffer=[next_decode_info],
+            seq_token_counts=[1],
+        )
diff --git a/tests/worker/test_omni_gpu_model_runner.py b/tests/worker/test_omni_gpu_model_runner.py
index a74c9ffc2d2..738d4c6457b 100644
--- a/tests/worker/test_omni_gpu_model_runner.py
+++ b/tests/worker/test_omni_gpu_model_runner.py
@@ -300,6 +300,28 @@ def test_update_intermediate_buffer_skips_unknown_req_id():
     assert "unknown_req" not in runner.model_intermediate_buffer
 
 
+def test_update_additional_information_uses_legacy_additional_information():
+    """Both the new-request and cached-request ``additional_information`` end up in the buffer."""
+    runner = _make_runner(req_ids=("r1",), hidden_size=4)
+
+    # Request r1 appears both as a new request and in the cached-request map.
+    new_req = SimpleNamespace(
+        req_id="r1",
+        additional_information={"new_field": 1},
+    )
+    cached_reqs = SimpleNamespace(additional_information={"r1": {"cached_field": 3}})
+    scheduler_output = SimpleNamespace(
+        scheduled_new_reqs=[new_req],
+        scheduled_cached_reqs=cached_reqs,
+    )
+
+    OmniGPUModelRunner._update_additional_information(runner, scheduler_output)
+
+    merged = runner.model_intermediate_buffer["r1"]
+    assert merged["new_field"] == 1
+    assert merged["cached_field"] == 3
+
+
 def test_maybe_attach_mimo_audio_req_infos_enriches_dict():
     runner = _make_runner_for_mimo()
     req_id = "r_mimo"
diff --git a/vllm_omni/engine/arg_utils.py b/vllm_omni/engine/arg_utils.py
index 89139bf1b0b..783a146bc94 100644
--- a/vllm_omni/engine/arg_utils.py
+++ b/vllm_omni/engine/arg_utils.py
@@ -36,6 +36,9 @@ def _register_omni_hf_configs() -> None:
         from transformers import AutoConfig
 
         from vllm_omni.model_executor.models.cosyvoice3.config import CosyVoice3Config
+        from vllm_omni.model_executor.models.ming_tts.configuration_ming_dense import (
+            MingDenseConfig,
+        )
         from vllm_omni.model_executor.models.omnivoice.config import OmniVoiceConfig
         from vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts import (
             Qwen3TTSConfig,
@@ -58,6 +61,7 @@ def _register_omni_hf_configs() -> None:
         _CONFIG_REGISTRY = None
 
     for model_type, config_cls in [
+        ("dense", MingDenseConfig),
         ("qwen3_tts", Qwen3TTSConfig),
         ("cosyvoice3", CosyVoice3Config),
         ("omnivoice", OmniVoiceConfig),
diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py
index 61da4388be0..ceec612c499 100644
--- a/vllm_omni/engine/async_omni_engine.py
+++ b/vllm_omni/engine/async_omni_engine.py
@@ -220,6 +220,27 @@ def _apply_omni_final_stage_metadata(
     )
 
 
+def _consume_processed_prompt(
+    input_processor: InputProcessor | None,
+    fallback_prompt: Any,
+) -> Any:
+    """Return the prompt recorded by the input preprocessor, or ``fallback_prompt``.
+
+    The fallback wins when no ``consume_last_processed_prompt`` hook is reachable, when the hook
+    yields ``None``, or when a non-``None`` fallback's type does not match the hook's result.
+    """
+    # getattr with a default tolerates a missing processor or preprocessor: both collapse to None.
+    preprocessor = getattr(input_processor, "input_preprocessor", None)
+    consume = getattr(preprocessor, "consume_last_processed_prompt", None)
+    if consume is None:
+        return fallback_prompt
+    recorded = consume()
+    if recorded is None:
+        return fallback_prompt
+    type_matches = fallback_prompt is None or isinstance(recorded, type(fallback_prompt))
+    return recorded if type_matches else fallback_prompt
+
+
 def _weak_shutdown_async_omni_engine(
     orchestrator_thread: threading.Thread | None,
     request_queue: janus.Queue[dict[str, Any]] | None,
@@ -658,10 +679,19 @@ def _attach_llm_stage(
                 # Use omni preprocessor so text-only prompts with
                 # mm_processor_kwargs (e.g. GLM-Image t2i target_h/target_w)
                 # still go through multimodal processor path.
-                input_processor.input_preprocessor = OmniInputPreprocessor(
+                omni_preprocessor = OmniInputPreprocessor(
                     vllm_config=started.vllm_config,
                     renderer=input_processor.renderer,
                 )
+                ingress_processor_factory = getattr(started.metadata, "initial_prompt_processor_factory", None)
+                if ingress_processor_factory is not None:
+                    omni_preprocessor.set_initial_prompt_processor(
+                        ingress_processor_factory(
+                            vllm_config=started.vllm_config,
+                            tokenizer=tokenizer,
+                        )
+                    )
+                input_processor.input_preprocessor = omni_preprocessor
         except Exception:
             try:
                 stage_client.shutdown()
@@ -1051,9 +1081,10 @@ def _build_add_request_message(
                 data_parallel_rank=data_parallel_rank,
                 resumable=resumable,
             )
+            processed_prompt = _consume_processed_prompt(self.input_processor, prompt)
             # TODO (Peiqi): add this for Qwen3-TTS only. Other models don't have
             # additional_information field in the prompt.
-            request = _upgrade_to_omni_request(request, prompt)
+            request = _upgrade_to_omni_request(request, processed_prompt)
 
             if reasoning_ended is not None:
                 request.reasoning_ended = reasoning_ended
@@ -1121,11 +1152,13 @@ def _enqueue_cfg_companions(
                 params=companion_params,
                 supported_tasks=self.supported_tasks,
             )
+            processed_prompt = _consume_processed_prompt(self.input_processor, companion_prompt)
+            request = _upgrade_to_omni_request(request, processed_prompt)
             request.external_req_id = cid
 
             self.output_processors[0].add_request(
                 request=request,
-                prompt=companion_prompt,
+                prompt=processed_prompt,
                 parent_req=None,
                 request_index=0,
                 queue=None,
diff --git a/vllm_omni/engine/stage_init_utils.py b/vllm_omni/engine/stage_init_utils.py
index cc7676ba5d4..c5d6f01fa85 100644
--- a/vllm_omni/engine/stage_init_utils.py
+++ b/vllm_omni/engine/stage_init_utils.py
@@ -255,6 +255,7 @@ class StageMetadata:
     final_output_type: str | None
     default_sampling_params: OmniSamplingParams
     custom_process_input_func: Callable | None
+    initial_prompt_processor_factory: Callable | None
     model_stage: str | None
     runtime_cfg: Any
     prompt_expand_func: Callable | None = None
@@ -309,6 +310,11 @@ def extract_stage_metadata(stage_config: Any) -> StageMetadata:
         mod_path, fn_name = _cpif_path.rsplit(".", 1)
         custom_process_input_func = getattr(importlib.import_module(mod_path), fn_name)
 
+    initial_prompt_processor_factory: Callable | None = None
+    if hasattr(stage_config, "initial_prompt_processor"):
+        mod_path, fn_name = stage_config.initial_prompt_processor.rsplit(".", 1)
+        initial_prompt_processor_factory = getattr(importlib.import_module(mod_path), fn_name)
+
     prompt_expand_func: Callable | None = None
     _pef_path = getattr(stage_config, "prompt_expand_func", None)
     if _pef_path:
@@ -333,6 +339,7 @@ def extract_stage_metadata(stage_config: Any) -> StageMetadata:
             final_output_type=final_output_type,
             default_sampling_params=default_sampling_params,
             custom_process_input_func=custom_process_input_func,
+            initial_prompt_processor_factory=initial_prompt_processor_factory,
             model_stage=None,
             runtime_cfg=runtime_cfg,
             cfg_kv_collect_func=cfg_kv_collect_func,
@@ -354,6 +361,7 @@ def extract_stage_metadata(stage_config: Any) -> StageMetadata:
         final_output_type=final_output_type,
         default_sampling_params=default_sampling_params,
         custom_process_input_func=custom_process_input_func,
+        initial_prompt_processor_factory=initial_prompt_processor_factory,
         model_stage=model_stage,
         runtime_cfg=runtime_cfg,
         prompt_expand_func=prompt_expand_func,
diff --git a/vllm_omni/entrypoints/openai/protocol/audio.py b/vllm_omni/entrypoints/openai/protocol/audio.py
index 59b5777a874..b5b5e13b915 100644
--- a/vllm_omni/entrypoints/openai/protocol/audio.py
+++ b/vllm_omni/entrypoints/openai/protocol/audio.py
@@ -7,6 +7,43 @@
 _MAX_EMBEDDING_DIM = 8192
 
 
+def _normalize_ref_audio_value(value):
+    """Normalize ``ref_audio`` to ``None``, a string, or a non-empty list of strings.
+
+    Every rejection is a ``ValueError``: pydantic v2 only converts ``ValueError`` /
+    ``AssertionError`` raised by validators into a ``ValidationError``.
+    """
+    if value is None or isinstance(value, str):
+        return value
+    if not isinstance(value, (list, tuple)):
+        raise ValueError("'ref_audio' must be a string or list of strings")
+    if not all(isinstance(item, str) for item in value):
+        raise ValueError("'ref_audio' list entries must be strings")
+    if not value:
+        raise ValueError("'ref_audio' list cannot be empty")
+    return list(value)
+
+
+def _normalize_speaker_embedding_value(value):
+    """Return ``value`` as a flat ``list[float]`` or a nested ``list[list[float]]``.
+
+    ``None`` and empty input pass through as ``None`` / ``[]``. Shape errors are
+    raised as ``ValueError`` so pydantic v2 reports them as validation errors.
+    """
+    if value is None:
+        return None
+    if not isinstance(value, (list, tuple)):
+        raise ValueError("'speaker_embedding' must be a list of numbers or list of embedding vectors")
+    if not value:
+        return []
+    nested = [isinstance(item, (list, tuple)) for item in value]
+    if any(nested) and not all(nested):
+        raise ValueError("'speaker_embedding' must not mix flat and nested values")
+    if nested[0]:
+        return [[float(x) for x in item] for item in value]
+    return [float(x) for x in value]
+
+
 class OpenAICreateSpeechRequest(BaseModel):
     input: str
     model: str | None = None
@@ -46,7 +83,7 @@ class OpenAICreateSpeechRequest(BaseModel):
         default=None,
         description="Language code (e.g., 'Chinese', 'English', 'Auto')",
     )
-    ref_audio: str | None = Field(
+    ref_audio: str | list[str] | None = Field(
         default=None,
         description="Reference audio for voice cloning (Base task). URL, base64, or file URI.",
     )
@@ -58,7 +95,7 @@ class OpenAICreateSpeechRequest(BaseModel):
         default=None,
         description="Use speaker embedding only without in-context learning (Base task)",
     )
-    speaker_embedding: list[float] | None = Field(
+    speaker_embedding: list[float] | list[list[float]] | None = Field(
         default=None,
         max_length=_MAX_EMBEDDING_DIM,
         description="Pre-computed speaker embedding vector (1024-dim for 0.6B, "
@@ -86,17 +123,36 @@ def validate_stream_format(cls, v: str) -> str:
             raise ValueError("'sse' is not a supported stream_format yet. Please use 'audio'.")
         return v
 
+    @field_validator("ref_audio", mode="before")
+    @classmethod
+    def normalize_ref_audio(cls, v):
+        return _normalize_ref_audio_value(v)  # mode="before": sees the raw input, ahead of pydantic's type checks
+
     @field_validator("speaker_embedding")
     @classmethod
-    def validate_speaker_embedding(cls, v: list[float] | None) -> list[float] | None:
-        if v is not None and not all(math.isfinite(x) for x in v):
+    def validate_speaker_embedding(
+        cls, v: list[float] | list[list[float]] | None
+    ) -> list[float] | list[list[float]] | None:
+        v = _normalize_speaker_embedding_value(v)
+        if v is None:
+            return None
+        if not v:
+            return []
+        if isinstance(v[0], list):
+            for item in v:
+                if not item:
+                    raise ValueError("'speaker_embedding' nested vectors must be non-empty")
+                if not all(math.isfinite(x) for x in item):
+                    raise ValueError("'speaker_embedding' values must be finite (no NaN or Inf)")
+            return v
+        if not all(math.isfinite(x) for x in v):
             raise ValueError("'speaker_embedding' values must be finite (no NaN or Inf)")
         return v
 
     @model_validator(mode="after")
     def validate_embedding_constraints(self) -> "OpenAICreateSpeechRequest":
         if self.speaker_embedding is not None:
-            if self.ref_audio is not None:
+            if self.ref_audio is not None and not isinstance(self.ref_audio, list):
                 raise ValueError("'speaker_embedding' and 'ref_audio' are mutually exclusive")
         return self
 
diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py
index 4946368e904..93322ce4120 100644
--- a/vllm_omni/entrypoints/openai/serving_speech.py
+++ b/vllm_omni/entrypoints/openai/serving_speech.py
@@ -56,6 +56,7 @@
 logger = init_logger(__name__)
 
 # TTS Configuration
+_MING_TTS_MODEL_ARCHS = {"MingTTSForConditionalGeneration"}
 _VOXTRAL_TTS_MODEL_STAGES = {"audio_generation"}
 _QWEN3_TTS_MODEL_STAGES = {"qwen3_tts"}
 _FISH_TTS_MODEL_STAGES = {"fish_speech_slow_ar"}
@@ -92,6 +93,7 @@
 _TTS_MAX_INSTRUCTIONS_LENGTH = 500
 _TTS_MAX_NEW_TOKENS_MIN = 1
 _TTS_MAX_NEW_TOKENS_MAX = 4096
+_MING_DEFAULT_PROMPT = "Please generate speech based on the following description.\n"
 
 
 def _create_wav_header(sample_rate: int, num_channels: int = 1, bits_per_sample: int = 16) -> bytes:
@@ -291,7 +293,13 @@ def shutdown(self) -> None:
     def _find_tts_stage(self):
         """Find and return the TTS stage config, or None if not found."""
         for stage in self.engine_client.stage_configs:
-            if stage.engine_args.model_stage in _TTS_MODEL_STAGES:
+            engine_args = getattr(stage, "engine_args", None)
+            model_stage = getattr(engine_args, "model_stage", None)
+            model_arch = getattr(engine_args, "model_arch", None)
+            worker_type = getattr(engine_args, "worker_type", None)
+            if model_stage in _TTS_MODEL_STAGES:
+                return stage
+            if model_arch in _MING_TTS_MODEL_ARCHS and worker_type == "ar":
                 return stage
         return None
 
@@ -323,6 +331,8 @@ def _detect_tts_model_type(self) -> str | None:
             return "voxcpm" if has_vae_stage or model_stage == "vae" else "voxcpm2"
         if model_stage in _MING_TTS_MODEL_STAGES:
             return "ming_flash_omni_tts"
+        if model_arch in _MING_TTS_MODEL_ARCHS:
+            return "ming_tts"
         return None
 
     def _compute_max_instructions_length(self) -> int:
@@ -354,6 +364,8 @@ def _load_supported_speakers(self) -> set[str]:
         try:
             if self._tts_model_type == "voxcpm":
                 return set()
+            if self._tts_model_type == "ming_tts":
+                return set()
             if self._tts_model_type == "voxtral_tts":
                 config = self.engine_client.model_config.hf_config.audio_config
             else:
@@ -751,11 +763,15 @@ async def upload_voice_embedding(self, embedding_json: str, consent: str, name:
             raise ValueError("'speaker_embedding' values must be finite (no NaN or Inf)")
 
         emb_dim = len(embedding)
-        if emb_dim not in {1024, 2048}:
-            logger.warning(
-                "speaker_embedding has %d dimensions; expected 1024 (0.6B) or 2048 (1.7B)",
-                emb_dim,
-            )
+        expected_dims = {192} if self._tts_model_type == "ming_tts" else {1024, 2048}
+        if emb_dim not in expected_dims:
+            if self._tts_model_type == "ming_tts":
+                logger.warning("speaker_embedding has %d dimensions; Ming dense expects 192", emb_dim)
+            else:
+                logger.warning(
+                    "speaker_embedding has %d dimensions; expected 1024 (0.6B) or 2048 (1.7B)",
+                    emb_dim,
+                )
 
         voice_name_lower = name.lower()
         if voice_name_lower in self.uploaded_speakers:
@@ -838,7 +854,7 @@ async def delete_voice(self, name: str) -> bool:
 
     def _is_tts_model(self) -> bool:
         """Check if the current model is a supported TTS model."""
-        return any(stage.engine_args.model_stage in _TTS_MODEL_STAGES for stage in self.engine_client.stage_configs)
+        return self._find_tts_stage() is not None
 
     def _validate_tts_request(self, request: OpenAICreateSpeechRequest) -> str | None:
         """Validate TTS request parameters. Returns error message or None."""
@@ -853,6 +869,8 @@ def _validate_tts_request(self, request: OpenAICreateSpeechRequest) -> str | Non
         if self._tts_model_type == "voxcpm2":
             return None  # VoxCPM2 accepts any text input
         if self._tts_model_type == "ming_flash_omni_tts":
+            return self._validate_ming_flash_omni_tts_request(request)
+        if self._tts_model_type == "ming_tts":
             return self._validate_ming_tts_request(request)
         return self._validate_qwen_tts_request(request)
 
@@ -875,7 +893,7 @@ def _voxcpm2_encode(self, text: str) -> list[int]:
         ids = self._voxcpm2_tokenizer.encode(text, add_special_tokens=True)
         return split_multichar_chinese(ids, self._voxcpm2_split_map)
 
-    def _validate_ming_tts_request(self, request: OpenAICreateSpeechRequest) -> str | None:
+    def _validate_ming_flash_omni_tts_request(self, request: OpenAICreateSpeechRequest) -> str | None:
         """Validate Ming-flash-omni standalone-talker request parameters."""
         if not request.input or not request.input.strip():
             return "Input text cannot be empty"
@@ -912,6 +930,8 @@ def _validate_ming_tts_request(self, request: OpenAICreateSpeechRequest) -> str
 
     def _validate_ref_audio_format(self, ref_audio: str) -> str | None:
         """Validate ref_audio is a supported URI format. Returns error or None."""
+        if not isinstance(ref_audio, str):
+            return "ref_audio must be a URL (http/https), base64 data URL (data:...), or file URI (file://...)"
         if not (
             ref_audio.startswith(("http://", "https://"))
             or ref_audio.startswith("data:")
@@ -1175,6 +1195,99 @@ def _validate_cosyvoice3_request(self, request: OpenAICreateSpeechRequest) -> st
 
         return None
 
+    def _validate_ming_tts_request(self, request: OpenAICreateSpeechRequest) -> str | None:
+        """Check a Ming TTS request; return an error message, or None when it is acceptable."""
+        if not (request.input and request.input.strip()):
+            return "Input text cannot be empty"
+        # A list-valued ref_audio selects the podcast rules; anything else is single-speaker.
+        if not isinstance(request.ref_audio, list):
+            return self._validate_ming_tts_single_speaker_request(request)
+        return self._validate_ming_tts_podcast_request(request)
+
+    def _validate_ming_tts_single_speaker_request(self, request: OpenAICreateSpeechRequest) -> str | None:
+        """Validate a Ming TTS request whose ref_audio is not a list; return an error string or None."""
+        ref_audio, ref_text, embedding = request.ref_audio, request.ref_text, request.speaker_embedding
+        has_ref_audio = ref_audio is not None
+        if has_ref_audio:
+            fmt_err = self._validate_ref_audio_format(ref_audio)
+            if fmt_err:
+                return fmt_err
+
+        if embedding is not None:
+            if not embedding:
+                return "'speaker_embedding' must be a non-empty list of floats"
+            if len(embedding) != 192:
+                logger.warning(
+                    "speaker_embedding has %d dimensions; Ming dense expects 192. "
+                    "Wrong dimensions will likely fail or degrade output.",
+                    len(embedding),
+                )
+
+        voice_lower = request.voice.lower() if isinstance(request.voice, str) else None
+        uploaded_voice = bool(voice_lower and voice_lower in self.uploaded_speakers)
+
+        if request.task_type == "Base" and not (has_ref_audio or embedding is not None or uploaded_voice):
+            return "Base task requires 'ref_audio', 'speaker_embedding', or an uploaded voice sample"
+
+        if has_ref_audio and ref_text is not None and not ref_text.strip():
+            return "'ref_text' must be non-empty when provided with 'ref_audio'"
+
+        # Ming offline ref-audio cases use prompt_waveform without prompt_text;
+        # keep the transcript requirement for other TTS models.
+        if has_ref_audio and embedding is None and not self._is_ming_tts_model():
+            uploaded_ref_text = self.uploaded_speakers[voice_lower].get("ref_text") if uploaded_voice else None
+            if not ((ref_text and ref_text.strip()) or uploaded_ref_text):
+                return "Reference-audio cloning requires non-empty 'ref_text'"
+
+        if ref_text is not None and not has_ref_audio and not uploaded_voice:
+            return "'ref_text' requires 'ref_audio' or an uploaded voice sample"
+
+        if request.instructions and len(request.instructions) > self._max_instructions_length:
+            return f"Instructions too long (max {self._max_instructions_length} characters)"
+
+        max_new = request.max_new_tokens
+        if max_new is not None and max_new < _TTS_MAX_NEW_TOKENS_MIN:
+            return f"max_new_tokens must be at least {_TTS_MAX_NEW_TOKENS_MIN}"
+        if max_new is not None and max_new > _TTS_MAX_NEW_TOKENS_MAX:
+            return f"max_new_tokens cannot exceed {_TTS_MAX_NEW_TOKENS_MAX}"
+        return None
+
+    def _validate_ming_tts_podcast_request(self, request: OpenAICreateSpeechRequest) -> str | None:
+        """Validate a Ming TTS request carrying a list of ref_audio clips; return an error string or None."""
+        clips = request.ref_audio
+        if len(clips) < 2:
+            return "Podcast-style Ming requests require at least two 'ref_audio' clips"
+
+        fmt_err = next(filter(None, map(self._validate_ref_audio_format, clips)), None)
+        if fmt_err:
+            return fmt_err
+
+        if not (request.ref_text and request.ref_text.strip()):
+            return "Podcast-style Ming requests require non-empty 'ref_text'"
+
+        embeddings = request.speaker_embedding
+        if embeddings is not None:
+            nested = bool(embeddings) and isinstance(embeddings[0], list)
+            # A flat (or empty) embedding counts as a single vector.
+            embedding_count = len(embeddings) if nested else 1
+            if embedding_count != len(clips):
+                return (
+                    "Podcast-style Ming requests require one speaker embedding per ref_audio clip; "
+                    f"got {embedding_count} embeddings for {len(clips)} clips"
+                )
+            if nested and any(len(item) != 192 for item in embeddings):
+                return "Podcast-style Ming speaker embeddings must each have 192 dimensions"
+
+        if request.instructions and len(request.instructions) > self._max_instructions_length:
+            return f"Instructions too long (max {self._max_instructions_length} characters)"
+
+        max_new = request.max_new_tokens
+        if max_new is not None and max_new < _TTS_MAX_NEW_TOKENS_MIN:
+            return f"max_new_tokens must be at least {_TTS_MAX_NEW_TOKENS_MIN}"
+        if max_new is not None and max_new > _TTS_MAX_NEW_TOKENS_MAX:
+            return f"max_new_tokens cannot exceed {_TTS_MAX_NEW_TOKENS_MAX}"
+        return None
+
     async def _resolve_ref_audio(self, ref_audio_str: str) -> tuple[list[float], int]:
         """Resolve ref_audio to (wav_samples, sample_rate).
 
@@ -1209,13 +1322,123 @@ async def _resolve_ref_audio(self, ref_audio_str: str) -> tuple[list[float], int
             )
         return wav_np.tolist(), sr
 
-    async def _generate_audio_chunks(
+    async def _resolve_ref_audio_many(self, ref_audio_list: list[str]) -> list[tuple[list[float], int]]:
+        """Resolve every reference clip via ``_resolve_ref_audio`` and return the results in input order."""
+        # The clips are awaited one at a time rather than concurrently.
+        resolved = [await self._resolve_ref_audio(ref_audio) for ref_audio in ref_audio_list]
+        return resolved
+
+    # ---- Ming TTS helpers ----
+
+    def _is_ming_tts_model(self) -> bool:
+        return self._tts_model_type == "ming_tts"  # dense Ming only; "ming_flash_omni_tts" is a separate type
+
+    def _coerce_ming_prompt_waveform(self, wav_samples, sample_rate):
+        """Return the samples as a one-row float32 tensor, resampled to SAMPLE_RATE if the rates differ."""
+        from torchaudio.functional import resample as resample_audio
+
+        from vllm_omni.model_executor.models.ming_tts.config_ming_tts import SAMPLE_RATE
+
+        waveform = torch.as_tensor(wav_samples, dtype=torch.float32).reshape(1, -1)
+        source_rate = int(sample_rate)
+        return waveform if source_rate == SAMPLE_RATE else resample_audio(waveform, source_rate, SAMPLE_RATE)
+
+    def _build_ming_prompt_waveform(
         self,
-        generator,
-        request_id: str,
-        response_format: str = "pcm",
-        raw_request: Request | None = None,
+        ref_audio_data: tuple[list[float], int] | list[tuple[list[float], int]] | None,
     ):
+        if isinstance(ref_audio_data, list):
+            return torch.cat(
+                [self._coerce_ming_prompt_waveform(item[0], item[1]) for item in ref_audio_data],
+                dim=-1,
+            )
+        if ref_audio_data is not None:
+            return self._coerce_ming_prompt_waveform(ref_audio_data[0], ref_audio_data[1])
+        return None
+
+    def _extract_ming_speaker_embeddings_from_ref_audio(
+        self,
+        ref_audio_data_list: list[tuple[list[float], int]],
+    ) -> list[list[float]]:
+        """Embed each (samples, sample_rate) clip with a freshly built Ming speaker extractor."""
+        from vllm_omni.model_executor.models.ming_tts.speaker_extractor import MingSpeakerEmbeddingExtractor
+
+        extractor = MingSpeakerEmbeddingExtractor(self.engine_client.model_config.model, target_sr=16000)
+        vectors = []
+        for samples, rate in ref_audio_data_list:
+            waveform = torch.as_tensor(samples, dtype=torch.float32).reshape(1, -1)
+            flat = extractor.extract_from_waveform(waveform, int(rate)).detach().reshape(-1).to(torch.float32).cpu()
+            if int(flat.numel()) != 192:
+                raise ValueError(f"Ming speaker extractor returned {int(flat.numel())} dims; expected 192")
+            vectors.append(flat.tolist())
+        return vectors
+
+    def _parse_ming_instruction(self, request: OpenAICreateSpeechRequest) -> Any:
+        """Translate OpenAI speech fields into a Ming instruction: a dict, a plain string, or None."""
+        fields: dict[str, Any] = {}
+        if request.language not in (None, "", "Auto"):
+            fields["方言"] = request.language
+
+        # A voice that names an uploaded speaker is not copied into the "IP" field.
+        voice_key = request.voice.lower() if isinstance(request.voice, str) else None
+        if request.voice and not (voice_key and voice_key in self.uploaded_speakers):
+            fields["IP"] = request.voice
+
+        text = request.instructions.strip() if isinstance(request.instructions, str) else None
+        if not text:
+            return fields or None
+        try:
+            parsed = json.loads(text)
+        except json.JSONDecodeError:
+            parsed = None
+        if isinstance(parsed, dict):
+            fields.update(parsed)
+        elif not fields:
+            return text
+        else:
+            fields["风格"] = text
+        return fields or None
+
+    def _build_ming_dense_prompt(
+        self,
+        request: OpenAICreateSpeechRequest,
+        *,
+        ref_audio_data: tuple[list[float], int] | list[tuple[list[float], int]] | None = None,
+    ) -> dict[str, Any]:
+        """Build a Ming dense prompt directly from the OpenAI speech request."""
+        from transformers import AutoTokenizer
+
+        from vllm_omni.model_executor.models.ming_tts.config_ming_tts import KEY_MAX_DECODE_STEPS
+        from vllm_omni.model_executor.models.ming_tts.prompt_builder import build_ming_dense_prompt
+
+        if self._tts_tokenizer is None:
+            model_name = self.engine_client.model_config.model
+            self._tts_tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=False)
+        # The tokenizer above is loaded once and cached; the waveform below is built only when ref_text is set.
+        ref_text = request.ref_text
+        prompt_waveform = self._build_ming_prompt_waveform(ref_audio_data) if ref_text is not None else None
+        speaker_embedding = request.speaker_embedding
+        use_zero_spk_emb = prompt_waveform is None and speaker_embedding is None
+        # max_new_tokens, when given, is forwarded under KEY_MAX_DECODE_STEPS.
+        runtime_controls = {}
+        if request.max_new_tokens is not None:
+            runtime_controls[KEY_MAX_DECODE_STEPS] = request.max_new_tokens
+
+        return build_ming_dense_prompt(
+            self._tts_tokenizer,
+            # bgm / music-prompt mode not supported online;
+            # requires prompt_mode API extension (deferred).
+            prompt=_MING_DEFAULT_PROMPT,
+            text=request.input,
+            runtime_controls=runtime_controls or None,
+            instruction=self._parse_ming_instruction(request),
+            prompt_text=ref_text,
+            prompt_waveform=prompt_waveform,
+            speaker_embedding=speaker_embedding,
+            use_zero_spk_emb=use_zero_spk_emb,
+        )
+
+    async def _generate_audio_chunks(self, generator, request_id: str, response_format: str = "pcm"):
         """Generate audio chunks for streaming response.
 
         Handles two audio output modes from the engine:
@@ -1574,7 +1797,7 @@ async def _build_cosyvoice3_prompt(
 
     # ---- Ming-flash-omni standalone-talker (TTS) helpers ----
 
-    def _build_ming_prompt(self, request: OpenAICreateSpeechRequest) -> dict[str, Any]:
+    def _build_ming_flash_omni_prompt(self, request: OpenAICreateSpeechRequest) -> dict[str, Any]:
         # request.instructions accepts two forms:
         # 1. Plain text: mapped to the caption's 风格 (style) field
         # 2. JSON object: parsed and splatted into the caption. Unlocks
@@ -1681,8 +1904,29 @@ async def _prepare_speech_generation(
                 prompt = await self._build_cosyvoice3_prompt(request)
                 tts_params = {}
             elif self._tts_model_type == "ming_flash_omni_tts":
-                prompt = self._build_ming_prompt(request)
+                prompt = self._build_ming_flash_omni_prompt(request)
                 tts_params = {}
+            elif self._tts_model_type == "ming_tts":
+                ref_audio_source = request.ref_audio
+                voice_lower = request.voice.lower() if isinstance(request.voice, str) else None
+                if ref_audio_source is None and voice_lower in self.uploaded_speakers:
+                    ref_audio_source = self._get_uploaded_audio_data(request.voice)
+                    if request.ref_text is None:
+                        request.ref_text = self.uploaded_speakers[voice_lower].get("ref_text")
+                ref_audio_data = None
+                if isinstance(ref_audio_source, list):
+                    ref_audio_data = await self._resolve_ref_audio_many(ref_audio_source)
+                    if request.speaker_embedding is None:
+                        request.speaker_embedding = self._extract_ming_speaker_embeddings_from_ref_audio(ref_audio_data)
+                elif ref_audio_source is not None and isinstance(ref_audio_source, str):
+                    wav_list, sr = await self._resolve_ref_audio(ref_audio_source)
+                    ref_audio_data = (wav_list, sr)
+                    if request.speaker_embedding is None:
+                        request.speaker_embedding = self._extract_ming_speaker_embeddings_from_ref_audio(
+                            [ref_audio_data]
+                        )[0]
+                prompt = self._build_ming_dense_prompt(request, ref_audio_data=ref_audio_data)
+                tts_params = prompt.get("additional_information", {})
             else:
                 tts_params = self._build_tts_params(request)
                 # Resolve ref_audio (explicit or auto-set for uploaded voices)
@@ -1732,6 +1976,8 @@ async def _prepare_speech_generation(
             model_type = "voxcpm2"
         elif self._tts_model_type == "ming_flash_omni_tts":
             model_type = "ming_flash_omni_tts"
+        elif self._tts_model_type == "ming_tts":
+            model_type = "ming_tts"
         elif self._is_tts:
             model_type = tts_params.get("task_type", ["unknown"])[0]
         else:
@@ -1790,6 +2036,17 @@ async def _prepare_speech_generation(
 
             sampling_params_list = copy.deepcopy(sampling_params_list)
             sampling_params_list[0].max_tokens = request.max_new_tokens
+        elif self._tts_model_type == "ming_tts" and sampling_params_list:
+            import copy
+
+            from vllm_omni.model_executor.models.ming_tts.config_ming_tts import TEXT_EOS_TOKEN_ID
+
+            sampling_params_list = copy.deepcopy(sampling_params_list)
+            sampling_params_list[0].stop_token_ids = [int(TEXT_EOS_TOKEN_ID)]
+            if request.max_new_tokens is not None:
+                # Ming emits TEXT_EOS after the latent decode budget is exhausted, so
+                # Stage-0 needs one extra token beyond ming_max_decode_steps.
+                sampling_params_list[0].max_tokens = int(request.max_new_tokens) + 1
 
         generator = self.engine_client.generate(
             prompt=prompt,
diff --git a/vllm_omni/inputs/preprocess.py b/vllm_omni/inputs/preprocess.py
index cca6ce56870..a5bba6cb4df 100644
--- a/vllm_omni/inputs/preprocess.py
+++ b/vllm_omni/inputs/preprocess.py
@@ -25,6 +25,30 @@ class OmniInputPreprocessor(InputPreprocessor):
     Supports processing tokens, embeddings, text, and multimodal inputs.
     """
 
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.initial_prompt_processor = None  # optional callable; installed via set_initial_prompt_processor
+        self._last_processed_prompt = None  # prompt recorded by the latest _apply_initial_prompt_processor call
+
+    def set_initial_prompt_processor(self, processor: Any) -> None:
+        self.initial_prompt_processor = processor  # invoked on dict prompts by _apply_initial_prompt_processor
+
+    def consume_last_processed_prompt(self) -> Any:
+        """Return the most recently recorded prompt and clear it, so an immediate second call yields None."""
+        prompt, self._last_processed_prompt = self._last_processed_prompt, None
+        return prompt
+
+    def _apply_initial_prompt_processor(self, prompt: SingletonDictPrompt) -> SingletonDictPrompt:
+        """Run the optional initial prompt processor on dict prompts and remember the prompt that results."""
+        self._last_processed_prompt = prompt
+        if self.initial_prompt_processor is None or not isinstance(prompt, dict):
+            return prompt
+        rewritten = self.initial_prompt_processor(prompt)
+        if not isinstance(rewritten, dict):
+            raise TypeError(f"Initial prompt processor must return a prompt dict, got {type(rewritten).__name__}")
+        self._last_processed_prompt = rewritten
+        return rewritten
+
     def _process_text(
         self,
         parsed_content: OmniTextPrompt,
@@ -164,6 +188,8 @@ def _prompt_to_llm_inputs(
 
         * [`SingletonInput`][vllm.inputs.engine.SingletonInput] instance
         """
+        prompt = self._apply_initial_prompt_processor(prompt)
+
         if "prompt_embeds" in prompt:
             return self._process_embeds(prompt)  # type: ignore[arg-type]
 
diff --git a/vllm_omni/model_executor/models/ming_tts/__init__.py b/vllm_omni/model_executor/models/ming_tts/__init__.py
new file mode 100644
index 00000000000..5c945c5410e
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_tts/__init__.py
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from .configuration_ming_dense import MingDenseConfig
+from .ming_tts import MingTTSForConditionalGeneration
+from .ming_tts_audio_vae import MingAudioVAEModel
+from .ming_tts_llm import MingLLMModel
+
+__all__ = [
+    "MingDenseConfig",
+    "MingTTSForConditionalGeneration",
+    "MingLLMModel",
+    "MingAudioVAEModel",
+]
diff --git a/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/__init__.py b/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/__init__.py
new file mode 100644
index 00000000000..208f01a7cb5
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/__init__.py
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
diff --git a/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/audio_encoder.py b/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/audio_encoder.py
new file mode 100644
index 00000000000..d5d11121791
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/audio_encoder.py
@@ -0,0 +1,135 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adopted from https://github.com/inclusionAI/Ming-omni-tts/blob/main/audio_tokenizer/audio_encoder.py
+
+from collections.abc import Iterable
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+from torchtune.modules import RotaryPositionalEmbeddings
+from transformers.cache_utils import DynamicCache
+
+try:
+    from flash_attn import flash_attn_func
+
+    _FLASH_ATTN_AVAILABLE = True
+except (ImportError, RuntimeError, OSError):
+    _FLASH_ATTN_AVAILABLE = False
+    flash_attn_func = None  # placeholder; qkv_attention raises ImportError before this can be called
+
+
+class LayerNorm(nn.LayerNorm):  # LayerNorm that computes in float32 and returns the caller's dtype
+    def forward(self, x: Tensor) -> Tensor:
+        return super().forward(x.float()).type(x.dtype)  # upcast, normalize, cast back to x.dtype
+
+
+class Linear(nn.Linear):  # Linear layer whose parameters are cast to the input dtype on every call
+    def forward(self, x: Tensor) -> Tensor:
+        return F.linear(
+            x,
+            self.weight.to(x.dtype),
+            None if self.bias is None else self.bias.to(x.dtype),  # the layer may be built with bias=False
+        )
+
+
+class MultiHeadAttention(nn.Module):  # causal flash-attention with rotary embeddings and an optional KV cache
+    def __init__(self, n_state: int, n_head: int, layer_idx: int):
+        super().__init__()
+        self.n_head = n_head
+        self.query = Linear(n_state, n_state)
+        self.key = Linear(n_state, n_state, bias=False)
+        self.value = Linear(n_state, n_state)
+        self.out = Linear(n_state, n_state)
+        self.layer_idx = layer_idx  # index passed to the cache object for length lookup and update
+        self.rotary_embed = RotaryPositionalEmbeddings(dim=n_state // n_head)
+
+    def forward(self, x: Tensor, past_key_values=None):  # returns (projected output, qk, past_key_values)
+        q = self.query(x)
+        k = self.key(x)
+        v = self.value(x)
+
+        wv, qk, past_key_values = self.qkv_attention(q, k, v, past_key_values=past_key_values)
+        return self.out(wv), qk, past_key_values  # qk is always None in this implementation
+
+    def qkv_attention(self, q: Tensor, k: Tensor, v: Tensor, past_key_values=None):
+        if not _FLASH_ATTN_AVAILABLE:
+            raise ImportError("flash_attn is required for Ming semantic audio encoder attention.")
+        q = q.view(*q.shape[:2], self.n_head, -1)  # [B, T, nhead, dm]
+        k = k.view(*k.shape[:2], self.n_head, -1)  # [B, T, nhead, dm]
+        v = v.view(*v.shape[:2], self.n_head, -1)  # [B, T, nhead, dm]
+
+        if past_key_values is not None:
+            past_seen_tokens = past_key_values.get_seq_length(self.layer_idx)  # tokens already in the cache
+            cache_position = torch.arange(past_seen_tokens, past_seen_tokens + q.size(1), device=q.device)
+            cache_position = cache_position.unsqueeze(0)
+        else:
+            cache_position = None
+
+        q = self.rotary_embed(q, input_pos=cache_position)
+        k = self.rotary_embed(k, input_pos=cache_position)
+
+        q = q.permute(0, 2, 1, 3)  # -> [B, nhead, T, dm]
+        k = k.permute(0, 2, 1, 3)
+        v = v.permute(0, 2, 1, 3)
+
+        if past_key_values is not None:
+            k, v = past_key_values.update(k, v, self.layer_idx, {"cache_position": cache_position})
+
+        a = flash_attn_func(q.permute(0, 2, 1, 3), k.permute(0, 2, 1, 3), v.permute(0, 2, 1, 3), causal=True)
+        out = a.flatten(start_dim=2)  # merge the head and per-head dims back into one feature axis
+        qk = None  # attention weights are not produced on this path
+
+        return out, qk, past_key_values
+
+
+class ResidualAttentionBlock(nn.Module):  # pre-norm block: attention and 4x-wide GELU MLP, each with a residual
+    def __init__(self, n_state: int, n_head: int, layer_idx: int):
+        super().__init__()
+
+        self.attn = MultiHeadAttention(n_state, n_head, layer_idx)
+        self.attn_ln = LayerNorm(n_state)
+        n_mlp = n_state * 4  # hidden width of the feed-forward sub-layer
+        self.mlp = nn.Sequential(Linear(n_state, n_mlp), nn.GELU(), Linear(n_mlp, n_state))
+        self.mlp_ln = LayerNorm(n_state)
+        self.layer_idx = layer_idx
+
+    def forward(self, x: Tensor, past_key_values=None):  # returns (hidden states, past_key_values)
+        attn_out, _, past_key_values = self.attn(self.attn_ln(x), past_key_values=past_key_values)
+        x = x + attn_out  # residual around attention; the norm is applied to the attention input only
+        x = x + self.mlp(self.mlp_ln(x))  # residual around the MLP
+        return x, past_key_values
+
+
+class WhisperAudioEncoder(nn.Module):  # stack of ResidualAttentionBlocks followed by a final LayerNorm
+    def __init__(self, n_state: int, n_head: int, n_layer: int):
+        super().__init__()
+
+        self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList(
+            [ResidualAttentionBlock(n_state, n_head, layer_idx=i) for i in range(n_layer)]
+        )
+        self.ln_post = LayerNorm(n_state)
+
+    def forward(self, whisper_feats: Tensor, use_cache=False, past_key_values=None, **kwargs):
+        if past_key_values is None and use_cache:
+            past_key_values = DynamicCache()  # start a fresh cache when caching is requested without one
+
+        x = whisper_feats
+
+        for block in self.blocks:
+            x, past_key_values = block(x, past_key_values=past_key_values)  # every block gets the same cache
+
+        x = self.ln_post(x)
+
+        return x, past_key_values  # extra **kwargs are accepted but not used
+
+    @classmethod
+    def from_pretrained(cls, dims):  # builds the module from a dims mapping; no weights are loaded here
+        audio_encoder = cls(
+            dims["n_state"],
+            dims["n_head"],
+            dims["n_layer"],
+        )
+
+        audio_encoder.audio_emb_dim = dims["n_state"]  # embedding width exposed as an instance attribute
+        return audio_encoder
diff --git a/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/configuration_audio_vae.py b/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/configuration_audio_vae.py
new file mode 100644
index 00000000000..ce9c069c277
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/configuration_audio_vae.py
@@ -0,0 +1,40 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adopted from https://github.com/inclusionAI/Ming-omni-tts/blob/main/audio_tokenizer/configuration_audio_vae.py
+
+
+from transformers import PretrainedConfig
+
+
+class AudioVAEconfig(PretrainedConfig):  # config container for the audio VAE; stores each argument as an attribute
+    def __init__(
+        self,
+        sample_rate: int = 16000,
+        enc_kwargs: dict = None,  # None by default; AudioVAE reads "backbone", "input_dim", "latent_dim" from it
+        semantic_module_kwargs: dict = None,  # None disables the semantic (Whisper-style) module in AudioVAE
+        dec_kwargs: dict = None,  # None by default; AudioVAE reads "backbone", "output_dim", "latent_dim" from it
+        hifi_gan_disc_kwargs: dict = None,
+        spec_disc_kwargs: dict = None,
+        lambda_disc=1.0,
+        lambda_mel_loss=15,
+        lambda_adv=1.0,
+        lambda_feat_match_loss=1.0,
+        lambda_semantic=5.0,
+        init_method="normal",
+        patch_size=-1,  # -1 is the "no patching" sentinel checked by the encoder/decoder modules
+        **kwargs,
+    ):
+        self.sample_rate = sample_rate
+        self.enc_kwargs = enc_kwargs
+        self.semantic_module_kwargs = semantic_module_kwargs
+        self.dec_kwargs = dec_kwargs
+        self.hifi_gan_disc_kwargs = hifi_gan_disc_kwargs
+        self.spec_disc_kwargs = spec_disc_kwargs
+        self.lambda_disc = lambda_disc
+        self.lambda_mel_loss = lambda_mel_loss
+        self.lambda_adv = lambda_adv
+        self.lambda_feat_match_loss = lambda_feat_match_loss
+        self.lambda_semantic = lambda_semantic
+        self.init_method = init_method
+        self.patch_size = patch_size
+        super().__init__(**kwargs)  # remaining keyword arguments are handed to PretrainedConfig
diff --git a/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/istft.py b/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/istft.py
new file mode 100644
index 00000000000..c365381c87f
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/istft.py
@@ -0,0 +1,188 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adopted from https://github.com/inclusionAI/Ming-omni-tts/blob/main/audio_tokenizer/istft.py
+
+import torch
+import torch.nn as nn
+
+
+class ISTFT(nn.Module):
+    """
+    Custom implementation of ISTFT since torch.istft doesn't allow custom padding (other than `center=True`) with
+    windowing. This is because the NOLA (Nonzero Overlap Add) check fails at the edges.
+    See issue: https://github.com/pytorch/pytorch/issues/62323
+    Specifically, in the context of neural vocoding we are interested in "same" padding analogous to CNNs.
+    The NOLA constraint is met as we trim padded samples anyway.
+
+    Args:
+        n_fft (int): Size of Fourier transform.
+        hop_length (int): The distance between neighboring sliding window frames.
+        win_length (int): The size of window frame and STFT filter.
+        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
+    """
+
+    def __init__(self, n_fft: int, hop_length: int, win_length: int, padding: str = "same"):
+        super().__init__()
+        if padding not in ["center", "same"]:
+            raise ValueError("Padding must be 'center' or 'same'.")
+        self.padding = padding
+        self.n_fft = n_fft
+        self.hop_length = hop_length
+        self.win_length = win_length
+        window = torch.hann_window(win_length)
+        self.register_buffer("window", window)
+
+        self.audio_buffer = None  # not read by forward(); streaming state is passed in as arguments
+        self.window_buffer = None
+        self.buffer_len = self.win_length - self.hop_length  # samples shared by consecutive chunks
+
+    def __buffer_process(self, x, buffer, pad, last_chunk=False, streaming=False):
+        if streaming:
+            if buffer is None:
+                # first chunk: drop the leading padding
+                x = x[:, pad:]
+            if buffer is not None:
+                # next chunk: add the overlap tail carried over from the previous chunk
+                x[:, : self.buffer_len] += buffer
+            buffer = x[:, -self.buffer_len :]
+            if not last_chunk:
+                x = x[:, : -self.buffer_len]
+            else:
+                x = x[:, :-pad]
+        else:
+            x = x[:, pad:-pad]
+
+        return x, buffer
+
+    def forward(self, spec: torch.Tensor, audio_buffer=None, window_buffer=None, streaming=False, last_chunk=False):
+        """
+        Compute the Inverse Short Time Fourier Transform (ISTFT) of a complex spectrogram.
+
+        Args:
+            spec (Tensor): Input complex spectrogram of shape (B, N, T), where B is the batch size,
+                            N is the number of frequency bins, and T is the number of time frames.
+            audio_buffer (Tensor): [Streaming Input/State] The audio overlap buffer from the previous chunk.
+                            Shape: (B, win_length - hop_length)
+            window_buffer (Tensor): [Streaming Input/State] The window overlap buffer from the previous chunk.
+            streaming: If `True`, the function operates in streaming mode, processing `spec` as a single chunk.
+            last_chunk: When `streaming=True` and `last_chunk=True`, the function can perform final "flush" operations
+
+        Returns:
+            tuple: (signal of shape (B, L), audio_buffer, window_buffer); same arity for both padding modes.
+        """
+        if self.padding == "center":
+            # Fallback to pytorch native implementation; buffers pass through so the arity matches "same"
+            audio = torch.istft(spec, self.n_fft, self.hop_length, self.win_length, self.window, center=True)
+            return audio, audio_buffer, window_buffer
+        if self.padding != "same":
+            raise ValueError("Padding must be 'center' or 'same'.")
+        pad = (self.win_length - self.hop_length) // 2
+
+        if spec.dim() != 3:
+            raise ValueError(f"Expected spec rank-3 [Batch, Freq, Time], got {tuple(spec.shape)}")
+        T = spec.shape[-1]
+
+        # Inverse FFT
+        ifft = torch.fft.irfft(spec, self.n_fft, dim=1, norm="backward")
+        ifft = ifft * self.window[None, :, None]
+
+        # Overlap and Add
+        output_size = (T - 1) * self.hop_length + self.win_length
+        y = torch.nn.functional.fold(
+            ifft,
+            output_size=(1, output_size),
+            kernel_size=(1, self.win_length),
+            stride=(1, self.hop_length),
+        )[:, 0, 0, :]
+
+        y, audio_buffer = self.__buffer_process(y, audio_buffer, pad, last_chunk=last_chunk, streaming=streaming)
+
+        # Window envelope
+        window_sq = self.window.square().expand(1, T, -1).transpose(1, 2)
+        window_envelope = (
+            torch.nn.functional.fold(
+                window_sq,
+                output_size=(1, output_size),
+                kernel_size=(1, self.win_length),
+                stride=(1, self.hop_length),
+            )
+            .squeeze(0)
+            .squeeze(0)
+        )
+
+        window_envelope, window_buffer = self.__buffer_process(
+            window_envelope, window_buffer, pad, last_chunk=last_chunk, streaming=streaming
+        )
+        window_envelope = window_envelope.squeeze()
+
+        # Normalize
+        if not (window_envelope > 1e-11).all():
+            raise RuntimeError("ISTFT window envelope underflowed; invalid overlap-add state.")
+        y = y / window_envelope
+
+        return y, audio_buffer, window_buffer
+
+
+class FourierHead(nn.Module):
+    """Abstract base class for inverse fourier modules; subclasses must override ``forward``."""
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            x (Tensor): Input tensor of shape (B, L, H), where B is the batch size,
+                        L is the sequence length, and H denotes the model dimension.
+
+        Returns:
+            Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal.
+        """
+        raise NotImplementedError("Subclasses must implement the forward method.")
+
+
+class ISTFTHead(FourierHead):
+    """
+    ISTFT Head module for predicting STFT complex coefficients.
+
+    Args:
+        dim (int): Hidden dimension of the model.
+        n_fft (int): Size of Fourier transform.
+        hop_length (int): The distance between neighboring sliding window frames, which should align with
+                          the resolution of the input features.
+        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
+    """
+
+    def __init__(self, dim: int, n_fft: int, hop_length: int, padding: str = "same"):
+        super().__init__()
+        out_dim = n_fft + 2  # forward() splits this projection into two halves of n_fft // 2 + 1 channels
+        self.out = torch.nn.Linear(dim, out_dim)
+        self.istft = ISTFT(n_fft=n_fft, hop_length=hop_length, win_length=n_fft, padding=padding)
+
+    def forward(self, x: torch.Tensor, audio_buffer=None, window_buffer=None, streaming=False, last_chunk=False):
+        """
+        Project ``x`` to magnitude/phase channels, build a complex spectrogram and invert it with ``self.istft``.
+
+        Args:
+            x (Tensor): Input tensor of shape (B, L, H), where B is the batch size,
+                        L is the sequence length, and H denotes the model dimension.
+            audio_buffer, window_buffer, streaming, last_chunk: forwarded unchanged to ``self.istft``.
+        Returns:
+            tuple: ``(audio.unsqueeze(1), x_pred, audio_buffer, window_buffer)``; ``x_pred`` is the transposed projection.
+        """
+        x_pred = self.out(x)
+        # move the projected channels to dim 1 so chunk(2, dim=1) separates log-magnitude from phase
+        x_pred = x_pred.transpose(1, 2)
+        mag, p = x_pred.chunk(2, dim=1)
+        mag = torch.exp(mag)
+        mag = torch.clip(mag, max=1e2)  # safeguard to prevent excessively large magnitudes
+        # wrapping happens here. These two lines produce real and imaginary value
+        x = torch.cos(p)
+        y = torch.sin(p)
+        # recalculating phase here does not produce anything new
+        # only costs time
+        # phase = torch.atan2(y, x)
+        # S = mag * torch.exp(phase * 1j)
+        # better directly produce the complex value
+        S = mag * (x + 1j * y)
+        audio, audio_buffer, window_buffer = self.istft(
+            S, audio_buffer=audio_buffer, window_buffer=window_buffer, streaming=streaming, last_chunk=last_chunk
+        )
+        return audio.unsqueeze(1), x_pred, audio_buffer, window_buffer
diff --git a/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/modeling_audio_vae.py b/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/modeling_audio_vae.py
new file mode 100644
index 00000000000..f72c12184fd
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/modeling_audio_vae.py
@@ -0,0 +1,178 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adopted from https://github.com/inclusionAI/Ming-omni-tts/blob/main/audio_tokenizer/modeling_audio_vae.py
+# audio_tokenizer/modeling_audio_vae.py
+import torch
+import torch.nn as nn
+from vllm.logger import init_logger
+
+from .configuration_audio_vae import AudioVAEconfig
+from .vae_modules import Decoder, Encoder
+
+logger = init_logger(__name__)
+
+
+def _get_backbone(config: AudioVAEconfig, branch: str):  # returns config.<branch>["backbone"] if a dict, else None
+    branch_cfg = getattr(config, branch, None)
+    if not isinstance(branch_cfg, dict):
+        return None  # branch attribute missing or not a dict
+    backbone = branch_cfg.get("backbone")
+    if not isinstance(backbone, dict):
+        return None  # no usable nested backbone mapping
+    return backbone  # the live nested dict (not a copy), so callers can mutate the config through it
+
+
+def _maybe_fallback_attention(config: AudioVAEconfig) -> None:  # in-place sdpa fallback when flash_attn is unusable
+    enc_backbone = _get_backbone(config, "enc_kwargs")
+    dec_backbone = _get_backbone(config, "dec_kwargs")
+    requested_attn_impl = "flash_attention_2"  # assumed when neither backbone names an implementation
+
+    if dec_backbone is not None:
+        requested_attn_impl = dec_backbone.get(
+            "_attn_implementation",
+            dec_backbone.get("attn_implementation", requested_attn_impl),
+        )
+    elif enc_backbone is not None:
+        requested_attn_impl = enc_backbone.get(
+            "_attn_implementation",
+            enc_backbone.get("attn_implementation", requested_attn_impl),
+        )
+
+    if requested_attn_impl != "flash_attention_2":
+        return  # nothing to do: flash attention was not requested
+
+    try:
+        import flash_attn  # noqa: F401
+    except (ImportError, RuntimeError, OSError):  # same failure set as the guard in audio_encoder.py
+        if enc_backbone is not None:
+            enc_backbone["_attn_implementation"] = "sdpa"
+            enc_backbone["attn_implementation"] = "sdpa"
+        if dec_backbone is not None:
+            dec_backbone["_attn_implementation"] = "sdpa"
+            dec_backbone["attn_implementation"] = "sdpa"
+        logger.warning("flash_attn not available, falling back to sdpa for Ming audio VAE")
+
+
+class AudioVAE(nn.Module):  # waveform <-> latent model built from an Encoder and a Decoder
+    def __init__(self, config: AudioVAEconfig):
+        super().__init__()
+        self.config = config
+        _maybe_fallback_attention(self.config)
+
+        # --- Ming/Bailing config sanity (fail early on bad nested config parsing) ---
+        enc_kwargs = config.enc_kwargs or {}  # None -> {} so the key checks below raise a clear ValueError
+        dec_kwargs = config.dec_kwargs or {}
+
+        # Required nested fields
+        for k in ("backbone", "input_dim", "latent_dim"):
+            if k not in enc_kwargs:
+                raise ValueError(f"AudioVAE.enc_kwargs missing required key: {k}")
+        for k in ("backbone", "output_dim", "latent_dim"):
+            if k not in dec_kwargs:
+                raise ValueError(f"AudioVAE.dec_kwargs missing required key: {k}")
+
+        # Ming-specific geometry checks (safe because this integration targets Ming checkpoint family)
+        hop_size = enc_kwargs.get("hop_size", enc_kwargs["input_dim"])
+        if enc_kwargs["input_dim"] != hop_size:
+            raise ValueError(f"AudioVAE encoder input_dim ({enc_kwargs['input_dim']}) != hop_size ({hop_size}).")
+        if hop_size != dec_kwargs["output_dim"]:
+            raise ValueError(
+                f"AudioVAE encoder hop_size ({hop_size}) != decoder output_dim ({dec_kwargs['output_dim']})."
+            )
+
+        self.encoder = Encoder(
+            encoder_args=enc_kwargs["backbone"],
+            input_dim=enc_kwargs["input_dim"],
+            hop_size=hop_size,
+            latent_dim=enc_kwargs["latent_dim"],
+            patch_size=config.patch_size,
+        )
+
+        # Semantic module is null for this checkpoint.
+        if config.semantic_module_kwargs is not None:
+            from .audio_encoder import WhisperAudioEncoder
+
+            semantic_model = WhisperAudioEncoder.from_pretrained(dims=config.semantic_module_kwargs["whisper_encoder"])
+        else:
+            semantic_model = None
+
+        self.decoder = Decoder(
+            decoder_args=dec_kwargs["backbone"],  # IMPORTANT: decoder uses dec_kwargs.backbone
+            output_dim=dec_kwargs["output_dim"],  # Ming checkpoint uses 882
+            latent_dim=dec_kwargs["latent_dim"],
+            semantic_model=semantic_model,
+            patch_size=config.patch_size,
+        )
+
+    @torch.inference_mode()
+    def encode_latent(
+        self,
+        waveform: torch.Tensor,
+        waveform_length: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Encode waveform [B, T] -> (sampled latent [B, frames, latent_dim], int32 frame count per batch item).
+        """
+        if waveform.ndim != 2:
+            raise ValueError(f"Expected waveform rank-2 [Batch, Time], got {tuple(waveform.shape)}")
+        if waveform_length.ndim != 1:
+            raise ValueError(f"Expected waveform_length rank-1 [Batch], got {tuple(waveform_length.shape)}")
+        if waveform.shape[0] != waveform_length.shape[0]:
+            raise ValueError(
+                "Batch mismatch: "
+                f"waveform batch={waveform.shape[0]} vs "
+                f"waveform_length batch={waveform_length.shape[0]}"
+            )
+        if torch.any(waveform_length <= 0):
+            raise ValueError("waveform_length must be strictly positive.")
+
+        frame_num = torch.ceil(waveform_length / self.config.enc_kwargs["input_dim"]).to(torch.int32)
+        if self.config.patch_size != -1:
+            frame_num = torch.ceil(frame_num / self.config.patch_size).to(torch.int32)  # keep int32 on both paths
+
+        h, _ = self.encoder(waveform)
+        h = h.transpose(1, 2)  # [B, 2*latent_dim, T] (posterior params: mean + logvar)
+
+        # Inline OobleckDiagonalGaussianDistribution.sample()
+        mean, logvar = torch.chunk(h, 2, dim=1)
+        logvar = torch.clamp(logvar, -30.0, 20.0)
+        std = torch.exp(0.5 * logvar)
+        latent = mean + std * torch.randn_like(mean)  # [B, latent_dim, T]
+        latent = latent.transpose(1, 2)  # [B, T, d/2]
+
+        return latent, frame_num
+
+    @torch.inference_mode()
+    def decode(
+        self,
+        latent: torch.Tensor,
+        past_key_values=None,
+        use_cache: bool = False,
+        stream_state: tuple = (None, None, None),
+        last_chunk: bool = False,
+    ) -> tuple[torch.Tensor, tuple, object]:
+        """
+        Decode latent [B, T, D] -> (waveform, stream_state, past_key_values) via Decoder.low_level_reconstruct.
+        """
+        if latent.dim() != 3:
+            raise ValueError(f"Expected latent rank-3 [B,T,D], got shape={tuple(latent.shape)}")
+        if latent.shape[0] <= 0:
+            raise ValueError("latent batch size must be positive.")
+
+        target_dtype = next(self.decoder.parameters()).dtype
+        target_device = next(self.decoder.parameters()).device
+        if latent.dtype != target_dtype or latent.device != target_device:
+            latent = latent.to(device=target_device, dtype=target_dtype)  # match the decoder's parameters
+
+        expected_latent_dim = self.config.dec_kwargs["latent_dim"]
+        if latent.shape[-1] != expected_latent_dim:
+            raise ValueError(f"Latent dim mismatch in decode(): got {latent.shape[-1]}, expected {expected_latent_dim}")
+
+        waveform, stream_state, past_key_values = self.decoder.low_level_reconstruct(
+            latent,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            stream_state=stream_state,
+            last_chunk=last_chunk,
+        )
+        return waveform, stream_state, past_key_values
diff --git a/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/vae_modules.py b/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/vae_modules.py
new file mode 100644
index 00000000000..3920f4be7d4
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/vae_modules.py
@@ -0,0 +1,208 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adopted from https://github.com/inclusionAI/Ming-omni-tts/blob/main/audio_tokenizer/vae_modules.py
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import Qwen2Config, Qwen2Model
+
+from .istft import ISTFTHead
+
+
+class StreamingLinearUpsample(nn.Module):  # linear upsampling along time, whole-sequence or chunk-by-chunk
+    def __init__(self, scale_factor=4):
+        super().__init__()
+        self.scale_factor = scale_factor
+        self.upsampler = nn.Upsample(scale_factor=scale_factor, mode="linear", align_corners=False)
+
+    def forward(self, x, state=None, is_last=False):  # x: [B, T, D] chunk or None; returns (output or None, state)
+        if x is None and is_last and (state is None or state.get("prev_chunk") is None):
+            raise ValueError("Received end-of-stream without any latent chunk to upsample.")
+        # Initialize the streaming state on the first call
+        if state is None:
+            state = {"prev_chunk": None, "history_last": None, "is_first": True}
+
+        if x is None and not is_last:
+            return None, state
+
+        if state["is_first"] and is_last:
+            out = self.upsampler(x.transpose(1, 2)).transpose(1, 2)
+            return out, None  # single-chunk stream: upsample directly and clear the state
+
+        output_chunks = []
+
+        if state["is_first"]:
+            state["prev_chunk"] = x  # hold the chunk back until a lookahead frame (or end-of-stream) arrives
+            state["is_first"] = False
+            if not is_last:
+                return None, state
+
+        if x is not None and state["prev_chunk"] is not None:  # x is None on a pure flush: nothing to look ahead to
+            p = state["prev_chunk"].transpose(1, 2)
+
+            if state["history_last"] is None:
+                lookahead = x[:, :1, :].transpose(1, 2)
+                inp = torch.cat([p, lookahead], dim=2)
+                up = self.upsampler(inp)
+                out_prev = up[:, :, : p.size(2) * self.scale_factor]
+            else:
+                lookahead = x[:, :1, :].transpose(1, 2)
+                inp = torch.cat([state["history_last"], p, lookahead], dim=2)
+                up = self.upsampler(inp)
+                start = self.scale_factor
+                end = start + p.size(2) * self.scale_factor
+                out_prev = up[:, :, start:end]
+
+            output_chunks.append(out_prev.transpose(1, 2))
+            state["history_last"] = p[:, :, -1:]
+            state["prev_chunk"] = x
+
+        if is_last:
+            p = state["prev_chunk"].transpose(1, 2)
+            inp = p if state["history_last"] is None else torch.cat([state["history_last"], p], dim=2)
+            up = self.upsampler(inp)
+            out_last = up if state["history_last"] is None else up[:, :, self.scale_factor :]
+            output_chunks.append(out_last.transpose(1, 2))
+            state = None  # stream finished
+
+        final_out = torch.cat(output_chunks, dim=1) if output_chunks else None
+        return final_out, state
+
+
+class Encoder(nn.Module):  # frames a waveform, runs a Qwen2 backbone and projects to 2 * latent_dim features
+    def __init__(self, encoder_args, input_dim=320, hop_size=320, latent_dim=64, patch_size=-1):
+        super().__init__()
+        config = Qwen2Config.from_dict(config_dict=encoder_args)
+        self.encoder = Qwen2Model(config)
+        self.input_dim = input_dim
+        self.hop_size = hop_size
+        self.latent_dim = latent_dim
+        self.fc1 = nn.Linear(input_dim, config.hidden_size, bias=False)
+        self.fc2 = nn.Linear(config.hidden_size, config.hidden_size)
+        self.fc3 = nn.Linear(config.hidden_size, latent_dim * 2)
+        self.norm = nn.LayerNorm(config.hidden_size)  # not applied in forward() below
+        self.patch_size = patch_size
+        if patch_size != -1:
+            aggregator_config = Qwen2Config.from_dict({**encoder_args, "num_hidden_layers": 4})
+            self.aggregator = Qwen2Model(aggregator_config)
+            self.cls_embed = nn.Parameter(torch.rand(1, 1, config.hidden_size))
+            self.cls_embed.data.normal_(0, 0.02)  # overwrite the uniform init with N(0, 0.02)
+
+    def get_frames(self, x):
+        num_frames_total = (x.size(-1) + self.hop_size - 1) // self.hop_size  # frame count, rounded up
+        expected_len = (num_frames_total - 1) * self.hop_size + self.input_dim
+        padding_needed = expected_len - x.size(-1)
+        waveform = F.pad(x, (0, padding_needed), value=0.0)  # zero-pad the tail so the last frame is complete
+
+        frames = waveform.unfold(dimension=-1, size=self.input_dim, step=self.hop_size)  # [B, T, d]
+        return frames
+
+    def pad_patch_insert_cls(self, x):
+        bsz, _, dim = x.size()
+        num_frame = x.size(1)
+        r = num_frame % self.patch_size
+        pad_num = self.patch_size - r if r else 0
+        x = F.pad(x, (0, 0, 0, pad_num), value=0.0)  # pad the frame count to a multiple of patch_size
+        x = x.reshape(-1, self.patch_size, dim)
+        x = torch.cat((x, self.cls_embed.expand(x.size(0), -1, -1)), dim=1)  # one cls token after each patch
+        x = x.reshape(bsz, -1, dim)
+        return x
+
+    def forward(self, waveform):  # returns (fc3 output, waveform.unsqueeze(1))
+        x = self.get_frames(waveform)
+
+        x = self.fc1(x)
+        x = self.fc2(x)
+        x = self.encoder(inputs_embeds=x)
+        x = x.last_hidden_state
+
+        # downsample
+        if self.patch_size != -1:
+            x = self.pad_patch_insert_cls(x)
+            x = self.aggregator(inputs_embeds=x)
+            x = x.last_hidden_state
+            bsz, _, dim = x.size()
+            x = x.reshape(-1, self.patch_size + 1, dim)
+            x = x[:, -1:, :].reshape(bsz, -1, dim)  # keep only the cls position of every patch
+
+        x = self.fc3(x)
+        return x, waveform.unsqueeze(1)
+
+
+class Decoder(nn.Module):  # latent -> waveform: optional semantic module, Qwen2 backbone, ISTFT head
+    def __init__(self, decoder_args, output_dim=320, latent_dim=64, semantic_model=None, patch_size=-1):
+        super().__init__()
+        config = Qwen2Config.from_dict(config_dict=decoder_args)
+        self.decoder = Qwen2Model(config)
+        self.output_dim = output_dim
+        self.latent_dim = latent_dim
+        self.fc1 = nn.Linear(latent_dim, config.hidden_size)
+
+        if semantic_model is not None:
+            self.gelu = nn.GELU()
+            self.fc2 = nn.Linear(config.hidden_size, semantic_model.audio_emb_dim)
+            self.semantic_model = semantic_model
+            self.fc3 = nn.Linear(semantic_model.audio_emb_dim, config.hidden_size)
+        else:
+            self.semantic_model = None
+
+        self.hop_length = output_dim
+        self.head = ISTFTHead(
+            dim=config.hidden_size, n_fft=self.hop_length * 4, hop_length=self.hop_length, padding="same"
+        )
+        self.patch_size = patch_size
+        if self.patch_size != -1:
+            self.upsampling = StreamingLinearUpsample(scale_factor=patch_size)
+
+    def forward(self, x, only_semantic_emb=False, past_key_values=None, use_cache=False):
+        x = self.fc1(x)
+
+        if self.semantic_model is not None:
+            x = self.fc2(self.gelu(x))
+            x, past_key_values = self.semantic_model(
+                whisper_feats=x, past_key_values=past_key_values, use_cache=use_cache
+            )
+            unified_emb = x
+            if only_semantic_emb:
+                return unified_emb, past_key_values
+            x = self.fc3(x)
+        else:
+            unified_emb = None
+
+        if self.patch_size != -1:
+            x = self.upsampling.upsampler(x.transpose(1, 2)).transpose(1, 2)  # stateless path, as in low_level_reconstruct
+
+        x = self.decoder(inputs_embeds=x)
+        x = x.last_hidden_state
+
+        x, *_ = self.head(x)  # the head returns (audio, x_pred, audio_buffer, window_buffer); keep the audio
+
+        return x, unified_emb
+
+    def low_level_reconstruct(self, x, past_key_values=None, use_cache=False, stream_state=None, last_chunk=False):
+        # Guard against None on first chunk (connector initialises per-request)
+        if stream_state is None:
+            stream_state = (None, None, None)
+        upsample_state, audio_buffer, window_buffer = stream_state
+        bsz, device, dtype = x.size(0), x.device, x.dtype
+        x = self.fc1(x)
+        if self.patch_size != -1:
+            if use_cache:
+                # streaming
+                x, upsample_state = self.upsampling(x, state=upsample_state, is_last=last_chunk)
+                if x is None:  # the upsampler is still holding this chunk back: emit an empty waveform
+                    stream_state = (upsample_state, audio_buffer, window_buffer)
+                    return torch.empty(bsz, 1, 0, device=device, dtype=dtype), stream_state, past_key_values
+            else:
+                x = self.upsampling.upsampler(x.transpose(1, 2)).transpose(1, 2)
+
+        outputs = self.decoder(inputs_embeds=x, past_key_values=past_key_values, use_cache=use_cache)
+        past_key_values = outputs.past_key_values
+        x = outputs.last_hidden_state
+
+        x, _, audio_buffer, window_buffer = self.head(
+            x, streaming=use_cache, audio_buffer=audio_buffer, window_buffer=window_buffer, last_chunk=last_chunk
+        )
+
+        stream_state = (upsample_state, audio_buffer, window_buffer)
+        return x, stream_state, past_key_values
diff --git a/vllm_omni/model_executor/models/ming_tts/config_ming_tts.py b/vllm_omni/model_executor/models/ming_tts/config_ming_tts.py
new file mode 100644
index 00000000000..09ae85be69a
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_tts/config_ming_tts.py
@@ -0,0 +1,364 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# config_ming_tts.py
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+from transformers import PretrainedConfig, Qwen2Config
+
+from .audio_tokenizer.configuration_audio_vae import AudioVAEconfig
+
+# ---------------------------------------------------------------------------
+# Token IDs (confirmed from tokenizer_config.json)
+# ---------------------------------------------------------------------------
+
+AUDIO_DUMMY_TOKEN_ID: int = 151705  # <audioPatch>
+AUDIO_START_TOKEN_ID: int = 151706  # <audio>
+AUDIO_END_TOKEN_ID: int = 151707  # </audio>
+AUDIO_EOS_TOKEN_ID: int = 151704  # <end_of_audio>
+VISION_START_TOKEN_ID: int = 151652  # <|vision_start|>
+
+TEXT_EOS_TOKEN_ID: int = 151669  # <text_eos>
+PAD_TOKEN_ID: int = 151643  # <|endoftext|>
+
+# Backward-compat alias for older code paths (same value as TEXT_EOS_TOKEN_ID)
+EOS_TOKEN_ID: int = TEXT_EOS_TOKEN_ID
+
+
+# ---------------------------------------------------------------------------
+# Architectural constants (confirmed from original config.json)
+# ---------------------------------------------------------------------------
+
+LATENT_DIM: int = 64  # default for audio_tokenizer_config.enc_kwargs.latent_dim
+PATCH_SIZE: int = 4  # default for ditar_config.patch_size
+HISTORY_PATCH_SIZE: int = 32  # default for ditar_config.history_patch_size
+LLM_HIDDEN_SIZE: int = 896  # default for llm_config.hidden_size
+LLM_VOCAB_SIZE: int = 151936  # default for llm_config.vocab_size
+AGGREGATOR_HIDDEN_SIZE: int = 1024  # expected hidden_size of both aggregator_config and ditar_config
+VAE_PATCH_SIZE: int = 4  # default for audio_tokenizer_config.patch_size
+SAMPLE_RATE: int = 44100  # default for audio_tokenizer_config.sample_rate
+
+# AudioVAE frame/hop geometry (confirmed)
+AUDIO_FRAME_HOP: int = 882  # enc input_dim / hop_size / dec output_dim
+
+# stop_head defaults
+STOP_HEAD_MIN_STEPS: int = 3  # MingTTSConfig.validate() requires >= 0
+STOP_HEAD_THRESHOLD: float = 0.5  # MingTTSConfig.validate() requires a value in [0, 1]
+
+# FlowLoss sampling defaults
+DEFAULT_CFG: float = 2.0
+DEFAULT_SIGMA: float = 0.25
+DEFAULT_TEMPERATURE: float = 0.0
+
+# Connector / Stage-2 streaming defaults (runtime tuning)
+LATENT_CHUNK_SIZE: int = 25  # MingTTSConfig.validate() requires > 0
+LATENT_LEFT_CONTEXT: int = 0  # MingTTSConfig.validate() requires >= 0
+MAX_DECODE_STEPS: int = 200  # MingTTSConfig.validate() requires > 0
+
+# seq_data.extra_data keys (every key is namespaced with the "ming_" prefix)
+KEY_LATENT_HISTORY: str = "ming_latent_history"
+KEY_DECODE_STEP: str = "ming_decode_step"
+KEY_LAST_STOP_PROB: str = "ming_last_stop_prob"
+KEY_NEXT_EMBEDS: str = "ming_next_embeds"
+KEY_PROMPT_LATENTS: str = "ming_prompt_latents"
+KEY_PROMPT_LATENT_TAIL: str = "ming_prompt_latent_tail"
+KEY_SPEAKER_EMBEDDING: str = "ming_speaker_embedding"
+KEY_REQUEST_ID: str = "ming_request_id"
+KEY_CHUNK_ID: str = "ming_chunk_id"
+KEY_CFG: str = "ming_cfg"
+KEY_SIGMA: str = "ming_sigma"
+KEY_TEMPERATURE: str = "ming_temperature"
+KEY_MAX_DECODE_STEPS: str = "ming_max_decode_steps"
+KEY_MIN_DECODE_STEPS: str = "ming_min_decode_steps"
+KEY_TEXT_MODE: str = "ming_text_mode"
+
+
+@dataclass
+class MingTTSConfig:
+    """Flat config object shared by Stage-1 and Stage-2. Build via from_hf_config(); check with validate()."""
+
+    # --- LLM backbone ---
+    llm_hidden_size: int = LLM_HIDDEN_SIZE
+    llm_vocab_size: int = LLM_VOCAB_SIZE
+    llm_config: dict[str, Any] = field(default_factory=dict)  # plain-dict form; see make_qwen2_config()
+
+    # --- Audio latent space ---
+    latent_dim: int = LATENT_DIM
+    patch_size: int = PATCH_SIZE
+    history_patch_size: int = HISTORY_PATCH_SIZE
+
+    # --- Flow / Aggregator sub-configs ---
+    ditar_config: dict[str, Any] = field(default_factory=dict)
+    aggregator_config: dict[str, Any] = field(default_factory=dict)
+
+    # --- AudioVAE ---
+    audio_tokenizer_config: AudioVAEconfig | None = None  # validate() rejects None
+    vae_patch_size: int = VAE_PATCH_SIZE
+    sample_rate: int = SAMPLE_RATE
+    audio_frame_hop: int = AUDIO_FRAME_HOP
+
+    # --- Generation control ---
+    cfg: float = DEFAULT_CFG
+    sigma: float = DEFAULT_SIGMA
+    temperature: float = DEFAULT_TEMPERATURE
+    stop_head_min_steps: int = STOP_HEAD_MIN_STEPS
+    stop_head_threshold: float = STOP_HEAD_THRESHOLD
+    max_decode_steps: int = MAX_DECODE_STEPS
+
+    # --- Stage-2 chunking (runtime tuning) ---
+    latent_chunk_size: int = LATENT_CHUNK_SIZE
+    latent_left_context: int = LATENT_LEFT_CONTEXT
+
+    # --- Token IDs ---
+    text_eos_token_id: int = TEXT_EOS_TOKEN_ID
+    eos_token_id: int = TEXT_EOS_TOKEN_ID  # compat alias
+    pad_token_id: int = PAD_TOKEN_ID
+    audio_dummy_token_id: int = AUDIO_DUMMY_TOKEN_ID
+    audio_start_token_id: int = AUDIO_START_TOKEN_ID
+    audio_end_token_id: int = AUDIO_END_TOKEN_ID
+    audio_eos_token_id: int = AUDIO_EOS_TOKEN_ID
+
+    @classmethod
+    def from_hf_config(cls, hf_config: PretrainedConfig) -> MingTTSConfig:
+        """
+        Build from vllm-omni's hf_config. Nested configs may be objects or dicts; missing values use module defaults.
+        """
+
+        # --- Read nested sub-configs (must NOT read flat hf_config attrs for these) ---
+        llm_raw = getattr(hf_config, "llm_config", {}) or {}
+        ditar_raw = getattr(hf_config, "ditar_config", {}) or {}
+        agg_raw = getattr(hf_config, "aggregator_config", {}) or {}
+        atc_raw = getattr(hf_config, "audio_tokenizer_config", None)
+
+        llm_dict = _to_plain_dict(llm_raw)
+        ditar = _to_plain_dict(ditar_raw)
+        agg = _to_plain_dict(agg_raw)
+
+        # Keep Ming DiT backend explicit; original checkpoint uses "torch"
+        ditar.setdefault("attn_backend", "torch")
+
+        atc = _coerce_audio_vae_config(atc_raw)
+
+        # --- Pull nested values safely (each falls back to the module constant when absent or None) ---
+        atc_enc_latent_dim = _nested_get(atc, "enc_kwargs", "latent_dim", default=LATENT_DIM)
+        atc_patch_size = _nested_get(atc, "patch_size", default=VAE_PATCH_SIZE)
+        atc_sample_rate = _nested_get(atc, "sample_rate", default=SAMPLE_RATE)
+
+        enc_input_dim = _nested_get(atc, "enc_kwargs", "input_dim", default=AUDIO_FRAME_HOP)
+        enc_hop_size = _nested_get(atc, "enc_kwargs", "hop_size", default=AUDIO_FRAME_HOP)
+        dec_output_dim = _nested_get(atc, "dec_kwargs", "output_dim", default=AUDIO_FRAME_HOP)
+
+        cfg = cls(  # fields not listed here keep their dataclass defaults
+            llm_hidden_size=llm_dict.get("hidden_size", LLM_HIDDEN_SIZE),
+            llm_vocab_size=llm_dict.get("vocab_size", LLM_VOCAB_SIZE),
+            llm_config=llm_dict,
+            latent_dim=atc_enc_latent_dim,
+            patch_size=ditar.get("patch_size", PATCH_SIZE),
+            history_patch_size=ditar.get("history_patch_size", HISTORY_PATCH_SIZE),
+            ditar_config=ditar,
+            aggregator_config=agg,
+            audio_tokenizer_config=atc,
+            vae_patch_size=atc_patch_size,
+            sample_rate=atc_sample_rate,
+            audio_frame_hop=enc_hop_size if enc_hop_size is not None else AUDIO_FRAME_HOP,
+        )
+
+        # Debug-only cache: plain instance attributes set after construction, not declared dataclass fields
+        cfg._enc_input_dim = enc_input_dim
+        cfg._enc_hop_size = enc_hop_size
+        cfg._dec_output_dim = dec_output_dim
+
+        return cfg
+
+    def validate(self) -> None:
+        """Run before GPU allocation/weight loading. Raises ValueError on the first mismatch found."""
+
+        # --- Token IDs (audio_start/audio_end/pad ids are not checked here) ---
+        if self.audio_dummy_token_id != 151705:
+            raise ValueError(
+                f"audio_dummy_token_id={self.audio_dummy_token_id}, expected 151705 (<audioPatch>). "
+                "Wrong tokenizer/checkpoint?"
+            )
+        if self.audio_eos_token_id != 151704:
+            raise ValueError(
+                f"audio_eos_token_id={self.audio_eos_token_id}, expected 151704 (<end_of_audio>). "
+                "Wrong tokenizer/checkpoint?"
+            )
+        if self.text_eos_token_id != 151669:
+            raise ValueError(
+                f"text_eos_token_id={self.text_eos_token_id}, expected 151669 (<text_eos>). Wrong tokenizer/checkpoint?"
+            )
+
+        # --- Required sub-config ---
+        if self.audio_tokenizer_config is None:
+            raise ValueError("audio_tokenizer_config is None. Nested AudioVAE config was not deserialized correctly.")
+
+        # --- Confirmed checkpoint-family constants (an exact match is required) ---
+        if self.latent_dim != LATENT_DIM:
+            raise ValueError(
+                f"latent_dim mismatch: got {self.latent_dim}, expected {LATENT_DIM}. "
+                "Check audio_tokenizer_config.enc_kwargs.latent_dim."
+            )
+        if self.patch_size != PATCH_SIZE:
+            raise ValueError(
+                f"patch_size mismatch: got {self.patch_size}, expected {PATCH_SIZE}. Check ditar_config.patch_size."
+            )
+        if self.history_patch_size != HISTORY_PATCH_SIZE:
+            raise ValueError(
+                f"history_patch_size mismatch: got {self.history_patch_size}, expected {HISTORY_PATCH_SIZE}. "
+                "Check ditar_config.history_patch_size."
+            )
+        if self.llm_hidden_size != LLM_HIDDEN_SIZE:
+            raise ValueError(
+                f"llm_hidden_size mismatch: got {self.llm_hidden_size}, expected {LLM_HIDDEN_SIZE}. "
+                "Check llm_config.hidden_size."
+            )
+        if self.llm_vocab_size != LLM_VOCAB_SIZE:
+            raise ValueError(f"llm_vocab_size mismatch: got {self.llm_vocab_size}, expected {LLM_VOCAB_SIZE}.")
+        if self.sample_rate != SAMPLE_RATE:
+            raise ValueError(f"sample_rate mismatch: got {self.sample_rate}, expected {SAMPLE_RATE}.")
+
+        # --- Cross-config consistency checks (a value missing from a sub-config skips its check) ---
+        if self.vae_patch_size != self.patch_size:
+            raise ValueError(f"VAE patch size ({self.vae_patch_size}) != flow/DiT patch size ({self.patch_size}).")
+
+        llm_hidden_from_cfg = self.llm_config.get("hidden_size")
+        if llm_hidden_from_cfg is not None and llm_hidden_from_cfg != self.llm_hidden_size:
+            raise ValueError(
+                f"llm_hidden_size ({self.llm_hidden_size}) != llm_config.hidden_size ({llm_hidden_from_cfg})."
+            )
+
+        agg_h = self.aggregator_config.get("hidden_size")
+        dit_h = self.ditar_config.get("hidden_size")
+        if agg_h is not None and dit_h is not None and agg_h != dit_h:
+            raise ValueError(f"aggregator_config.hidden_size ({agg_h}) != ditar_config.hidden_size ({dit_h}).")
+        if agg_h is not None and agg_h != AGGREGATOR_HIDDEN_SIZE:
+            raise ValueError(f"aggregator hidden_size mismatch: got {agg_h}, expected {AGGREGATOR_HIDDEN_SIZE}.")
+        if dit_h is not None and dit_h != AGGREGATOR_HIDDEN_SIZE:
+            raise ValueError(f"ditar hidden_size mismatch: got {dit_h}, expected {AGGREGATOR_HIDDEN_SIZE}.")
+
+        atc = self.audio_tokenizer_config
+        enc_latent = _nested_get(atc, "enc_kwargs", "latent_dim", default=None)
+        dec_latent = _nested_get(atc, "dec_kwargs", "latent_dim", default=None)
+        if enc_latent is not None and enc_latent != self.latent_dim:
+            raise ValueError(f"audio enc latent_dim ({enc_latent}) != Ming latent_dim ({self.latent_dim}).")
+        if dec_latent is not None and dec_latent != self.latent_dim:
+            raise ValueError(f"audio dec latent_dim ({dec_latent}) != Ming latent_dim ({self.latent_dim}).")
+
+        atc_patch = _nested_get(atc, "patch_size", default=None)
+        if atc_patch is not None and atc_patch != self.vae_patch_size:
+            raise ValueError(
+                f"audio_tokenizer_config.patch_size ({atc_patch}) != vae_patch_size ({self.vae_patch_size})."
+            )
+
+        atc_sr = _nested_get(atc, "sample_rate", default=None)
+        if atc_sr is not None and atc_sr != self.sample_rate:
+            raise ValueError(f"audio_tokenizer_config.sample_rate ({atc_sr}) != sample_rate ({self.sample_rate}).")
+
+        enc_input_dim = _nested_get(atc, "enc_kwargs", "input_dim", default=None)
+        enc_hop_size = _nested_get(atc, "enc_kwargs", "hop_size", default=None)
+        dec_output_dim = _nested_get(atc, "dec_kwargs", "output_dim", default=None)
+
+        if enc_input_dim is not None and enc_hop_size is not None and enc_input_dim != enc_hop_size:
+            raise ValueError(f"AudioVAE encoder input_dim ({enc_input_dim}) != hop_size ({enc_hop_size}).")
+        if enc_hop_size is not None and dec_output_dim is not None and enc_hop_size != dec_output_dim:
+            raise ValueError(
+                f"AudioVAE encoder hop_size ({enc_hop_size}) != decoder output_dim ({dec_output_dim}). "
+                "Expected 882 in this checkpoint family."
+            )
+
+        # --- Runtime tuning sanity (range checks only) ---
+        if self.latent_chunk_size <= 0:
+            raise ValueError(f"latent_chunk_size must be > 0, got {self.latent_chunk_size}.")
+        if self.latent_left_context < 0:
+            raise ValueError(f"latent_left_context must be >= 0, got {self.latent_left_context}.")
+        if self.max_decode_steps <= 0:
+            raise ValueError(f"max_decode_steps must be > 0, got {self.max_decode_steps}.")
+        if not (0.0 <= self.stop_head_threshold <= 1.0):
+            raise ValueError(f"stop_head_threshold must be in [0,1], got {self.stop_head_threshold}.")
+        if self.stop_head_min_steps < 0:
+            raise ValueError(f"stop_head_min_steps must be >= 0, got {self.stop_head_min_steps}.")
+
+    def make_qwen2_config(self) -> Qwen2Config:
+        """Reconstruct Qwen2Config for Stage-1 LLM backbone init. Raises ValueError if llm_config is empty."""
+        if not self.llm_config:
+            raise ValueError("llm_config is empty; from_hf_config() failed to parse nested llm_config.")
+        return Qwen2Config.from_dict(self.llm_config)
+
+    @property
+    def latent_patch_shape(self) -> tuple[int, int]:
+        return (self.patch_size, self.latent_dim)
+
+    @property
+    def chunk_frames(self) -> int:
+        return self.latent_chunk_size * self.patch_size
+
+    @property
+    def approx_chunk_seconds(self) -> float:
+        # One latent frame ~ one 882-sample hop in this checkpoint family.
+        return (self.chunk_frames * self.audio_frame_hop) / float(self.sample_rate)
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _to_plain_dict(obj: Any) -> dict[str, Any]:
+    """Best-effort conversion of a nested config (dict, PretrainedConfig, other object) to a dict; {} on failure."""
+    if obj is None:
+        return {}
+    if isinstance(obj, dict):
+        return dict(obj)  # shallow copy, so the caller may mutate the result
+    if isinstance(obj, PretrainedConfig):
+        return obj.to_dict()
+    exporter = getattr(obj, "to_dict", None)
+    if callable(exporter):
+        try:
+            return dict(exporter())
+        except Exception:
+            pass  # fall through to the vars() attempt
+    try:
+        return dict(vars(obj))
+    except Exception:
+        return {}
+
+
+def _coerce_audio_vae_config(atc_raw: Any) -> AudioVAEconfig | None:
+    """
+    Normalize audio_tokenizer_config into AudioVAEconfig when possible.
+    Accepts an AudioVAEconfig, a dict, or a PretrainedConfig-like object.
+    Tries AudioVAEconfig.from_dict() first, then AudioVAEconfig(**dict).
+    Best effort: if the dict form is empty or both constructors raise, the
+    raw input is returned unchanged, so the result may not be an AudioVAEconfig.
+    """
+    if atc_raw is None:
+        return None
+    atc_dict = _to_plain_dict(atc_raw)
+    if not atc_dict:
+        # Return raw object as fallback; _nested_get/validate can still work
+        return atc_raw  # type: ignore[return-value]
+
+    if hasattr(AudioVAEconfig, "from_dict") and callable(getattr(AudioVAEconfig, "from_dict")):
+        try:
+            return AudioVAEconfig.from_dict(atc_dict)  # type: ignore[misc]
+        except Exception:
+            pass  # from_dict failed; try the plain constructor next
+    try:
+        return AudioVAEconfig(**atc_dict)  # type: ignore[arg-type]
+    except Exception:
+        return atc_raw  # type: ignore[return-value]
+
+
+def _nested_get(obj: Any, *keys: str, default: Any = None) -> Any:
+    """Follow ``keys`` through dict keys / attributes; return ``default`` if any hop is missing or None."""
+    node = obj
+    for name in keys:
+        if node is None:
+            return default
+        # Dicts are read by key, everything else by attribute; a missing hop yields None.
+        node = node.get(name) if isinstance(node, dict) else getattr(node, name, None)
+    if node is None:
+        return default
+    return node
diff --git a/vllm_omni/model_executor/models/ming_tts/configuration_ming_dense.py b/vllm_omni/model_executor/models/ming_tts/configuration_ming_dense.py
new file mode 100644
index 00000000000..d6f5c8182e9
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_tts/configuration_ming_dense.py
@@ -0,0 +1,57 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
+from typing import Any
+
+from transformers import PretrainedConfig, Qwen2Config
+
+from .audio_tokenizer.configuration_audio_vae import AudioVAEconfig
+
+
+def _coerce_qwen2_config(value: Any) -> Qwen2Config:
+    """Return ``value`` as a Qwen2Config: instances pass through, other configs and dicts are rebuilt."""
+    if isinstance(value, Qwen2Config):
+        return value
+    if isinstance(value, (PretrainedConfig, dict)):
+        payload = value.to_dict() if isinstance(value, PretrainedConfig) else dict(value)
+        return Qwen2Config.from_dict(payload)
+    raise TypeError(f"Unsupported llm_config type for Ming dense config: {type(value)!r}")
+
+
+def _coerce_audio_vae_config(value: Any) -> AudioVAEconfig | None:
+    """Rebuild ``value`` (config object or dict) as a fresh AudioVAEconfig; None passes through."""
+    if value is None:
+        return None
+    if isinstance(value, (AudioVAEconfig, PretrainedConfig)):
+        init_kwargs = value.to_dict()
+    elif isinstance(value, dict):
+        init_kwargs = dict(value)
+    else:
+        raise TypeError(f"Unsupported audio_tokenizer_config type for Ming dense config: {type(value)!r}")
+
+    # Always constructed from the extracted data, so the result is a new object.
+    return AudioVAEconfig(**init_kwargs)
+
+
+class MingDenseConfig(PretrainedConfig):  # HF-style config holding the four nested Ming sub-configs
+    model_type = "dense"  # NOTE(review): presumably matches the checkpoint's config.json model_type — confirm
+
+    def __init__(
+        self,
+        llm_config: Qwen2Config | dict[str, Any] | None = None,
+        ditar_config: dict[str, Any] | None = None,
+        aggregator_config: dict[str, Any] | None = None,
+        audio_tokenizer_config: AudioVAEconfig | dict[str, Any] | None = None,
+        architectures: list[str] | None = None,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__(architectures=architectures, **kwargs)
+        self.llm_config = _coerce_qwen2_config(llm_config or {})  # None/empty -> Qwen2Config.from_dict({})
+        self.ditar_config = dict(ditar_config or {})  # shallow copy; None becomes {}
+        self.aggregator_config = dict(aggregator_config or {})  # shallow copy; None becomes {}
+        self.audio_tokenizer_config = _coerce_audio_vae_config(audio_tokenizer_config)  # stays None if omitted
+
+    def get_text_config(self, decoder: bool = False, **kwargs: Any) -> Qwen2Config:
+        del decoder, kwargs  # accepted but intentionally ignored
+        return self.llm_config  # the nested Qwen2 config is returned as the text config
diff --git a/vllm_omni/model_executor/models/ming_tts/fm/__init__.py b/vllm_omni/model_executor/models/ming_tts/fm/__init__.py
new file mode 100644
index 00000000000..208f01a7cb5
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_tts/fm/__init__.py
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
diff --git a/vllm_omni/model_executor/models/ming_tts/fm/cfm.py b/vllm_omni/model_executor/models/ming_tts/fm/cfm.py
new file mode 100644
index 00000000000..b1924973b47
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_tts/fm/cfm.py
@@ -0,0 +1,207 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adopted from https://github.com/inclusionAI/Ming-omni-tts/blob/main/fm/CFM.py
+
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+
+class Solver:  # fixed-grid explicit Euler integrator that adds scaled Gaussian noise after each step
+    def __init__(self, func, y0, sigma=0.25, temperature=1.5) -> None:  # func(t, y) returns dy/dt
+        self.func = func
+        self.y0 = y0
+        self.sigma = sigma
+        self.temperature = temperature
+
+    def integrate(self, t):  # t: sequence of grid times; returns one recorded state per entry of t
+        solution = torch.empty(len(t), *self.y0.shape, dtype=self.y0.dtype, device=self.y0.device)
+        solution[0] = self.y0  # solution[i] holds the state recorded for t[i]
+
+        j = 1  # next slot of `solution` to fill
+        y0 = self.y0
+        for t0, t1 in zip(t[:-1], t[1:]):
+            dt = t1 - t0
+            f0 = self.func(t0, y0)
+            dy = dt * f0
+            y1 = y0 + dy  # Euler step
+
+            while j < len(t) and t1 >= t[j]:  # record the noise-free y1 for every grid time this step reaches
+                solution[j] = self._linear_interp(t0, t1, y0, y1, t[j])
+                j += 1
+
+            noise = torch.randn_like(y0)  # drawn on every step, even when sigma or temperature is 0
+            shift = self.sigma * (self.temperature**0.5) * (abs(dt) ** 0.5) * noise  # sigma * sqrt(T * |dt|) * noise
+            y0 = y1 + shift  # only the state carried into the next step is perturbed, not the recorded one
+
+        return solution
+
+    def _linear_interp(self, t0, t1, y0, y1, t):
+        if t == t0:  # exact endpoints are returned as-is, without arithmetic
+            return y0
+        if t == t1:
+            return y1
+        slope = (t - t0) / (t1 - t0)  # fraction of the step covered at time t
+        return y0 + slope * (y1 - y0)
+
+
+def get_epss_timesteps(n, device, dtype):
+    """Return n + 1 sampling times in [0, 1]: a preset non-uniform grid (multiples of 1/32) or torch.linspace."""
+    presets_in_32nds = {
+        5: [0, 2, 4, 8, 16, 32],
+        6: [0, 2, 4, 6, 8, 16, 32],
+        7: [0, 2, 4, 6, 8, 16, 24, 32],
+        10: [0, 2, 4, 6, 8, 12, 16, 20, 24, 28, 32],
+        12: [0, 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32],
+        16: [0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 32],
+    }
+    ticks = presets_in_32nds.get(n)
+    if ticks is None:  # no preset for this step count: fall back to a uniform grid
+        return torch.linspace(0, 1, n + 1, device=device, dtype=dtype)
+    return (1 / 32) * torch.tensor(ticks, device=device, dtype=dtype)
+
+
+class CFM(nn.Module):
+    """Conditional flow matching wrapper: training loss in ``forward``, stepwise sampling in ``sample``."""
+
+    def __init__(self, model: nn.Module):
+        super().__init__()
+        self.model = model
+
+    @property
+    def device(self):
+        """Device of the first parameter found on this module."""
+        return next(self.parameters()).device
+
+    def forward(self, cond, target, latent_history, mask, patch_size):
+        """Return the mean squared error between predicted and true flow over the masked positions.
+
+        ``cond``, ``target`` and ``latent_history`` must be rank-3 with one shared batch size. ``mask`` is
+        coerced to a bool ``[batch, target.shape[1]]`` tensor; only positions where it is True count.
+        Raises ValueError on a non-positive ``patch_size`` or any rank/batch/mask-shape violation.
+        """
+        if patch_size <= 0:
+            raise ValueError(f"patch_size must be positive, got {patch_size}")
+        if cond.ndim != 3:
+            raise ValueError(f"Expected cond rank-3 [Batch, Time, Dimension], got {tuple(cond.shape)}")
+        if target.ndim != 3:
+            raise ValueError(f"Expected target rank-3 [Batch, Time, Dimension], got {tuple(target.shape)}")
+        if latent_history.ndim != 3:
+            raise ValueError(
+                f"Expected latent_history rank-3 [Batch, Time, Dimension], got {tuple(latent_history.shape)}"
+            )
+        if cond.shape[0] != target.shape[0] or cond.shape[0] != latent_history.shape[0]:
+            raise ValueError(
+                "Batch mismatch across cond, target, and latent_history: "
+                f"{cond.shape[0]}, {target.shape[0]}, {latent_history.shape[0]}"
+            )
+        token_mask = _coerce_token_mask(
+            mask, batch_size=target.shape[0], target_steps=target.shape[1], device=target.device
+        )
+
+        x1 = target
+        batch, dtype = x1.shape[0], x1.dtype
+        x0 = torch.randn_like(x1)
+        time = torch.rand((batch,), dtype=dtype, device=self.device)
+        # sample xt (φ_t(x) in the paper): straight-line interpolation from noise x0 to data x1
+        t = time.unsqueeze(-1).unsqueeze(-1)
+        x = (1 - t) * x0 + t * x1
+        # regression target: the constant velocity x1 - x0 along that line
+        flow = x1 - x0
+
+        pred = self.model(x=x, t=time, c=cond, latent_history=latent_history, mask=token_mask)
+        pred = pred[:, -patch_size:, :]
+
+        loss = F.mse_loss(pred, flow, reduction="none")
+        loss_mask = token_mask.unsqueeze(-1).expand_as(loss)
+        loss = loss[loss_mask]
+
+        return loss.mean()
+
+    @torch.no_grad()
+    def sample(
+        self,
+        noise,
+        c,
+        latent_history,
+        steps=10,
+        cfg_scale=1.0,
+        sway_sampling_coef=-1.0,
+        use_epss=True,
+        patch_size=1,
+        sigma=0.25,
+        temperature=1.5,
+    ):
+        """Integrate the learned flow starting from ``noise`` and return ``(final_state, trajectory)``.
+
+        ``noise`` is rank-3 with ``patch_size`` as its last dim; it is transposed (dims 1 and 2) before
+        integration. ``cfg_scale`` below 1e-5 runs the model once with zeroed conditioning; otherwise
+        ``model.forward_with_cfg`` is used. Raises ValueError on invalid arguments.
+        """
+        if steps <= 0:
+            raise ValueError(f"steps must be positive, got {steps}")
+        if patch_size <= 0:
+            raise ValueError(f"patch_size must be positive, got {patch_size}")
+        if noise.ndim != 3:
+            raise ValueError(f"Expected noise rank-3 [Batch, Dimension, Time], got {tuple(noise.shape)}")
+        if c.ndim != 3:
+            raise ValueError(f"Expected conditioning rank-3 [Batch, Time, Dimension], got {tuple(c.shape)}")
+        if latent_history.ndim != 3:
+            raise ValueError(
+                f"Expected latent_history rank-3 [Batch, Time, Dimension], got {tuple(latent_history.shape)}"
+            )
+        if noise.shape[0] != c.shape[0] or noise.shape[0] != latent_history.shape[0]:
+            raise ValueError(
+                "Batch mismatch across noise, conditioning, and latent_history: "
+                f"{noise.shape[0]}, {c.shape[0]}, {latent_history.shape[0]}"
+            )
+        if noise.shape[-1] != patch_size:
+            raise ValueError(f"noise time dim mismatch: got {noise.shape[-1]}, expected patch_size={patch_size}")
+
+        def fn(t, x):
+            if cfg_scale < 1e-5:
+                if t.ndim == 0:
+                    t = t.repeat(x.shape[0])
+                pred = self.model(x=x, t=t, c=torch.zeros_like(c), latent_history=latent_history)
+                return pred[:, -patch_size:, :]
+
+            # predict flow (cond and uncond), for classifier-free guidance
+            pred_cfg = self.model.forward_with_cfg(
+                x=x, t=t, c=c, latent_history=latent_history, cfg_scale=cfg_scale, patch_size=patch_size
+            )
+            if pred_cfg.shape[0] == x.shape[0]:
+                # DiT.forward_with_cfg does not double the batch when cfg_scale == 1, so there is no
+                # uncond half; chunking would pair unrelated samples (or fail to unpack at batch 1).
+                return pred_cfg
+            pred, null_pred = torch.chunk(pred_cfg, 2, dim=0)
+            return pred + (pred - null_pred) * cfg_scale
+
+        # [Batch, Dimension, Time] -> [Batch, Time, Dimension], the layout the model is called with
+        y0 = noise.transpose(1, 2)
+        t_start = 0
+
+        if t_start == 0 and use_epss:  # use Empirically Pruned Step Sampling for low NFE
+            t = get_epss_timesteps(steps, device=self.device, dtype=noise.dtype)
+        else:
+            t = torch.linspace(t_start, 1, steps + 1, device=self.device, dtype=noise.dtype)
+        if sway_sampling_coef is not None:
+            t = t + sway_sampling_coef * (torch.cos(torch.pi / 2 * t) - 1 + t)
+
+        solver = Solver(fn, y0, sigma=sigma, temperature=temperature)
+        trajectory = solver.integrate(t)
+
+        # trajectory[i] is the state recorded for t[i]; the last entry is the final sample
+        return trajectory[-1], trajectory
+
+
+def _coerce_token_mask(mask, *, batch_size, target_steps, device):
+    """Return ``mask`` as a bool [batch_size, target_steps] tensor on ``device``; ValueError on a bad shape."""
+    m = mask if isinstance(mask, torch.Tensor) else torch.as_tensor(mask, device=device)
+    if m.ndim == 3 and m.shape[-1] == 1:
+        m = m.squeeze(-1)  # accept a trailing singleton axis
+    if m.ndim != 2:
+        raise ValueError(f"Expected mask rank-2 [Batch, Time] or rank-3 [Batch, Time, 1], got {tuple(m.shape)}")
+    if tuple(m.shape) != (batch_size, target_steps):
+        raise ValueError(f"Mask shape mismatch: got {tuple(m.shape)}, expected {(batch_size, target_steps)}")
+    return m.to(device=device, dtype=torch.bool)
diff --git a/vllm_omni/model_executor/models/ming_tts/fm/dit.py b/vllm_omni/model_executor/models/ming_tts/fm/dit.py
new file mode 100644
index 00000000000..2024f26ca2d
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_tts/fm/dit.py
@@ -0,0 +1,216 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adopted from https://github.com/inclusionAI/Ming-omni-tts/blob/main/fm/dit.py
+
+import math
+
+import torch
+import torch.nn as nn
+from x_transformers.x_transformers import RotaryEmbedding
+
+from .modules import DiTBlock, FinalLayer
+
+
+class SinusPositionEmbedding(nn.Module):
+    """Sinusoidal features of a batch of scalars: [Batch] -> [Batch, 2 * (dim // 2)], sines then cosines."""
+
+    def __init__(self, dim):
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, x, scale=1000):
+        if x.ndim == 0:
+            x = x.reshape(1)  # promote a scalar to a batch of one
+        if x.ndim != 1:
+            raise ValueError(f"Expected timestep rank-1 [Batch], got {tuple(x.shape)}")
+        half = self.dim // 2
+        log_step = math.log(10000) / (half - 1)
+        freqs = torch.exp(torch.arange(half, device=x.device).float() * -log_step)
+        angles = scale * x.unsqueeze(1) * freqs.unsqueeze(0)
+        return torch.cat((angles.sin(), angles.cos()), dim=-1)
+
+
+class TimestepEmbedder(nn.Module):  # sinusoidal features of the timestep followed by a Linear-SiLU-Linear MLP
+    def __init__(self, dim, freq_embed_dim=256):
+        super().__init__()
+        self.time_embed = SinusPositionEmbedding(freq_embed_dim)
+        self.time_mlp = nn.Sequential(nn.Linear(freq_embed_dim, dim), nn.SiLU(), nn.Linear(dim, dim))
+
+    def forward(self, timestep):
+        time_hidden = self.time_embed(timestep)  # frequencies are built in float32 (.float() in the embedder)
+        time_hidden = time_hidden.to(timestep.dtype)  # cast back to the timestep's dtype before the MLP
+        time = self.time_mlp(time_hidden)  # b d
+        return time
+
+
+class CondEmbedder(nn.Module):  # linear projection of the conditioning, with optional per-sample dropout
+    def __init__(self, input_feature_size, hidden_size, dropout_prob):
+        super().__init__()
+        self.dropout_prob = dropout_prob
+        self.cond_embedder = nn.Linear(input_feature_size, hidden_size)
+
+    def cond_drop(self, llm_cond):
+        if llm_cond.ndim != 3:
+            raise ValueError(f"Expected conditioning rank-3 [Batch, Time, Dimension], got {tuple(llm_cond.shape)}")
+        bsz = llm_cond.shape[0]
+        drop_latent_mask = torch.rand(bsz) < self.dropout_prob  # one drop decision per batch row
+        drop_latent_mask = drop_latent_mask.unsqueeze(-1).unsqueeze(-1).to(llm_cond.dtype).to(llm_cond.device)
+        fake_latent = torch.zeros_like(llm_cond)  # dropped rows are replaced by all-zero conditioning
+        llm_cond = drop_latent_mask * fake_latent + (1 - drop_latent_mask) * llm_cond  # mask 1 -> zeros
+
+        return llm_cond
+
+    def forward(self, llm_cond, train):
+        if llm_cond.ndim != 3:
+            raise ValueError(f"Expected conditioning rank-3 [Batch, Time, Dimension], got {tuple(llm_cond.shape)}")
+        use_dropout = self.dropout_prob > 0
+        if train and use_dropout:  # dropout only when the caller says training AND dropout_prob > 0
+            llm_cond = self.cond_drop(llm_cond)
+
+        llm_cond = self.cond_embedder(llm_cond)  # project the last dim to hidden_size
+
+        return llm_cond
+
+
+class DiT(nn.Module):  # runs DiTBlocks over the token sequence [t + cond, embedded history, embedded x]
+    def __init__(
+        self,
+        in_channels=4,
+        hidden_size=1024,
+        depth=28,
+        num_heads=16,
+        mlp_ratio=4.0,
+        llm_cond_dim=896,
+        cfg_dropout_prob=0.1,
+        **kwargs,  # forwarded unchanged to every DiTBlock
+    ):
+        super().__init__()
+
+        self.in_channels = in_channels
+        self.out_channels = in_channels  # output feature size equals the input feature size
+        self.num_heads = num_heads
+        self.t_embedder = TimestepEmbedder(hidden_size)
+        self.x_embedder = nn.Linear(in_channels, hidden_size)
+        self.c_embedder = CondEmbedder(llm_cond_dim, hidden_size, cfg_dropout_prob)
+        self.hidden_size = hidden_size
+        self.rotary_embed = RotaryEmbedding(hidden_size // num_heads)
+        self.blocks = nn.ModuleList(
+            [DiTBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio, **kwargs) for _ in range(depth)]
+        )
+        self.final_layer = FinalLayer(hidden_size, self.out_channels)
+
+    def forward(self, x, t, c, latent_history, mask=None):
+        if x.ndim != 3:
+            raise ValueError(f"Expected x rank-3 [Batch, Time, Dimension], got {tuple(x.shape)}")
+        if latent_history.ndim != 3:
+            raise ValueError(
+                f"Expected latent_history rank-3 [Batch, Time, Dimension], got {tuple(latent_history.shape)}"
+            )
+        if c.ndim != 3:
+            raise ValueError(f"Expected conditioning rank-3 [Batch, Time, Dimension], got {tuple(c.shape)}")
+        if x.shape[0] != latent_history.shape[0] or x.shape[0] != c.shape[0]:
+            raise ValueError(
+                "Batch mismatch across x, conditioning, and latent_history: "
+                f"{x.shape[0]}, {c.shape[0]}, {latent_history.shape[0]}"
+            )
+        if x.shape[-1] != self.in_channels:
+            raise ValueError(f"x feature dim mismatch: got {x.shape[-1]}, expected {self.in_channels}")
+        if latent_history.shape[-1] != self.in_channels:
+            raise ValueError(
+                f"latent_history feature dim mismatch: got {latent_history.shape[-1]}, expected {self.in_channels}"
+            )
+        if t.ndim == 0:
+            t = t.reshape(1)  # a scalar timestep only passes the batch check below when the batch is 1
+        if t.ndim != 1:
+            raise ValueError(f"Expected timestep rank-1 [Batch], got {tuple(t.shape)}")
+        if t.shape[0] != x.shape[0]:
+            raise ValueError(f"Timestep batch mismatch: got {t.shape[0]}, expected {x.shape[0]}")
+
+        t = self.t_embedder(t).unsqueeze(1)  # add a length-1 time axis so it broadcasts over cond tokens
+        x_now = self.x_embedder(x)
+        x_history = self.x_embedder(latent_history)  # history shares the same embedder as x
+        x = torch.cat([x_history, x_now], dim=1)
+        c = self.c_embedder(c, self.training)
+        y = t + c  # timestep embedding added to every conditioning token
+        x = torch.cat([y, x], dim=1)  # final token order: conditioning, history, current x
+        rope = self.rotary_embed.forward_from_seq_len(x.shape[1])
+
+        if mask is not None:
+            if mask.ndim != 2:
+                raise ValueError(f"Expected mask rank-2 [Batch, Time], got {tuple(mask.shape)}")
+            if mask.shape[0] != x_now.shape[0] or mask.shape[1] != x_now.shape[1]:
+                raise ValueError(
+                    f"Mask shape mismatch: got {tuple(mask.shape)}, expected {(x_now.shape[0], x_now.shape[1])}"
+                )
+            # conditioning and history positions reuse the mask value of the first current position
+            mask_pad = mask.clone().detach()[:, :1].expand(-1, x_history.shape[1] + c.shape[1])
+            mask = torch.cat([mask_pad, mask], dim=-1)
+        for block in self.blocks:
+            x = block(x, mask, rope)
+        x = self.final_layer(x)
+        return x
+
+    def forward_with_cfg(self, x, t, c, cfg_scale, latent_history, patch_size):
+        if patch_size <= 0:
+            raise ValueError(f"patch_size must be positive, got {patch_size}")
+        if not cfg_scale == 1:  # batch is doubled (cond + zero-cond) unless cfg_scale is exactly 1
+            x = torch.cat([x, x], dim=0)
+            latent_history = torch.cat([latent_history, latent_history], dim=0)
+            fake_latent = torch.zeros_like(c)  # the unconditional half uses all-zero conditioning
+            c = torch.cat([c, fake_latent], dim=0)
+        if t.ndim == 0:  # NOTE(review): a rank-1 t is not doubled along with x; confirm callers pass a scalar
+            t = t.repeat(x.shape[0])
+        model_out = self.forward(x, t, c, latent_history)
+        return model_out[:, -patch_size:, :]  # keep only the last patch_size positions of the sequence
+
+
+class Aggregator(nn.Module):  # runs DiTBlocks over [learned token, embedded x] and returns the first position
+    def __init__(
+        self,
+        in_channels=4,
+        hidden_size=1152,  # NOTE(review): config_ming_tts expects 1024; presumably always overridden via config
+        depth=28,
+        num_heads=16,
+        mlp_ratio=4.0,
+        llm_input_dim=896,
+        **kwargs,  # forwarded unchanged to every DiTBlock
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = in_channels
+        self.num_heads = num_heads
+
+        self.word_embedder = nn.Embedding(1, hidden_size)  # a single learned embedding (index 0)
+        self.x_embedder = nn.Linear(in_channels, hidden_size)
+        self.hidden_size = hidden_size
+
+        self.rotary_embed = RotaryEmbedding(hidden_size // num_heads)
+
+        self.blocks = nn.ModuleList(
+            [DiTBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio, **kwargs) for _ in range(depth)]
+        )
+        self.final_layer = FinalLayer(hidden_size, llm_input_dim)
+
+    def forward(self, x, mask=None):
+        if x.ndim != 3:
+            raise ValueError(f"Expected x rank-3 [Batch, Time, Dimension], got {tuple(x.shape)}")
+        if x.shape[-1] != self.in_channels:
+            raise ValueError(f"x feature dim mismatch: got {x.shape[-1]}, expected {self.in_channels}")
+        x = self.x_embedder(x)
+        cls_embed = self.word_embedder(torch.zeros((x.shape[0], 1), dtype=torch.long, device=x.device))
+        x = torch.cat([cls_embed, x], dim=1)  # prepend the learned token, one copy per batch row
+
+        rope = self.rotary_embed.forward_from_seq_len(x.shape[1])
+        if mask is not None:
+            if mask.ndim != 2:
+                raise ValueError(f"Expected mask rank-2 [Batch, Time], got {tuple(mask.shape)}")
+            if mask.shape[0] != x.shape[0] or mask.shape[1] != x.shape[1] - 1:  # x already has the extra token
+                raise ValueError(
+                    f"Mask shape mismatch: got {tuple(mask.shape)}, expected {(x.shape[0], x.shape[1] - 1)}"
+                )
+            mask_pad = mask.clone().detach()[:, :1]  # the prepended token reuses the first position's mask value
+            mask = torch.cat([mask_pad, mask], dim=-1)
+        for block in self.blocks:
+            x = block(x, mask, rope)
+        x = self.final_layer(x)
+        x = x[:, :1, :]  # keep only the output at the prepended token's position
+        return x
diff --git a/vllm_omni/model_executor/models/ming_tts/fm/flowloss.py b/vllm_omni/model_executor/models/ming_tts/fm/flowloss.py
new file mode 100644
index 00000000000..18c59186c3a
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_tts/fm/flowloss.py
@@ -0,0 +1,54 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adopted from https://github.com/inclusionAI/Ming-omni-tts/blob/main/fm/flowloss.py
+
+import torch
+import torch.nn as nn
+
+from .cfm import CFM
+from .dit import DiT
+
+
+class FlowLoss(nn.Module):
+    """Wraps a CFM built around a DiT; `forward` delegates to the CFM and `sample` validates, then samples."""
+
+    def __init__(self, z_channels, llm_cond_dim, **kwargs):
+        super().__init__()
+        self.z_channels = z_channels  # also the channel count of the noise drawn in sample()
+        self.cfm = CFM(model=DiT(in_channels=z_channels, llm_cond_dim=llm_cond_dim, **kwargs))
+
+    def forward(self, cond, target, latent_history, mask, patch_size):
+        return self.cfm(cond=cond, target=target, latent_history=latent_history, mask=mask, patch_size=patch_size)
+
+    def sample(self, z, latent_history, cfg=2.0, patch_size=1, sigma=0.25, temperature=0):
+        if z.ndim != 3:  # shape checks raise ValueError; non-finite values raise RuntimeError further down
+            raise ValueError(f"Expected z rank-3 [Batch, Time, Dimension], got {tuple(z.shape)}")
+        if z.shape[1] != 1:
+            raise ValueError(f"Expected z time dim to be 1 for Ming dense decode, got {z.shape[1]}")
+        if latent_history.ndim != 3:
+            raise ValueError(
+                f"Expected latent_history rank-3 [Batch, Time, Dimension], got {tuple(latent_history.shape)}"
+            )
+        if z.shape[0] != latent_history.shape[0]:
+            raise ValueError(f"Batch mismatch: z batch={z.shape[0]} vs latent_history batch={latent_history.shape[0]}")
+        if patch_size <= 0:
+            raise ValueError(f"patch_size must be positive, got {patch_size}")
+        if not torch.isfinite(z).all():  # NOTE(review): using a tensor in `if` likely syncs device -> host
+            raise RuntimeError("Non-finite conditioning z in FlowLoss.sample().")
+        if not torch.isfinite(latent_history).all():
+            raise RuntimeError("Non-finite latent_history in FlowLoss.sample().")
+        noise = torch.randn(z.shape[0], self.z_channels, patch_size, device=z.device)  # [B, z_channels, patch_size]
+        if not torch.isfinite(noise).all():  # defensive check on the freshly drawn noise
+            raise RuntimeError("Non-finite noise in FlowLoss.sample().")
+        noise = noise.to(dtype=z.dtype)  # match conditioning dtype — no autocast in vllm-omni
+        out, _ = self.cfm.sample(  # the second element of the returned pair is discarded
+            noise=noise,
+            c=z,
+            latent_history=latent_history,
+            cfg_scale=cfg,
+            patch_size=patch_size,
+            sigma=sigma,
+            temperature=temperature,
+        )
+        # out shape: [B, patch_size, z_channels]
+        return out
diff --git a/vllm_omni/model_executor/models/ming_tts/fm/modules.py b/vllm_omni/model_executor/models/ming_tts/fm/modules.py
new file mode 100644
index 00000000000..1163f8f1837
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_tts/fm/modules.py
@@ -0,0 +1,147 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adopted from https://github.com/inclusionAI/Ming-omni-tts/blob/main/fm/modules.py
+import torch
+import torch.nn.functional as F
+from torch import nn
+from x_transformers.x_transformers import apply_rotary_pos_emb
+
+
+class RMSNorm(nn.Module):  # RMS normalisation over the last dimension with a learnable per-feature gain
+    def __init__(self, dim: int, eps: float):
+        super().__init__()
+        self.eps = eps  # added to the mean square before the reciprocal square root
+        self.weight = nn.Parameter(torch.ones(dim))
+        # Feature-detect the fused kernel: parsing `torch.__version__[:3]` reads "2.10.x" as 2.1 and disabled it.
+        self.native_rms_norm = hasattr(F, "rms_norm")
+
+    def forward(self, x):
+        if self.native_rms_norm:
+            if self.weight.dtype in [torch.float16, torch.bfloat16]:  # run the kernel in the weight's half dtype
+                x = x.to(self.weight.dtype)
+            x = F.rms_norm(x, normalized_shape=(x.shape[-1],), weight=self.weight, eps=self.eps)
+        else:  # manual fallback for PyTorch builds without F.rms_norm
+            variance = x.to(torch.float32).pow(2).mean(-1, keepdim=True)  # mean square computed in float32
+            x = x * torch.rsqrt(variance + self.eps)
+            if self.weight.dtype in [torch.float16, torch.bfloat16]:
+                x = x.to(self.weight.dtype)
+            x = x * self.weight
+        return x
+
+
+class FeedForward(nn.Module):
+    """Position-wise MLP: Linear -> GELU -> Dropout -> Linear, widened by ``mult`` in the middle."""
+    def __init__(self, dim, dim_out=None, mult=4, dropout=0.0, approximate: str = "none"):
+        super().__init__()
+        hidden_dim = int(dim * mult)
+        out_dim = dim if dim_out is None else dim_out
+        # Keep the nested Sequential layout so parameter names stay `ff.0.0.*` and `ff.2.*`.
+        expand = nn.Sequential(nn.Linear(dim, hidden_dim), nn.GELU(approximate=approximate))
+        self.ff = nn.Sequential(expand, nn.Dropout(dropout), nn.Linear(hidden_dim, out_dim))
+
+    def forward(self, x):
+        return self.ff(x)
+
+
+class Attention(nn.Module):  # multi-head self-attention via SDPA, optional rotary embedding, masked outputs zeroed
+    def __init__(
+        self,
+        dim: int,
+        heads: int = 8,
+        dim_head: int = 64,
+        dropout: float = 0.0,
+    ):
+        super().__init__()
+        # F.scaled_dot_product_attention is required; fail at construction time if it is missing.
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("SDPA requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+
+        self.dim = dim
+        self.heads = heads
+        self.inner_dim = dim_head * heads
+        self.dropout = dropout  # only used by the output Dropout module; SDPA itself runs with dropout_p=0.0
+        self.to_q = nn.Linear(dim, self.inner_dim)
+        self.to_k = nn.Linear(dim, self.inner_dim)
+        self.to_v = nn.Linear(dim, self.inner_dim)
+        # Output projection then dropout, kept in a ModuleList so parameter names stay `to_out.0.*`.
+        self.to_out = nn.ModuleList([])
+        self.to_out.append(nn.Linear(self.inner_dim, dim))
+        self.to_out.append(nn.Dropout(dropout))
+
+    def forward(
+        self,
+        x: torch.Tensor,  # noised input x, [Batch, Time, dim]
+        mask=None,  # optional [Batch, Time] keep-mask; non-bool masks are read as "non-zero = keep"
+        rope=None,  # rotary position embedding for x
+    ) -> torch.Tensor:
+        if x.ndim != 3:
+            raise ValueError(f"Expected x rank-3 [Batch, Time, Dimension], got {tuple(x.shape)}")
+        if x.shape[-1] != self.dim:
+            raise ValueError(f"x feature dim mismatch: got {x.shape[-1]}, expected {self.dim}")
+        if mask is not None:
+            if mask.ndim != 2:
+                raise ValueError(f"Expected mask rank-2 [Batch, Time], got {tuple(mask.shape)}")
+            if mask.shape[0] != x.shape[0] or mask.shape[1] != x.shape[1]:
+                raise ValueError(f"Mask shape mismatch: got {tuple(mask.shape)}, expected {tuple(x.shape[:2])}")
+
+        batch_size = x.shape[0]
+
+        query = self.to_q(x)
+        key = self.to_k(x)
+        value = self.to_v(x)
+
+        # attention: split the projected width into [Batch, heads, Time, head_dim]
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // self.heads
+        query = query.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
+
+        # apply rotary position embedding
+        if rope is not None:
+            freqs, xpos_scale = rope
+            q_xpos_scale, k_xpos_scale = (xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)
+            query = apply_rotary_pos_emb(query, freqs, q_xpos_scale)
+            key = apply_rotary_pos_emb(key, freqs, k_xpos_scale)
+        # NOTE(review): attn_mask=None lets masked positions still be attended to; only outputs are zeroed below.
+        x = F.scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False)
+        x = x.transpose(1, 2).reshape(batch_size, -1, self.heads * head_dim)
+        x = x.to(query.dtype)
+        x = self.to_out[0](x)
+        x = self.to_out[1](x)
+
+        if mask is not None:
+            mask = mask.unsqueeze(-1).to(torch.bool)  # masked_fill needs a bool mask; accept 0/1 integer masks too
+            x = x.masked_fill(~mask, 0.0)
+
+        return x
+
+
+class DiTBlock(nn.Module):
+    """Pre-norm block: RMSNorm -> Attention, then RMSNorm -> FeedForward, each added back onto its input."""
+    def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, dropout=0.1, **kwargs):
+        super().__init__()
+        self.norm1 = RMSNorm(hidden_size, 1e-6)
+        self.attn = Attention(hidden_size, heads=num_heads, dim_head=hidden_size // num_heads, dropout=dropout)
+        self.norm2 = RMSNorm(hidden_size, 1e-6)
+        self.mlp = FeedForward(hidden_size, mult=mlp_ratio, dropout=dropout, approximate="tanh")
+
+    def forward(self, x, mask, rope):
+        attended = x + self.attn(self.norm1(x), mask=mask, rope=rope)
+        return attended + self.mlp(self.norm2(attended))
+
+
+class FinalLayer(nn.Module):
+    """Output head of the DiT: RMS-normalise the hidden states, then project them linearly.
+
+    ``hidden_size`` is the incoming feature width; ``out_channels`` is the projected width.
+    """
+
+    def __init__(self, hidden_size, out_channels):
+        super().__init__()
+        self.norm_final = RMSNorm(hidden_size, 1e-6)
+        self.linear = nn.Linear(hidden_size, out_channels)
+
+    def forward(self, x):
+        # Normalise first, then project.
+        return self.linear(self.norm_final(x))
diff --git a/vllm_omni/model_executor/models/ming_tts/ingress.py b/vllm_omni/model_executor/models/ming_tts/ingress.py
new file mode 100644
index 00000000000..77428164d5a
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_tts/ingress.py
@@ -0,0 +1,177 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
+import copy
+import os
+import time
+from typing import Any
+
+from vllm.logger import init_logger
+
+from .config_ming_tts import (
+    AUDIO_DUMMY_TOKEN_ID,
+    AUDIO_START_TOKEN_ID,
+    KEY_PROMPT_LATENTS,
+    KEY_SPEAKER_EMBEDDING,
+    KEY_TEXT_MODE,
+    MingTTSConfig,
+)
+from .prompt_builder import (
+    build_dense_prompt_token_ids,
+    coerce_speaker_embeddings,
+    count_prompt_waveform_patches,
+    create_instruction,
+)
+
+logger = init_logger(__name__)
+
+
+def _rebuild_prompt_token_ids_with_exact_patch_count(prompt_token_ids: Any, prompt_patch_count: int) -> list[int]:
+    """Keep everything up to the last <audio> token and append exactly ``prompt_patch_count`` <audioPatch> ids."""
+    if not (isinstance(prompt_token_ids, list) and prompt_token_ids):
+        raise ValueError("Ming prompt finalization requires existing prompt_token_ids")
+    # Walk right-to-left so the last <audio> marker wins; -1 means "not found".
+    audio_start_index = next(
+        (i for i in reversed(range(len(prompt_token_ids))) if int(prompt_token_ids[i]) == AUDIO_START_TOKEN_ID),
+        -1,
+    )
+    if audio_start_index < 0:
+        raise ValueError("Ming prompt finalization could not locate <audio> token")
+    prefix_ids = prompt_token_ids[: audio_start_index + 1]
+    suffix_ids = prompt_token_ids[audio_start_index + 1 :]
+    # Everything after <audio> must already be patch placeholders, otherwise the layout is unexpected.
+    if not all(int(token_id) == AUDIO_DUMMY_TOKEN_ID for token_id in suffix_ids):
+        raise ValueError("Ming prompt finalization expected only trailing <audioPatch> tokens after <audio>")
+    return prefix_ids + [AUDIO_DUMMY_TOKEN_ID] * int(prompt_patch_count)
+
+
+class MingIngressProcessor:  # callable that finalizes a Ming prompt dict before it reaches the engine
+    def __init__(self, *, vllm_config: Any, tokenizer: Any):
+        if tokenizer is None:
+            raise RuntimeError("Ming ingress processor requires an initialized tokenizer")
+
+        self.tokenizer = tokenizer
+        self.profile_ingress = (  # timing logs are opt-in through either environment variable
+            os.environ.get("MING_TTS_INGRESS_PROFILE") == "1" or os.environ.get("MING_TTS_ASYNC_DEBUG") == "1"
+        )
+
+        self.ming_config = MingTTSConfig.from_hf_config(vllm_config.model_config.hf_config)
+        self.ming_config.validate()
+
+    def __call__(self, prompt: Any) -> Any:  # returns `prompt` itself, or a shallow copy with rewritten fields
+        total_start = time.perf_counter()
+        if not isinstance(prompt, dict):
+            return prompt  # non-dict prompts pass through untouched
+
+        raw_additional_information = prompt.get("additional_information")
+        if raw_additional_information is None:
+            additional_information = {}
+        elif isinstance(raw_additional_information, dict):
+            additional_information = raw_additional_information
+        else:
+            return prompt  # additional_information is neither None nor a dict: leave the prompt as-is
+
+        modalities = prompt.get("modalities")
+        text_mode = isinstance(modalities, (list, tuple)) and ("text" in modalities) and ("audio" not in modalities)
+        if text_mode:
+            finalized_prompt = copy.copy(prompt)  # shallow copy, so the caller's dict is not reassigned into
+            finalized_additional_information = dict(additional_information)
+            finalized_additional_information[KEY_TEXT_MODE] = True
+            prompt_token_ids = finalized_prompt.get("prompt_token_ids")
+            if isinstance(prompt_token_ids, list) and prompt_token_ids:
+                if int(prompt_token_ids[-1]) == AUDIO_START_TOKEN_ID:  # drop a trailing <audio> start token
+                    finalized_prompt["prompt_token_ids"] = prompt_token_ids[:-1]
+            finalized_prompt["additional_information"] = finalized_additional_information
+            return finalized_prompt
+        # Audio path: values in additional_information take precedence over top-level prompt keys.
+        prompt_waveform = additional_information.get("prompt_waveform", prompt.get("prompt_waveform"))
+        prompt_text = additional_information.get("prompt_text", prompt.get("prompt_text"))
+        if prompt_waveform is None:
+            return prompt  # nothing to finalize without a reference waveform
+        if prompt_text is None:
+            raise RuntimeError(
+                "Ming prompt_waveform requires prompt_text before ingress can build prompt latents. "
+                "Use ming_speaker_embedding for reference-audio-only speaker conditioning."
+            )
+
+        prompt_latents = additional_information.get(KEY_PROMPT_LATENTS, prompt.get("prompt_latents"))
+        if prompt_latents is not None:  # a raw waveform and explicit latents are mutually exclusive
+            raise ValueError(
+                "Ming waveform cloning request provided both raw prompt_waveform and explicit prompt_latents. "
+                "Choose exactly one source of truth."
+            )
+
+        patch_start = time.perf_counter()
+        prompt_patch_count = count_prompt_waveform_patches(
+            prompt_waveform,
+            patch_size=self.ming_config.patch_size,
+            frame_hop=self.ming_config.audio_frame_hop,
+            vae_patch_size=self.ming_config.vae_patch_size,
+        )
+        patch_ms = (time.perf_counter() - patch_start) * 1000.0
+
+        finalized_prompt = copy.copy(prompt)
+        finalized_additional_information = dict(additional_information)
+        finalized_prompt["additional_information"] = finalized_additional_information
+
+        prompt_prefix = finalized_prompt.get("prompt")
+        text = finalized_prompt.get("text")
+        token_start = time.perf_counter()
+        if isinstance(prompt_prefix, str) and isinstance(text, str):  # raw strings available: rebuild from scratch
+            speaker_embedding = finalized_prompt.get("speaker_embedding")
+            if speaker_embedding is None:
+                speaker_embedding = finalized_additional_information.get(KEY_SPEAKER_EMBEDDING)
+            speaker_embeddings = coerce_speaker_embeddings(
+                speaker_embedding,
+                use_zero_spk_emb=bool(finalized_additional_information.get("use_zero_spk_emb", False)),
+            )
+
+            instruction = finalized_prompt.get("instruction")
+            if instruction is None:
+                instruction = finalized_additional_information.get("instruction")
+            instruction_text = instruction if isinstance(instruction, str) else create_instruction(instruction)
+
+            finalized_prompt["prompt_token_ids"] = build_dense_prompt_token_ids(
+                self.tokenizer,
+                prompt=prompt_prefix,
+                text=text,
+                instruction=instruction_text,
+                prompt_text=prompt_text,
+                speaker_count=0 if speaker_embeddings is None else len(speaker_embeddings),
+                prompt_patch_count=prompt_patch_count,
+            )
+            if self.profile_ingress:
+                elapsed_ms = (time.perf_counter() - total_start) * 1000.0
+                token_ms = (time.perf_counter() - token_start) * 1000.0
+                logger.info(
+                    "MING_INGRESS_PROFILE finalize_prompt prompt_patch_count=%d speaker_count=%d "
+                    "patch_ms=%.3f token_rebuild_ms=%.3f elapsed_ms=%.3f",
+                    prompt_patch_count,
+                    0 if speaker_embeddings is None else len(speaker_embeddings),
+                    patch_ms,
+                    token_ms,
+                    elapsed_ms,
+                )
+            return finalized_prompt
+        # Fallback: no raw prompt/text strings, so patch the existing token ids to the exact patch count.
+        finalized_prompt["prompt_token_ids"] = _rebuild_prompt_token_ids_with_exact_patch_count(
+            finalized_prompt.get("prompt_token_ids"),
+            prompt_patch_count,
+        )
+        if self.profile_ingress:
+            elapsed_ms = (time.perf_counter() - total_start) * 1000.0
+            token_ms = (time.perf_counter() - token_start) * 1000.0
+            logger.info(
+                "MING_INGRESS_PROFILE finalize_prompt prompt_patch_count=%d speaker_count=unknown "
+                "patch_ms=%.3f token_rebuild_ms=%.3f elapsed_ms=%.3f",
+                prompt_patch_count,
+                patch_ms,
+                token_ms,
+                elapsed_ms,
+            )
+        return finalized_prompt
+
+
+def build_ming_ingress_processor(*, vllm_config: Any, tokenizer: Any) -> MingIngressProcessor:
+    return MingIngressProcessor(vllm_config=vllm_config, tokenizer=tokenizer)  # keyword-only factory wrapper
diff --git a/vllm_omni/model_executor/models/ming_tts/ming_tts.py b/vllm_omni/model_executor/models/ming_tts/ming_tts.py
new file mode 100644
index 00000000000..3490c97f662
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_tts/ming_tts.py
@@ -0,0 +1,581 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
+import json
+import os
+from functools import cached_property
+from io import BytesIO
+from pathlib import Path
+from typing import Any
+
+import torch
+import torch.nn as nn
+from safetensors import safe_open
+from vllm.config import VllmConfig
+from vllm.model_executor.models import SupportsPP
+from vllm.model_executor.models.utils import init_vllm_registered_model
+from vllm.v1.sample.sampler import Sampler
+
+from vllm_omni.model_executor.custom_process_mixin import CustomProcessMixin
+
+from .audio_tokenizer.modeling_audio_vae import AudioVAE
+from .config_ming_tts import (
+    AUDIO_START_TOKEN_ID,
+    KEY_CFG,
+    KEY_DECODE_STEP,
+    KEY_LAST_STOP_PROB,
+    KEY_LATENT_HISTORY,
+    KEY_MAX_DECODE_STEPS,
+    KEY_MIN_DECODE_STEPS,
+    KEY_NEXT_EMBEDS,
+    KEY_PROMPT_LATENT_TAIL,
+    KEY_PROMPT_LATENTS,
+    KEY_REQUEST_ID,
+    KEY_SIGMA,
+    KEY_SPEAKER_EMBEDDING,
+    KEY_TEMPERATURE,
+    KEY_TEXT_MODE,
+    VISION_START_TOKEN_ID,
+    MingTTSConfig,
+)
+from .prompt_builder import (
+    coerce_prompt_waveform,
+    coerce_speaker_embeddings,
+    count_prompt_latent_patches,
+    pad_prompt_waveform,
+)
+
+MING_STOP_REASON_KEY = "ming_stop_reason"
+
+
+class _ModelSampleAdapter(nn.Module):  # exposes a wrapped model's `sample` method as a module `forward`
+    def __init__(self, model: nn.Module):
+        super().__init__()
+        self.model = model  # registered as a submodule because it is an nn.Module
+
+    def forward(self, logits, sampling_metadata):
+        return self.model.sample(logits, sampling_metadata)  # pure delegation, arguments passed through unchanged
+
+
+class MingTTSForConditionalGeneration(nn.Module, SupportsPP, CustomProcessMixin):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        del prefix  # accepted but deliberately unused
+        self.vllm_config = vllm_config
+        self.ming_config = MingTTSConfig.from_hf_config(vllm_config.model_config.hf_config)
+        self.ming_config.validate()
+        # Capability flags; the stage-specific branches below override some of these defaults.
+        self.have_multimodal_outputs = True
+        self.has_preprocess = False
+        self.has_postprocess = False
+        self.requires_raw_input_tokens = False
+
+        self.model_stage = vllm_config.model_config.model_stage
+        self._prompt_encoder = None  # filled lazily by _load_prompt_encoder()
+
+        if self.model_stage == "llm":  # this stage installs the custom pre/post-process hooks
+            self.model = init_vllm_registered_model(
+                vllm_config=vllm_config,
+                architectures=["MingLLMModel"],
+            )
+            self.has_preprocess = True
+            self.has_postprocess = True
+            self.set_custom_preprocess(self.preprocess)
+            self.set_custom_postprocess(self.postprocess)
+        elif self.model_stage == "audio_vae":  # this stage asks for raw input tokens and installs no hooks
+            self.model = init_vllm_registered_model(
+                vllm_config=vllm_config,
+                architectures=["MingAudioVAEModel"],
+            )
+            self.requires_raw_input_tokens = True
+        else:
+            raise ValueError(f"Invalid Ming model_stage={self.model_stage}")
+        # Reuse the inner model's factory when it has one; otherwise fall back to a callable returning None.
+        self.make_empty_intermediate_tensors = getattr(self.model, "make_empty_intermediate_tensors", lambda: None)
+
+    @cached_property
+    def sampler(self):  # resolved once per instance, then cached by cached_property
+        if hasattr(self.model, "sample"):  # first choice: wrap the inner model's own sample()
+            return _ModelSampleAdapter(self.model)
+        if hasattr(self.model, "sampler"):  # second choice: reuse the inner model's sampler object
+            return self.model.sampler
+        return Sampler()  # last resort: a default vLLM Sampler
+
+    def embed_input_ids(self, input_ids: torch.Tensor, **kwargs: Any) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids=input_ids, **kwargs)  # delegates to the stage's inner model
+
+    def forward(self, *args: Any, **kwargs: Any):
+        return self.model(*args, **kwargs)  # all arguments are forwarded unchanged to the inner model
+
+    def compute_logits(self, hidden_states, sampling_metadata=None):
+        return self.model.compute_logits(hidden_states, sampling_metadata=sampling_metadata)  # pure delegation
+
+    def sample(self, logits, sampling_metadata):
+        if hasattr(self.model, "sample"):  # only some inner models implement sample()
+            return self.model.sample(logits, sampling_metadata)
+        return None  # callers must handle the "inner model has no sample()" case
+
+    def load_weights(self, weights):
+        """Route checkpoint tensors to the active stage's sub-model and report what was loaded.
+
+        Returns the loaded parameter names re-prefixed with ``model.``; raises ``RuntimeError`` if no key matches.
+        """
+        if self.model_stage == "llm":
+            prefixes = ("model.", "linear_proj_audio.", "flowloss.", "stop_head.", "spk_head.")
+            empty_error = (
+                "Ming Stage-0 received no loadable checkpoint weights. "
+                "Expected prefixes: model.*, linear_proj_audio.*, flowloss.*, stop_head.*, spk_head.*"
+            )
+        else:
+            prefixes = ("audio.",)
+            empty_error = "Ming Stage-1 received no loadable checkpoint weights. Expected prefix: audio.*"
+        selected = [(key, tensor) for key, tensor in weights if key.startswith(prefixes)]
+        if not selected:
+            raise RuntimeError(empty_error)
+        return {f"model.{name}" for name in self.model.load_weights(selected)}
+
+    def preprocess(
+        self,
+        input_ids: torch.Tensor,
+        input_embeds: torch.Tensor | None,  # ignored on the "llm" stage: embeddings are rebuilt from input_ids
+        **info_dict: Any,
+    ):
+        if self.model_stage != "llm":
+            return input_ids, input_embeds, {}  # other stages pass through with an empty update dict
+
+        # vLLM hands Stage-0 a scratch inputs_embeds buffer that is zeroed at
+        # preprocess time and later becomes corrupted before the backbone call.
+        # Rebuild a fresh [T,H] embedding tensor from token ids here instead of
+        # trusting the runtime-provided buffer.
+        input_embeds = self.model.embed_input_ids(input_ids).clone()
+
+        span_len = int(input_ids.shape[0])
+        if span_len > 1:  # more than one token in the span is treated as prefill
+            return self._prefill_preprocess(input_ids, input_embeds, **info_dict)
+        return self._decode_preprocess(input_ids, input_embeds, **info_dict)
+
+    def preprocess_input(
+        self,
+        input_ids: torch.Tensor,
+        input_embeds: torch.Tensor | None,
+        **info_dict: Any,
+    ):
+        return self.preprocess(input_ids, input_embeds, **info_dict)  # alias of preprocess() under a second name
+
+    def postprocess(self, hidden_states: torch.Tensor, **info_dict: Any) -> dict[str, Any]:
+        if self.model_stage != "llm" or hidden_states.numel() == 0:
+            return {}  # nothing to report outside the "llm" stage or for empty hidden states
+
+        req_id = info_dict.get(KEY_REQUEST_ID, info_dict.get("req_id"))
+        pending = self.model.pop_postprocess_update(req_id)  # popped from the inner model for this request
+        if not pending:
+            return {}
+
+        latent_patch = pending.get("ming_latent_patch")
+        next_embeds = pending.get(KEY_NEXT_EMBEDS)
+        new_history = pending.get(KEY_LATENT_HISTORY)
+        stop_prob = _take_scalar(pending.get("ming_stop_prob"), 0)
+        stop_reason = pending.get(MING_STOP_REASON_KEY)
+        if not isinstance(latent_patch, torch.Tensor):  # latent_patch is only a presence gate; it is not returned
+            return {}
+        # NOTE(review): new_history / next_embeds are dereferenced below without a None check.
+        decode_step = int(info_dict.get(KEY_DECODE_STEP, 0))
+        update = {
+            KEY_LATENT_HISTORY: new_history.detach().to("cpu").contiguous(),
+            KEY_NEXT_EMBEDS: next_embeds.detach().to("cpu").contiguous(),
+            KEY_DECODE_STEP: decode_step + 1,  # advance the per-request decode counter
+        }
+        if stop_prob is not None:
+            update[KEY_LAST_STOP_PROB] = stop_prob
+        if isinstance(stop_reason, str):
+            update[MING_STOP_REASON_KEY] = stop_reason
+        if isinstance(req_id, str):
+            update[KEY_REQUEST_ID] = req_id
+        return update
+
+    def _prefill_preprocess(
+        self,
+        input_ids: torch.Tensor,
+        input_embeds: torch.Tensor,  # written in place at speaker and audio placeholder positions
+        **info_dict: Any,
+    ):
+        if bool(info_dict.get(KEY_TEXT_MODE, False)):  # text-only requests skip all audio conditioning
+            update: dict[str, Any] = {KEY_TEXT_MODE: True}
+            request_id = info_dict.get(KEY_REQUEST_ID, info_dict.get("req_id"))
+            if request_id is not None:
+                update[KEY_REQUEST_ID] = request_id
+            if int(input_ids.shape[0]) > 1 and int(input_ids[-1].item()) == AUDIO_START_TOKEN_ID:
+                return input_ids[:-1], input_embeds[:-1], update  # drop the trailing <audio> start token
+            return input_ids, input_embeds, update
+
+        update: dict[str, Any] = {
+            KEY_DECODE_STEP: int(info_dict.get(KEY_DECODE_STEP, 0)),
+        }
+        # Seed the latent history from prompt latents when present (None is passed otherwise).
+        prompt_latents = self._resolve_prompt_latents(info_dict)
+        history = _initial_history(
+            prompt_latents["frames"] if prompt_latents is not None else None,
+            history_size=self.ming_config.history_patch_size,
+            latent_dim=self.ming_config.latent_dim,
+            device=input_embeds.device,
+            dtype=torch.float32,
+        )
+        update[KEY_LATENT_HISTORY] = history.detach().to("cpu").contiguous()
+        update[KEY_PROMPT_LATENT_TAIL] = update[KEY_LATENT_HISTORY]  # same tensor object stored under both keys
+
+        speaker_embedding = info_dict.get(KEY_SPEAKER_EMBEDDING, info_dict.get("speaker_embedding"))
+        speaker_embeddings = coerce_speaker_embeddings(
+            speaker_embedding,
+            use_zero_spk_emb=bool(info_dict.get("use_zero_spk_emb", False)),
+        )
+        speaker_slots: list[int] = []
+        if speaker_embeddings:
+            speaker_slots = _find_speaker_placeholder_positions(input_ids, self.vllm_config.model_config.hf_config)
+            if len(speaker_slots) < len(speaker_embeddings):
+                raise RuntimeError(
+                    f"Could not locate enough speaker placeholder slots: found {len(speaker_slots)}, "
+                    f"need {len(speaker_embeddings)}"
+                )
+            for speaker_slot, spk in zip(speaker_slots, speaker_embeddings):  # surplus slots are left untouched
+                spk_proj = self.model.project_speaker_embedding(
+                    spk.to(device=input_embeds.device, dtype=input_embeds.dtype).unsqueeze(0)
+                ).squeeze(0)
+                input_embeds[speaker_slot] = spk_proj
+
+        if prompt_latents is not None and prompt_latents["patches"] is not None:
+            prompt_patches = prompt_latents["patches"].to(
+                dtype=getattr(self.model, "fm_dtype", torch.float32),
+            )
+            prompt_embeds = self.model.linear_proj_audio(prompt_patches).squeeze(1)
+            placeholder_pos = _find_audio_placeholder_positions(input_ids, self.ming_config)
+            take = min(int(placeholder_pos.numel()), int(prompt_embeds.shape[0]))  # NOTE(review): truncates silently
+            if take > 0:
+                input_embeds[placeholder_pos[:take]] = prompt_embeds[:take].to(dtype=input_embeds.dtype)
+
+        request_id = info_dict.get(KEY_REQUEST_ID, info_dict.get("req_id"))
+        if request_id is not None:
+            update[KEY_REQUEST_ID] = request_id
+        _copy_runtime_controls(update, info_dict)
+        return input_ids, input_embeds, update
+
+    def _resolve_prompt_latents(self, info_dict: dict[str, Any]) -> dict[str, torch.Tensor] | None:
+        raw_latents = info_dict.get(KEY_PROMPT_LATENTS, info_dict.get("prompt_latents"))
+        raw_waveform = info_dict.get("prompt_waveform", info_dict.get("prompt_waveforms"))
+        if raw_latents is not None and raw_waveform is not None:  # the two sources are mutually exclusive
+            raise ValueError(
+                "Ming waveform cloning request provided both raw prompt_waveform and explicit prompt_latents. "
+                "Choose exactly one source of truth."
+            )
+        # Explicit latents are tried first; the waveform is only encoded when they yield nothing.
+        direct_latents = _coerce_prompt_latents(
+            raw_latents,
+            patch_size=self.ming_config.patch_size,
+            latent_dim=self.ming_config.latent_dim,
+        )
+        if direct_latents is not None:
+            return direct_latents
+
+        if raw_waveform is None:
+            return None  # neither latents nor a waveform were supplied
+        waveform_length = info_dict.get("prompt_waveform_length")
+        latents = self._encode_prompt_waveform_to_latents(
+            raw_waveform,
+            waveform_length,
+        )
+        return _coerce_prompt_latents(
+            latents,
+            patch_size=self.ming_config.patch_size,
+            latent_dim=self.ming_config.latent_dim,
+        )
+
+    def _load_prompt_encoder(self) -> AudioVAE:
+        if self._prompt_encoder is not None:
+            return self._prompt_encoder  # already built on an earlier call
+        if self.ming_config.audio_tokenizer_config is None:
+            raise RuntimeError("Ming Stage-0 requires audio_tokenizer_config to encode prompt audio.")
+        # Build the VAE, then fill its encoder weights from the checkpoint's safetensors shards.
+        encoder = AudioVAE(self.ming_config.audio_tokenizer_config).eval()
+        state_dict = encoder.state_dict()
+        loaded = 0
+        loaded_encoder_params = set()
+        with torch.no_grad():
+            for shard_path in _iter_model_safetensors(
+                _resolve_model_to_local_path(str(self.vllm_config.model_config.model))
+            ):
+                with safe_open(str(shard_path), framework="pt", device="cpu") as handle:
+                    for key in handle.keys():
+                        if not key.startswith("audio.encoder."):  # only encoder tensors are read
+                            continue
+                        name = key[len("audio.") :]  # strip the checkpoint's "audio." prefix
+                        if name not in state_dict:
+                            continue
+                        target = state_dict[name]
+                        target.copy_(handle.get_tensor(key).to(device=target.device, dtype=target.dtype))
+                        loaded += 1
+                        loaded_encoder_params.add(name)
+        if loaded == 0:
+            raise RuntimeError("Ming prompt encoder received no audio.encoder.* weights from checkpoint.")
+        # Ensure the encode-only Stage-0 VAE is not silently running with random encoder weights.
+        expected_encoder_params = {f"encoder.{name}" for name, _ in encoder.encoder.named_parameters()}
+        missing = expected_encoder_params - loaded_encoder_params
+        if missing:
+            raise RuntimeError(
+                f"Ming prompt encoder: {len(missing)} params not loaded. First few: {sorted(missing)[:5]}"
+            )
+
+        dev = next(self.parameters()).device  # place the encoder on the device of this module's first parameter
+        try:
+            del encoder.decoder  # the decoder is dropped; only encoder.encoder is moved below
+            encoder.decoder = None
+            if dev.type != "cpu":
+                encoder.encoder.to(dev, dtype=getattr(self.model, "fm_dtype", torch.bfloat16))
+            else:
+                encoder.encoder.to(dev)  # on CPU the original dtype is kept
+        except Exception as e:
+            raise RuntimeError(f"Failed to move Ming prompt encoder to {dev}: {e}") from e
+        self._prompt_encoder = encoder
+        return encoder
+
+    @torch.inference_mode()
+    def _encode_prompt_waveform_to_latents(self, waveform: Any, waveform_length: Any = None) -> torch.Tensor:
+        encoder = self._load_prompt_encoder()  # lazily built and cached on first use
+        waveform = _normalize_prompt_waveform(waveform, target_sr=self.ming_config.sample_rate)
+        waveform = pad_prompt_waveform(
+            waveform,
+            patch_size=self.ming_config.patch_size,
+            sample_rate=self.ming_config.sample_rate,
+            frame_hop=self.ming_config.audio_frame_hop,
+        )
+        dev = next(encoder.encoder.parameters()).device
+        waveform = waveform.to(device=dev, dtype=next(encoder.encoder.parameters()).dtype)
+        if waveform_length is None:  # default: every row is treated as full length after padding
+            waveform_length = torch.full(
+                (waveform.shape[0],),
+                waveform.shape[-1],
+                dtype=torch.int32,
+                device=dev,
+            )
+        elif not isinstance(waveform_length, torch.Tensor):
+            waveform_length = torch.as_tensor(waveform_length, dtype=torch.int32, device=dev)
+        else:
+            waveform_length = waveform_length.to(device=dev, dtype=torch.int32)
+
+        latents, _ = encoder.encode_latent(waveform, waveform_length)  # second return value is discarded
+        if latents.ndim == 3 and latents.shape[0] == 1:
+            latents = latents.squeeze(0)  # drop a singleton batch axis
+        count_prompt_latent_patches(  # result unused; presumably called for its validation — TODO confirm
+            latents,
+            patch_size=self.ming_config.patch_size,
+            latent_dim=self.ming_config.latent_dim,
+        )
+        return latents.detach().to(dtype=torch.float32).contiguous()
+
+    def _decode_preprocess(
+        self,
+        input_ids: torch.Tensor,
+        input_embeds: torch.Tensor,
+        **info_dict: Any,
+    ):
+        """Build the per-request state update for one step and patch the input embedding.
+
+        Args:
+            input_ids: Token ids for this step. A cached next-step embedding is
+                only applied when exactly one id is present.
+            input_embeds: Embedding rows for those ids; row 0 is overwritten in
+                place when a cached next-step embedding is applied.
+            **info_dict: Per-request state carried between steps.
+
+        Returns:
+            ``(input_ids, input_embeds, update)`` where ``update`` holds the
+            entries to persist for the request. In text mode it only carries
+            the mode flag and, when known, the request id.
+
+        Raises:
+            RuntimeError: If the cached embedding, or the row written from it,
+                contains non-finite values.
+        """
+        request_id = info_dict.get(KEY_REQUEST_ID, info_dict.get("req_id"))
+
+        if bool(info_dict.get(KEY_TEXT_MODE, False)):
+            # Text mode keeps no latent state and copies no runtime controls.
+            text_update: dict[str, Any] = {KEY_TEXT_MODE: True}
+            if request_id is not None:
+                text_update[KEY_REQUEST_ID] = request_id
+            return input_ids, input_embeds, text_update
+
+        update: dict[str, Any] = {
+            KEY_DECODE_STEP: int(info_dict.get(KEY_DECODE_STEP, 0)),
+        }
+
+        history = info_dict.get(KEY_LATENT_HISTORY)
+        if isinstance(history, torch.Tensor):
+            update[KEY_LATENT_HISTORY] = history.detach().to("cpu").contiguous()
+        else:
+            # Allocate the zero history on the CPU directly instead of creating
+            # it on the embedding device and copying it back.
+            update[KEY_LATENT_HISTORY] = torch.zeros(
+                (self.ming_config.history_patch_size, self.ming_config.latent_dim),
+                device="cpu",
+                dtype=torch.float32,
+            )
+
+        next_embeds = info_dict.get(KEY_NEXT_EMBEDS)
+        if isinstance(next_embeds, torch.Tensor) and input_ids.numel() == 1:
+            if not torch.isfinite(next_embeds).all():
+                raise RuntimeError("Non-finite next_embeds before decode preprocess write.")
+            # Only the first hidden-size row of the cached embedding is used.
+            next_step = next_embeds.detach().reshape(-1, self.ming_config.llm_hidden_size)[0]
+            input_embeds[0] = next_step.to(device=input_embeds.device, dtype=input_embeds.dtype)
+            # Re-check after the dtype cast, which can overflow to inf.
+            if not torch.isfinite(input_embeds[0]).all():
+                raise RuntimeError("Non-finite backbone input_embeds after decode preprocess write.")
+
+        if request_id is not None:
+            update[KEY_REQUEST_ID] = request_id
+        _copy_runtime_controls(update, info_dict)
+        return input_ids, input_embeds, update
+
+
+def _copy_runtime_controls(update: dict[str, Any], info_dict: dict[str, Any]) -> None:
+    """Copy the runtime control entries that are present in ``info_dict`` into ``update`` in place."""
+    control_keys = (KEY_CFG, KEY_SIGMA, KEY_TEMPERATURE, KEY_MAX_DECODE_STEPS, KEY_MIN_DECODE_STEPS)
+    update.update({key: info_dict[key] for key in control_keys if key in info_dict})
+
+
+def _resolve_model_to_local_path(model: str) -> str:
+    """Return a local directory for ``model``.
+
+    An existing directory is returned unchanged. Anything else is looked up
+    with ``huggingface_hub.snapshot_download(..., local_files_only=True)``.
+
+    Raises:
+        RuntimeError: If the lookup fails for any reason, including a missing
+            ``huggingface_hub`` install; the original error is chained.
+    """
+    if not os.path.isdir(model):
+        try:
+            from huggingface_hub import snapshot_download
+
+            model = snapshot_download(model, local_files_only=True)
+        except Exception as exc:
+            raise RuntimeError(
+                f"Ming Stage-0 prompt encoder requires a local model snapshot, got {model!r}. "
+                "Download the model first or pass a local path."
+            ) from exc
+    return model
+
+
+def _iter_model_safetensors(local_model_path: str) -> list[Path]:
+    """List the safetensors checkpoint files under ``local_model_path``.
+
+    Lookup order: the shards named in ``model.safetensors.index.json`` (sorted,
+    de-duplicated), then a single ``model.safetensors``, then every
+    ``*.safetensors`` file in the directory (sorted).
+
+    Raises:
+        RuntimeError: If the index lists no shards, or no checkpoint is found.
+    """
+    root = Path(local_model_path)
+
+    index_file = root / "model.safetensors.index.json"
+    if index_file.exists():
+        index_data = json.loads(index_file.read_text(encoding="utf-8"))
+        shard_names = sorted({*index_data.get("weight_map", {}).values()})
+        if not shard_names:
+            raise RuntimeError(f"No checkpoint shards listed in {index_file}")
+        return [root / shard_name for shard_name in shard_names]
+
+    single = root / "model.safetensors"
+    found = [single] if single.exists() else sorted(root.glob("*.safetensors"))
+    if not found:
+        raise RuntimeError(f"No .safetensors checkpoint found under {local_model_path}")
+    return found
+
+
+def _normalize_prompt_waveform(value: Any, *, target_sr: int) -> torch.Tensor:
+    """Turn a prompt-audio input into a waveform tensor at ``target_sr``.
+
+    Accepted inputs:
+        * ``bytes``: decoded with ``torchaudio.load``; only the first channel
+          is kept, as float32.
+        * ``(samples, sample_rate)`` tuple whose rate is an ``int``.
+        * ``dict`` with samples under ``samples``/``array``/``waveform`` and a
+          rate under ``sample_rate``/``sr`` (defaults to ``target_sr``).
+        * anything else: handed to ``coerce_prompt_waveform`` unchanged, with
+          no resampling.
+    """
+
+    def _to_target_rate(wave: torch.Tensor, source_sr: Any) -> torch.Tensor:
+        # torchaudio is only imported when a rate conversion is really needed.
+        if int(source_sr) == int(target_sr):
+            return wave
+        from torchaudio.functional import resample as resample_audio
+
+        return resample_audio(wave, int(source_sr), int(target_sr))
+
+    if isinstance(value, bytes):
+        import torchaudio
+
+        decoded, source_sr = torchaudio.load(BytesIO(value))
+        return _to_target_rate(decoded[:1].to(torch.float32), source_sr)
+
+    if isinstance(value, tuple) and len(value) == 2 and isinstance(value[1], int):
+        samples, source_sr = value
+        return _to_target_rate(coerce_prompt_waveform(samples), source_sr)
+
+    if isinstance(value, dict):
+        samples = value.get("samples", value.get("array", value.get("waveform")))
+        source_sr = value.get("sample_rate", value.get("sr", target_sr))
+        return _normalize_prompt_waveform((samples, int(source_sr)), target_sr=target_sr)
+
+    return coerce_prompt_waveform(value)
+
+
+def _coerce_prompt_latents(
+    value: Any,
+    *,
+    patch_size: int,
+    latent_dim: int,
+) -> dict[str, torch.Tensor] | None:
+    """Normalize prompt latents into patch-grouped and flat-frame views.
+
+    Args:
+        value: ``None``, a tensor, or anything ``torch.as_tensor`` accepts.
+            A leading axis of size one on a rank-3 input is squeezed first.
+        patch_size: Number of frames per patch.
+        latent_dim: Size of the last (feature) axis.
+
+    Returns:
+        ``None`` when ``value`` is ``None``; otherwise a dict with ``"frames"``
+        (``[frames, latent_dim]``) and ``"patches"``
+        (``[patches, patch_size, latent_dim]``, or ``None`` for zero frames).
+
+    Raises:
+        ValueError: For an unsupported shape, or a frame count that is not a
+            multiple of ``patch_size``.
+    """
+    if value is None:
+        return None
+
+    tensor = value if isinstance(value, torch.Tensor) else torch.as_tensor(value)
+    latents = tensor.detach()
+    if latents.ndim == 3 and latents.shape[0] == 1:
+        latents = latents.squeeze(0)
+
+    # Already patch-grouped: flatten the patches into one frame sequence.
+    if latents.ndim == 3 and latents.shape[-2:] == (patch_size, latent_dim):
+        return {"patches": latents, "frames": latents.reshape(-1, latent_dim)}
+
+    if latents.ndim != 2 or latents.shape[-1] != latent_dim:
+        raise ValueError(f"Unsupported prompt latent shape: {tuple(latents.shape)}")
+
+    frame_count = latents.shape[0]
+    if frame_count % patch_size != 0:
+        raise ValueError(
+            f"Prompt latent frame count must be divisible by patch_size={patch_size}, "
+            f"got frames={int(frame_count)}"
+        )
+    # Flat frames: group them into patches unless there are no frames at all.
+    grouped = latents.reshape(-1, patch_size, latent_dim) if frame_count > 0 else None
+    return {"patches": grouped, "frames": latents}
+
+
+def _initial_history(
+    frames: torch.Tensor | None,
+    *,
+    history_size: int,
+    latent_dim: int,
+    device: torch.device,
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    """Build a zero-padded latent history seeded with the most recent frames.
+
+    Args:
+        frames: Optional ``[frames, latent_dim]`` tensor; ``None`` or an empty
+            tensor yields an all-zero history.
+        history_size: Number of rows in the returned history.
+        latent_dim: Number of columns in the returned history.
+        device: Device of the returned history.
+        dtype: Dtype of the returned history.
+
+    Returns:
+        A ``[history_size, latent_dim]`` tensor whose last rows hold the last
+        ``min(history_size, frames)`` frames; earlier rows stay zero.
+    """
+    history = torch.zeros((history_size, latent_dim), device=device, dtype=dtype)
+    if frames is None or frames.numel() == 0:
+        return history
+
+    take = min(history_size, int(frames.shape[0]))
+    if take <= 0:
+        # A `-0` slice selects the whole tensor, so an empty history would
+        # otherwise be assigned every frame and fail with a shape mismatch.
+        return history
+    # Slice before converting so only the rows that are kept get transferred.
+    history[-take:] = frames[-take:].to(device=device, dtype=dtype)
+    return history
+
+
+def _take_index(value: Any, idx: int) -> torch.Tensor | None:
+    """Return ``value[idx]`` for a non-empty tensor, otherwise ``None``."""
+    if isinstance(value, torch.Tensor) and value.numel() > 0:
+        return value[idx]
+    return None
+
+
+def _take_scalar(value: Any, idx: int) -> float | None:
+    """Return element ``idx`` of the flattened tensor as a float, or ``None`` for non-tensor/empty input."""
+    if isinstance(value, torch.Tensor) and value.numel() > 0:
+        flat = value.reshape(-1)
+        return float(flat[idx].item())
+    return None
+
+
+def _find_audio_placeholder_positions(input_ids: torch.Tensor, cfg: MingTTSConfig) -> torch.Tensor:
+    """Locate the audio dummy-token positions in a 1-D ``input_ids`` tensor.
+
+    Dummy tokens strictly between the first audio-start token and the first
+    audio-end token (or the end of the sequence when there is no end token)
+    are preferred. Every dummy position is returned when there is no start
+    token or when that window contains no dummy token.
+    """
+
+    def _positions_of(token_id: int) -> torch.Tensor:
+        return (input_ids == token_id).nonzero(as_tuple=True)[0]
+
+    dummy_pos = _positions_of(cfg.audio_dummy_token_id)
+    if dummy_pos.numel() == 0:
+        return dummy_pos
+
+    start_hits = _positions_of(cfg.audio_start_token_id)
+    if start_hits.numel() == 0:
+        return dummy_pos
+    end_hits = _positions_of(cfg.audio_end_token_id)
+
+    lower = int(start_hits[0].item())
+    upper = int(end_hits[0].item()) if end_hits.numel() > 0 else int(input_ids.shape[0])
+    inside = dummy_pos[(dummy_pos > lower) & (dummy_pos < upper)]
+    if inside.numel() > 0:
+        return inside
+    return dummy_pos
+
+
+def _find_speaker_placeholder_positions(input_ids: torch.Tensor, hf_config: Any) -> list[int]:
+    """Return the in-range positions that directly follow each vision-start token.
+
+    The token id comes from ``hf_config.vision_start_token_id`` and falls back
+    to ``VISION_START_TOKEN_ID``. A marker in the last position has no slot
+    after it and is dropped.
+    """
+    marker_id = int(getattr(hf_config, "vision_start_token_id", VISION_START_TOKEN_ID))
+    marker_positions = (input_ids == marker_id).nonzero(as_tuple=True)[0]
+    seq_len = int(input_ids.shape[0])
+    candidate_slots = (int(position.item()) + 1 for position in marker_positions)
+    return [slot for slot in candidate_slots if slot < seq_len]
diff --git a/vllm_omni/model_executor/models/ming_tts/ming_tts_audio_vae.py b/vllm_omni/model_executor/models/ming_tts/ming_tts_audio_vae.py
new file mode 100644
index 00000000000..17b900fcfe4
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_tts/ming_tts_audio_vae.py
@@ -0,0 +1,318 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
+import warnings
+from collections.abc import Iterable
+from typing import Any
+
+import torch
+import torch.nn as nn
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+
+from vllm_omni.model_executor.models.output_templates import OmniOutput
+
+from .audio_tokenizer.modeling_audio_vae import AudioVAE
+from .config_ming_tts import KEY_CHUNK_ID, KEY_REQUEST_ID, MingTTSConfig
+
+logger = init_logger(__name__)
+
+# Keys looked up in the per-request info dict when the final chunk of a
+# request is logged by MingAudioVAEModel.forward.
+MING_STOP_REASON_KEY = "ming_stop_reason"
+MING_FINAL_DECODE_STEP_KEY = "ming_final_decode_step"
+
+
+class MingAudioVAEModel(nn.Module):
+    """Streaming latent-to-waveform decoder built around ``AudioVAE``.
+
+    Each request's decoder cache (``past_key_values``), stream state and
+    running patch/sample totals are kept in dicts keyed by request id. They
+    are dropped once the request's final chunk has been handled.
+    """
+
+    input_modalities = "audio"
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        """Build the VAE from the HF config; ``prefix`` is accepted but unused.
+
+        Raises:
+            ValueError: If the config carries no ``audio_tokenizer_config``.
+        """
+        super().__init__()
+        del prefix
+        self.vllm_config = vllm_config
+        self.ming_config = MingTTSConfig.from_hf_config(vllm_config.model_config.hf_config)
+        if self.ming_config.audio_tokenizer_config is None:
+            raise ValueError("MingAudioVAEModel requires audio_tokenizer_config")
+        self.ming_config.validate()
+
+        self.audio = AudioVAE(self.ming_config.audio_tokenizer_config)
+        # Capability flags; presumably read by the surrounding runner - confirm.
+        self.have_multimodal_outputs = True
+        self.has_preprocess = False
+        self.has_postprocess = False
+        self.enable_update_additional_information = True
+        self.requires_raw_input_tokens = True
+
+        # Per-request streaming state, all keyed by request id.
+        self._past_key_values: dict[str, Any] = {}
+        self._stream_state: dict[str, tuple[Any, Any, Any]] = {}
+        self._patch_totals: dict[str, int] = {}
+        self._sample_totals: dict[str, int] = {}
+
+    def embed_input_ids(self, input_ids: torch.Tensor, **_: Any) -> torch.Tensor:
+        """Return float32 zeros of shape ``[num_tokens, llm_hidden_size]``.
+
+        ``None`` or empty ids yield an empty ``[0, llm_hidden_size]`` tensor on
+        the configured device. Extra keyword arguments are ignored.
+        """
+        hidden_size = int(self.ming_config.llm_hidden_size)
+        if input_ids is None or input_ids.numel() == 0:
+            return torch.empty((0, hidden_size), device=self.vllm_config.device_config.device, dtype=torch.float32)
+        return torch.zeros((input_ids.shape[0], hidden_size), device=input_ids.device, dtype=torch.float32)
+
+    def compute_logits(self, hidden_states: torch.Tensor | OmniOutput, sampling_metadata: Any = None) -> None:
+        """Always return ``None``; both arguments are discarded."""
+        del hidden_states, sampling_metadata
+        return None
+
+    def chunked_decode_streaming(
+        self,
+        latent_chunk: torch.Tensor,
+        *,
+        request_id: str,
+        finished: bool,
+    ) -> tuple[torch.Tensor, Any, Any, bool, bool]:
+        """Decode a chunk one patch at a time, resuming the request's cached state.
+
+        This method only reads the per-request caches; the caller decides
+        whether to store or clear the returned state.
+
+        Args:
+            latent_chunk: Patches stacked along dim 0.
+            request_id: Key used to look up the cached decoder state.
+            finished: Whether this is the request's final chunk; only the last
+                patch of a final chunk is decoded with ``last_chunk=True``.
+
+        Returns:
+            ``(waveform, stream_state, past_key_values, had_past_key_values,
+            had_stream_state)``: the flat float32 waveform, the state after the
+            last decoded patch, and whether cached state existed beforehand.
+        """
+        had_past_key_values = request_id in self._past_key_values
+        had_stream_state = _has_stream_state(self._stream_state.get(request_id))
+        stream_state = self._stream_state.get(request_id, (None, None, None))
+        past_key_values = self._past_key_values.get(request_id)
+        waveform_parts: list[torch.Tensor] = []
+
+        patch_count = int(latent_chunk.shape[0])
+        for patch_idx in range(patch_count):
+            # [Batch, Time, Dimension] = [1, patch_size, latent_dim]
+            latent_patch = latent_chunk[patch_idx : patch_idx + 1]
+            is_last_patch = finished and patch_idx == patch_count - 1
+            waveform, stream_state, past_key_values = self.audio.decode(
+                latent_patch,
+                past_key_values=past_key_values,
+                use_cache=True,
+                stream_state=stream_state,
+                last_chunk=is_last_patch,
+            )
+            waveform_parts.append(waveform.reshape(-1).to(torch.float32))
+
+        # A chunk with zero patches yields an empty waveform on the chunk's device.
+        waveform_flat = (
+            torch.cat(waveform_parts, dim=0)
+            if waveform_parts
+            else torch.zeros((0,), dtype=torch.float32, device=latent_chunk.device)
+        )
+        return waveform_flat, stream_state, past_key_values, had_past_key_values, had_stream_state
+
+    @torch.inference_mode()
+    def forward(
+        self,
+        input_ids: torch.Tensor | None = None,
+        positions: torch.Tensor | None = None,
+        intermediate_tensors: Any = None,
+        inputs_embeds: torch.Tensor | None = None,
+        model_intermediate_buffer: list[dict[str, Any]] | None = None,
+        **kwargs: Any,
+    ) -> OmniOutput:
+        """Decode each request's latent chunk into a waveform.
+
+        The token-level arguments are discarded. Per-request payloads come
+        from ``model_intermediate_buffer`` or, when that is not a list, from
+        ``kwargs["runtime_additional_information"]``.
+
+        Returns:
+            An ``OmniOutput`` whose ``multimodal_outputs`` hold one flat
+            waveform (``"model_outputs"``) and one sample-rate tensor
+            (``"sr"``) per request, in payload order.
+
+        Raises:
+            RuntimeError: For a Ming payload without a request id, a stripped
+                payload, a missing or empty latent chunk on an unfinished
+                request, or an empty waveform from an unfinished request that
+                already had cached streaming state.
+        """
+        del input_ids, positions, intermediate_tensors, inputs_embeds
+
+        info_list = model_intermediate_buffer if isinstance(model_intermediate_buffer, list) else None
+        if info_list is None:
+            runtime_infos = kwargs.get("runtime_additional_information")
+            info_list = runtime_infos if isinstance(runtime_infos, list) else None
+        # With no payload at all, process a single empty request.
+        info_list = info_list or [{}]
+        num_reqs = max(len(info_list), 1)
+        sr_tensor = torch.tensor(int(self.ming_config.sample_rate), dtype=torch.int32)
+        empty = torch.zeros((0,), dtype=torch.float32, device=self.vllm_config.device_config.device)
+
+        outputs: list[torch.Tensor] = []
+        srs: list[torch.Tensor] = []
+
+        for idx in range(num_reqs):
+            # Non-dict entries are treated as an empty payload.
+            info = info_list[idx] if idx < len(info_list) and isinstance(info_list[idx], dict) else {}
+            has_ming_context = _has_ming_context(info)
+            if has_ming_context and KEY_REQUEST_ID not in info:
+                raise RuntimeError(
+                    f"Ming Stage-2 received a payload without {KEY_REQUEST_ID}. keys={sorted(info.keys())}"
+                )
+            # Falls back to the batch index when the payload has no request id.
+            request_id = _resolve_request_id(info, idx)
+            chunk_id = _coerce_optional_int(info.get(KEY_CHUNK_ID))
+            # A payload without a "stream_finished" entry is treated as finished.
+            finished = _coerce_finished(info.get("stream_finished", torch.tensor(True)))
+            latent = info.get("ming_latent_patches")
+            stripped = bool(info.get("_ming_payload_stripped", False))
+            if stripped:
+                raise RuntimeError(
+                    "Ming Stage-2 payload was stripped before model entry. "
+                    f"request_id={request_id} chunk_id={chunk_id} keys={sorted(info.keys())}"
+                )
+            if latent is None:
+                if has_ming_context and not finished:
+                    raise RuntimeError(
+                        "Ming Stage-2 received no latent chunk for an unfinished request. "
+                        f"request_id={request_id} chunk_id={chunk_id} keys={sorted(info.keys())}"
+                    )
+                if finished:
+                    self._clear_request_state(request_id)
+                outputs.append(empty)
+                srs.append(sr_tensor)
+                continue
+
+            latent_tensor = _coerce_latent_chunk(
+                latent,
+                device=self.vllm_config.device_config.device,
+                dtype=next(self.audio.parameters()).dtype,
+                latent_dim=self.ming_config.latent_dim,
+                patch_size=self.ming_config.patch_size,
+            )
+            if latent_tensor is None or latent_tensor.numel() == 0:
+                if not finished:
+                    raise RuntimeError(
+                        "Ming Stage-2 received an empty latent chunk before final flush. "
+                        f"request_id={request_id} chunk_id={chunk_id} latent_shape={_shape_of(latent_tensor)}"
+                    )
+                # NOTE(review): always true here, because the unfinished case raised above.
+                if finished:
+                    self._clear_request_state(request_id)
+                outputs.append(empty)
+                srs.append(sr_tensor)
+                continue
+
+            patch_count = int(latent_tensor.shape[0])
+            waveform_flat, stream_state, past_key_values, had_past_key_values, had_stream_state = (
+                self.chunked_decode_streaming(
+                    latent_tensor,
+                    request_id=request_id,
+                    finished=finished,
+                )
+            )
+            # Running totals are updated before the empty-waveform check below.
+            total_patch_count = self._patch_totals.get(request_id, 0) + patch_count
+            total_waveform_numel = self._sample_totals.get(request_id, 0) + int(waveform_flat.numel())
+            self._patch_totals[request_id] = total_patch_count
+            self._sample_totals[request_id] = total_waveform_numel
+            if (had_past_key_values or had_stream_state) and waveform_flat.numel() == 0 and not finished:
+                raise RuntimeError(
+                    "Ming Stage-2 produced an empty waveform after cached streaming state already existed. "
+                    f"request_id={request_id} chunk_id={chunk_id} latent_shape={tuple(latent_tensor.shape)} "
+                    f"had_past_key_values={had_past_key_values} had_stream_state={had_stream_state}"
+                )
+
+            if finished:
+                # NOTE(review): this tag says STAGE1 while the errors in this
+                # method say Stage-2 - confirm which name is intended.
+                logger.info(
+                    "MING_STAGE1_FINAL %s",
+                    {
+                        "request_id": request_id,
+                        "chunk_id": chunk_id,
+                        "stop_reason": info.get(MING_STOP_REASON_KEY),
+                        "final_decode_step": _coerce_optional_int(info.get(MING_FINAL_DECODE_STEP_KEY)),
+                        "final_chunk_patch_count": patch_count,
+                        "total_patch_count": total_patch_count,
+                        "final_chunk_waveform_numel": int(waveform_flat.numel()),
+                        "total_waveform_numel": total_waveform_numel,
+                    },
+                )
+                self._clear_request_state(request_id)
+            else:
+                # Keep the decoder state so the next chunk resumes from it.
+                self._past_key_values[request_id] = past_key_values
+                self._stream_state[request_id] = stream_state
+
+            outputs.append(waveform_flat)
+            srs.append(sr_tensor)
+
+        return OmniOutput(
+            text_hidden_states=None,
+            multimodal_outputs={
+                "model_outputs": outputs,
+                "sr": srs,
+            },
+        )
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        """Load checkpoint tensors whose names match this module's parameters.
+
+        Names without a matching parameter are skipped and reported in a
+        single warning.
+
+        Returns:
+            The set of parameter names that were loaded.
+
+        Raises:
+            RuntimeError: If ``weights`` is empty, or any ``audio.*`` parameter
+                was left unloaded.
+        """
+        # Materialized so that an empty iterable can be detected up front.
+        weights = list(weights)
+        if not weights:
+            raise RuntimeError("MingAudioVAEModel received no checkpoint weights.")
+
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        loaded: set[str] = set()
+        skipped: list[str] = []
+
+        for name, loaded_weight in weights:
+            if name not in params_dict:
+                skipped.append(name)
+                continue
+            param = params_dict[name]
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            weight_loader(param, loaded_weight)
+            loaded.add(name)
+
+        missing = {name for name in params_dict if name.startswith("audio.")} - loaded
+        if missing:
+            raise RuntimeError(f"MingAudioVAEModel: {len(missing)} params not loaded. First few: {sorted(missing)[:5]}")
+        if skipped:
+            warnings.warn(
+                f"MingAudioVAEModel: skipped {len(skipped)} checkpoint keys during load. First few: {skipped[:8]}",
+                stacklevel=2,
+            )
+        return loaded
+
+    def _clear_request_state(self, request_id: str) -> None:
+        """Drop every cached entry for ``request_id``; unknown ids are ignored."""
+        self._past_key_values.pop(request_id, None)
+        self._stream_state.pop(request_id, None)
+        self._patch_totals.pop(request_id, None)
+        self._sample_totals.pop(request_id, None)
+
+
+def _coerce_finished(value: Any) -> bool:
+    """Interpret a "finished" flag: a tensor uses its first element (empty means ``False``), anything else ``bool()``."""
+    if not isinstance(value, torch.Tensor):
+        return bool(value)
+    if value.numel() > 0:
+        first = value.reshape(-1)[0]
+        return bool(first.item())
+    return False
+
+
+def _coerce_optional_int(value: Any, default: int | None = None) -> int | None:
+    """Best-effort conversion of a loosely typed value to ``int``.
+
+    Args:
+        value: ``None``, a scalar, a list (its first element is used), or a
+            tensor (its first flattened element is used).
+        default: Returned whenever no integer can be produced.
+
+    Returns:
+        The converted integer, or ``default`` for ``None``, an empty tensor,
+        or a value that cannot be converted. That includes non-finite floats,
+        whether they arrive as plain scalars or inside a tensor.
+    """
+    if value is None:
+        return default
+    if isinstance(value, list):
+        value = value[0] if value else default
+    if isinstance(value, torch.Tensor):
+        if value.numel() == 0:
+            return default
+        # Unwrap to a Python scalar so tensors share the guarded conversion below.
+        value = value.reshape(-1)[0].item()
+    try:
+        return int(value)
+    except (TypeError, ValueError, OverflowError):
+        # int(nan) raises ValueError and int(inf) raises OverflowError.
+        return default
+
+
+def _resolve_request_id(info: dict[str, Any], idx: int) -> str:
+    """Return the payload's request id, or ``str(idx)`` when it has none.
+
+    Raises:
+        RuntimeError: If a request id is present but is not a non-empty string.
+    """
+    candidate = info.get(KEY_REQUEST_ID)
+    if candidate is None:
+        return str(idx)
+    if isinstance(candidate, str) and candidate:
+        return candidate
+    raise RuntimeError(f"Ming Stage-2 received invalid request id: {candidate!r}")
+
+
+def _has_ming_context(info: dict[str, Any]) -> bool:
+    """Report whether ``info`` contains any of the keys that mark a Ming payload."""
+    marker_keys = (KEY_REQUEST_ID, KEY_CHUNK_ID, "ming_latent_patches", "_ming_payload_stripped")
+    for marker in marker_keys:
+        if marker in info:
+            return True
+    return False
+
+
+def _shape_of(value: Any) -> tuple[int, ...] | None:
+    """Return a tensor's shape as a plain tuple, or ``None`` for any non-tensor value."""
+    return tuple(value.shape) if isinstance(value, torch.Tensor) else None
+
+
+def _has_stream_state(value: Any) -> bool:
+    """Return ``True`` only for a tuple that holds at least one non-``None`` item."""
+    if isinstance(value, tuple):
+        for item in value:
+            if item is not None:
+                return True
+    return False
+
+
+def _coerce_latent_chunk(
+    value: Any,
+    *,
+    device: torch.device,
+    dtype: torch.dtype,
+    latent_dim: int,
+    patch_size: int,
+) -> torch.Tensor | None:
+    """Validate a latent chunk and move it to ``device``/``dtype``.
+
+    A rank-2 input gains a leading axis of size one. The result must be
+    rank-3 with ``patch_size`` on the second-to-last axis and ``latent_dim``
+    on the last.
+
+    Raises:
+        ValueError: If the rank, patch size or latent dim does not match.
+    """
+    tensor = value if isinstance(value, torch.Tensor) else torch.as_tensor(value)
+    latent = tensor.detach()
+    if latent.ndim == 2:
+        latent = latent.unsqueeze(0)
+    if latent.ndim != 3:
+        raise ValueError(f"Expected latent chunk rank-3 [Batch, Time, Dimension], got {tuple(latent.shape)}")
+
+    time_len, feature_dim = latent.shape[-2], latent.shape[-1]
+    if time_len != patch_size:
+        raise ValueError(f"Latent patch size mismatch: got {time_len}, expected {patch_size}")
+    if feature_dim != latent_dim:
+        raise ValueError(f"Latent dim mismatch: got {feature_dim}, expected {latent_dim}")
+    return latent.to(device=device, dtype=dtype)
diff --git a/vllm_omni/model_executor/models/ming_tts/ming_tts_llm.py b/vllm_omni/model_executor/models/ming_tts/ming_tts_llm.py
new file mode 100644
index 00000000000..a811009e0ba
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_tts/ming_tts_llm.py
@@ -0,0 +1,864 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
+import warnings
+from collections.abc import Iterable
+from typing import Any
+
+import torch
+import torch.nn as nn
+from vllm.config import VllmConfig
+from vllm.forward_context import get_forward_context, is_forward_context_available
+from vllm.logger import init_logger
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader, maybe_remap_kv_scale_name
+from vllm.model_executor.models.utils import init_vllm_registered_model, is_pp_missing_parameter, maybe_prefix
+from vllm.sequence import IntermediateTensors
+from vllm.v1.outputs import SamplerOutput
+from vllm.v1.sample.metadata import SamplingMetadata
+
+from vllm_omni.model_executor.models.output_templates import OmniOutput
+
+from .config_ming_tts import (
+    KEY_CFG,
+    KEY_DECODE_STEP,
+    KEY_LATENT_HISTORY,
+    KEY_MAX_DECODE_STEPS,
+    KEY_MIN_DECODE_STEPS,
+    KEY_NEXT_EMBEDS,
+    KEY_REQUEST_ID,
+    KEY_SIGMA,
+    KEY_TEMPERATURE,
+    KEY_TEXT_MODE,
+    MingTTSConfig,
+)
+from .fm.dit import Aggregator
+from .fm.flowloss import FlowLoss
+
+logger = init_logger(__name__)
+
+# Stop-reason labels. CONTINUE is the initial per-token value in
+# MingLLMModel.forward; the other two presumably mark a stop-head decision and
+# hitting the decode-step limit - confirm against the sampling code.
+MING_STOP_REASON_CONTINUE = "continue"
+MING_STOP_REASON_STOP_HEAD = "stop_head"
+MING_STOP_REASON_MAX_DECODE_STEPS = "max_decode_steps"
+# Same string as MING_STOP_REASON_KEY in ming_tts_audio_vae.py; keep the two in sync.
+MING_STOP_REASON_KEY = "ming_stop_reason"
+
+
+class MingLLMModel(nn.Module):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        """Build the Qwen2 backbone plus the Ming audio heads.
+
+        Args:
+            vllm_config: Engine configuration; its HF config is converted into
+                a validated ``MingTTSConfig``.
+            prefix: Parameter-name prefix; the backbone is registered under
+                ``<prefix>.model``.
+        """
+        super().__init__()
+
+        hf_config = vllm_config.model_config.hf_config
+        self.ming_config = MingTTSConfig.from_hf_config(hf_config)
+        self.ming_config.validate()
+
+        self.vllm_config = vllm_config
+        self.prefix = prefix
+        self.quant_config = vllm_config.quant_config
+        # Dtype applied to the flow-matching modules and heads below.
+        self.fm_dtype = _resolve_ming_runtime_dtype(vllm_config)
+
+        self.model = init_vllm_registered_model(
+            vllm_config=vllm_config,
+            prefix=maybe_prefix(prefix, "model"),
+            architectures=["Qwen2ForCausalLM"],
+        )
+
+        # Maps latent patches into the LLM embedding width.
+        self.linear_proj_audio = Aggregator(
+            in_channels=self.ming_config.latent_dim,
+            llm_input_dim=self.ming_config.llm_hidden_size,
+            **self.ming_config.aggregator_config,
+        )
+        # Flow-matching head conditioned on LLM hidden states.
+        self.flowloss = FlowLoss(
+            z_channels=self.ming_config.latent_dim,
+            llm_cond_dim=self.ming_config.llm_hidden_size,
+            **self.ming_config.ditar_config,
+        )
+
+        self.stop_head = nn.Linear(self.ming_config.llm_hidden_size, 2, bias=True)
+        # NOTE(review): 192 is presumably the speaker-embedding width - confirm
+        # against the speaker encoder and consider moving it into the config.
+        self.spk_head = nn.Linear(192, self.ming_config.llm_hidden_size, bias=True)
+        self.flowloss.to(dtype=self.fm_dtype)
+        self.linear_proj_audio.to(dtype=self.fm_dtype)
+        self.stop_head.to(dtype=self.fm_dtype)
+        self.spk_head.to(dtype=self.fm_dtype)
+        # Per-step caches; forward() resets the _last_sample_* fields in text mode.
+        self._pending_postprocess_updates: dict[str, dict[str, Any]] = {}
+        self._last_sample_decode_steps: torch.Tensor | None = None
+        self._last_sample_stop_probs: torch.Tensor | None = None
+        self._last_sample_max_decode_steps: torch.Tensor | None = None
+        self._last_sample_min_decode_steps: torch.Tensor | None = None
+        self._pending_sample_stop_inputs = None
+        self._last_text_mode: bool = False
+
+    def get_input_embeddings(self) -> nn.Module:
+        """Return the backbone's ``embed_tokens`` module.
+
+        Looks on ``self.model`` first, then on ``self.model.model``.
+
+        Raises:
+            AttributeError: If neither object exposes ``embed_tokens``.
+        """
+        holder = self.model
+        if not hasattr(holder, "embed_tokens"):
+            # Some wrappers keep the transformer one level down.
+            holder = getattr(holder, "model", None)
+            if not hasattr(holder, "embed_tokens"):
+                raise AttributeError("Could not locate token embeddings on Ming Qwen2 backbone.")
+        return holder.embed_tokens
+
+    def embed_input_ids(
+        self,
+        input_ids: torch.Tensor,
+        inputs_embeds: torch.Tensor | None = None,
+        **_: Any,
+    ) -> torch.Tensor:
+        """Return embeddings for ``input_ids``.
+
+        Precomputed ``inputs_embeds`` are returned unchanged. Otherwise the
+        backbone's own ``embed_input_ids`` is used when it has one, falling
+        back to the ``embed_tokens`` module. Extra keyword arguments are ignored.
+        """
+        if inputs_embeds is not None:
+            return inputs_embeds
+        if hasattr(self.model, "embed_input_ids"):
+            embed = self.model.embed_input_ids
+        else:
+            embed = self.get_input_embeddings()
+        return embed(input_ids)
+
+    def project_speaker_embedding(self, spk_emb: torch.Tensor) -> torch.Tensor:
+        """Apply the ``spk_head`` projection to a speaker embedding."""
+        projected = self.spk_head(spk_emb)
+        return projected
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        latent_history: torch.Tensor | None = None,
+        model_intermediate_buffer: list[dict[str, Any]] | None = None,
+        seq_token_counts: list[int] | None = None,
+        **kwargs: object,
+    ) -> OmniOutput | IntermediateTensors | torch.Tensor:
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_input_ids(input_ids)
+
+        if model_intermediate_buffer is None:
+            model_intermediate_buffer = kwargs.get("runtime_additional_information")
+        request_infos = _normalize_request_infos(model_intermediate_buffer)
+        backbone_out = self.model(
+            input_ids=input_ids,
+            positions=positions,
+            intermediate_tensors=intermediate_tensors,
+            inputs_embeds=inputs_embeds,
+        )
+        if isinstance(backbone_out, IntermediateTensors):
+            return backbone_out
+
+        hidden_states = _extract_hidden_states(backbone_out)
+        token_counts = _get_request_token_counts(hidden_states, request_infos, seq_token_counts)
+        text_mode = bool(request_infos) and all(bool(info.get(KEY_TEXT_MODE, False)) for info in request_infos)
+        if request_infos and any(bool(info.get(KEY_TEXT_MODE, False)) for info in request_infos) and not text_mode:
+            raise RuntimeError("Mixed Ming text/audio modes in one Stage-0 batch are unsupported.")
+
+        if text_mode:
+            self._last_text_mode = True
+            self._last_sample_decode_steps = None
+            self._last_sample_stop_probs = None
+            self._last_sample_max_decode_steps = None
+            self._last_sample_min_decode_steps = None
+            return OmniOutput(
+                text_hidden_states=hidden_states,
+                multimodal_outputs={KEY_TEXT_MODE: True},
+                intermediate_tensors=intermediate_tensors,
+            )
+        self._last_text_mode = False
+
+        if latent_history is None and not token_counts:
+            return OmniOutput(
+                text_hidden_states=hidden_states,
+                multimodal_outputs=None,
+                intermediate_tensors=intermediate_tensors,
+            )
+
+        if latent_history is not None and not token_counts:
+            token_counts = [hidden_states.shape[0]]
+            request_infos = [{KEY_LATENT_HISTORY: latent_history}]
+
+        total_tokens = hidden_states.shape[0]
+        latent_patch_tokens = None
+        next_embed_tokens = None
+        new_history_tokens = None
+        stop_prob_tokens = None
+        decode_step_tokens = None
+        has_patch = None
+        max_decode_step_tokens = None
+        pending_updates: dict[str, dict[str, Any]] = {}
+        stop_reason_tokens: list[str] | None = None
+        sampled_decode_steps = []
+        sampled_stop_probs = []
+        sampled_max_decode_steps = []
+        sampled_min_decode_steps = []
+
+        cursor = 0
+        any_decode = False
+        for req_idx, token_count in enumerate(token_counts):
+            end = min(cursor + token_count, total_tokens)
+            if end <= cursor:
+                continue
+
+            req_info = request_infos[req_idx] if req_idx < len(request_infos) else {}
+            req_id = req_info.get(KEY_REQUEST_ID)
+            req_history = req_info.get(KEY_LATENT_HISTORY)
+            if req_history is None:
+                cursor = end
+                continue
+            decode_step = int(req_info.get(KEY_DECODE_STEP, req_info.get("generated_len", 0)))
+
+            req_history = _coerce_latent_history(
+                req_history,
+                device=hidden_states.device,
+                dtype=self.fm_dtype,
+                cfg=self.ming_config,
+            )
+            if req_history is None:
+                cursor = end
+                continue
+
+            if token_count == 1:
+                decode_hidden = hidden_states[cursor:end]
+                output_index = cursor
+            else:
+                # [T,H] prefill span -> use the last prompt token [1,H] to seed
+                # the first FlowLoss patch, matching upstream Ming.
+                decode_hidden = hidden_states[end - 1 : end]
+                output_index = end - 1
+            req_cfg = _resolve_runtime_float(req_info, KEY_CFG, self.ming_config.cfg)
+            req_sigma = _resolve_runtime_float(req_info, KEY_SIGMA, self.ming_config.sigma)
+            req_temperature = _resolve_runtime_float(req_info, KEY_TEMPERATURE, self.ming_config.temperature)
+            req_max_decode_steps = _resolve_runtime_int(
+                req_info, KEY_MAX_DECODE_STEPS, self.ming_config.max_decode_steps
+            )
+            req_min_decode_steps = _resolve_optional_runtime_int(req_info, KEY_MIN_DECODE_STEPS, 0)
+            sampled_token_latent, next_embeds, new_history, stop_probs = self._decode_one_step(
+                hidden_states=decode_hidden,
+                latent_history=req_history,
+                cfg_scale=req_cfg,
+                sigma=req_sigma,
+                temperature=req_temperature,
+            )
+
+            if latent_patch_tokens is None:
+                latent_patch_tokens = sampled_token_latent.new_zeros(
+                    (total_tokens, self.ming_config.patch_size, self.ming_config.latent_dim)
+                )
+                next_embed_tokens = next_embeds.new_zeros((total_tokens, 1, self.ming_config.llm_hidden_size))
+                new_history_tokens = new_history.new_zeros(
+                    (total_tokens, self.ming_config.history_patch_size, self.ming_config.latent_dim)
+                )
+                stop_prob_tokens = stop_probs.new_zeros((total_tokens,))
+                decode_step_tokens = torch.zeros((total_tokens,), dtype=torch.int32, device=hidden_states.device)
+                max_decode_step_tokens = torch.zeros((total_tokens,), dtype=torch.int32, device=hidden_states.device)
+                min_decode_step_tokens = torch.zeros((total_tokens,), dtype=torch.int32, device=hidden_states.device)
+                has_patch = torch.zeros((total_tokens,), dtype=torch.bool, device=hidden_states.device)
+                stop_reason_tokens = [MING_STOP_REASON_CONTINUE] * total_tokens
+
+            latent_patch_tokens[output_index : output_index + 1] = sampled_token_latent
+            next_embed_tokens[output_index : output_index + 1] = next_embeds
+            new_history_tokens[output_index : output_index + 1] = new_history
+            stop_prob_tokens[output_index : output_index + 1] = stop_probs
+            decode_step_tokens[output_index : output_index + 1] = decode_step
+            max_decode_step_tokens[output_index : output_index + 1] = req_max_decode_steps
+            min_decode_step_tokens[output_index : output_index + 1] = req_min_decode_steps
+            has_patch[output_index : output_index + 1] = True
+            sampled_decode_steps.append(decode_step)
+            sampled_stop_probs.append(stop_probs.reshape(-1)[0])
+            sampled_max_decode_steps.append(req_max_decode_steps)
+            sampled_min_decode_steps.append(req_min_decode_steps)
+            stop_reason, _, _, _, _ = _resolve_ming_stop_decision(
+                step=decode_step,
+                stop_prob=float(stop_probs.reshape(-1)[0].item()),
+                stop_threshold=float(self.ming_config.stop_head_threshold),
+                min_stop_step=int(self.ming_config.stop_head_min_steps),
+                min_decode_steps=req_min_decode_steps,
+                max_decode_steps=req_max_decode_steps,
+                audio_dummy_token_id=int(self.ming_config.audio_dummy_token_id),
+                text_eos_token_id=int(self.ming_config.text_eos_token_id),
+            )
+            if stop_reason_tokens is not None:
+                stop_reason_tokens[output_index] = stop_reason
+            if isinstance(req_id, str):
+                pending_updates[req_id] = {
+                    KEY_LATENT_HISTORY: new_history,
+                    KEY_NEXT_EMBEDS: next_embeds,
+                    "ming_latent_patch": sampled_token_latent,
+                    "ming_stop_prob": stop_probs,
+                    MING_STOP_REASON_KEY: stop_reason,
+                }
+            any_decode = True
+            cursor = end
+
+        self._pending_postprocess_updates = pending_updates
+        if not any_decode:
+            self._last_sample_decode_steps = None
+            self._last_sample_stop_probs = None
+            self._last_sample_max_decode_steps = None
+            self._last_sample_min_decode_steps = None
+            return OmniOutput(
+                text_hidden_states=hidden_states,
+                multimodal_outputs=None,
+                intermediate_tensors=intermediate_tensors,
+            )
+
+        if sampled_decode_steps:
+            self._last_sample_decode_steps = torch.tensor(
+                sampled_decode_steps, dtype=torch.int32, device=hidden_states.device
+            )
+            self._last_sample_stop_probs = torch.stack(sampled_stop_probs).to(device=hidden_states.device)
+            self._last_sample_max_decode_steps = torch.tensor(
+                sampled_max_decode_steps, dtype=torch.int32, device=hidden_states.device
+            )
+            self._last_sample_min_decode_steps = torch.tensor(
+                sampled_min_decode_steps, dtype=torch.int32, device=hidden_states.device
+            )
+        else:
+            self._last_sample_decode_steps = None
+            self._last_sample_stop_probs = None
+            self._last_sample_max_decode_steps = None
+            self._last_sample_min_decode_steps = None
+
+        return OmniOutput(
+            text_hidden_states=hidden_states,
+            multimodal_outputs={
+                "ming_latent_patch": latent_patch_tokens,
+                "ming_next_embeds": next_embed_tokens,
+                "ming_new_history": new_history_tokens,
+                "ming_stop_prob": stop_prob_tokens,
+                "ming_decode_step": decode_step_tokens,
+                "ming_max_decode_steps": max_decode_step_tokens,
+                "ming_min_decode_steps": min_decode_step_tokens,
+                "ming_has_patch": has_patch,
+                MING_STOP_REASON_KEY: tuple(stop_reason_tokens or []),
+            },
+            intermediate_tensors=intermediate_tensors,
+        )
+
+    def pop_postprocess_update(self, req_id: str) -> dict[str, Any]:
+        """Remove and return the postprocess update stored for ``req_id``.
+
+        Returns an empty dict when ``req_id`` is not a string or when no
+        update is stored under it.
+        """
+        if isinstance(req_id, str):
+            return self._pending_postprocess_updates.pop(req_id, {})
+        return {}
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor | OmniOutput,
+        sampling_metadata: SamplingMetadata,
+    ) -> torch.Tensor | None:
+        """Build the logits that encode the continue/stop decision for each row.
+
+        In text mode the call is forwarded to ``self.model.compute_logits``.
+        Otherwise every row gets a float32 logits vector of size
+        ``llm_vocab_size`` filled with ``-inf`` except for a single ``0.0`` at
+        the token id chosen by ``_resolve_ming_stop_decision``.
+
+        Args:
+            hidden_states: Either a plain tensor or an ``OmniOutput``. For an
+                ``OmniOutput`` the text hidden states are used and the
+                per-row step/stop tensors are read from ``multimodal_outputs``.
+            sampling_metadata: Only used as a fallback source for decode steps
+                (see ``_get_decode_steps``).
+
+        Returns:
+            The logits tensor, or ``None`` when there are no hidden states.
+
+        Raises:
+            RuntimeError: If the hidden states are not rank-2 or the stop head
+                produces non-finite probabilities.
+
+        Side effects:
+            Sets ``self._pending_sample_stop_inputs`` to the resolved per-row
+            inputs (or to ``None`` on the text-mode and empty paths).
+        """
+        decode_steps = None
+        stop_probs_tensor = None
+        text_mode = self._last_text_mode
+        if isinstance(hidden_states, OmniOutput):
+            # Values carried in the OmniOutput take precedence over cached state.
+            text_mode = bool((hidden_states.multimodal_outputs or {}).get(KEY_TEXT_MODE, text_mode))
+            decode_steps = (hidden_states.multimodal_outputs or {}).get("ming_decode_step")
+            stop_probs_tensor = (hidden_states.multimodal_outputs or {}).get("ming_stop_prob")
+            max_decode_steps_tensor = (hidden_states.multimodal_outputs or {}).get("ming_max_decode_steps")
+            min_decode_steps_tensor = (hidden_states.multimodal_outputs or {}).get("ming_min_decode_steps")
+            hidden_states = hidden_states.text_hidden_states
+        else:
+            max_decode_steps_tensor = None
+            min_decode_steps_tensor = None
+        if text_mode:
+            # Text mode never uses the stop-decision cache consumed by sample().
+            self._pending_sample_stop_inputs = None
+            if hidden_states is None or hidden_states.numel() == 0:
+                return None
+            return self.model.compute_logits(hidden_states)
+        # Fall back to the tensors recorded by the last forward pass for anything
+        # that was not supplied through the OmniOutput.
+        if max_decode_steps_tensor is None and isinstance(self._last_sample_max_decode_steps, torch.Tensor):
+            if self._last_sample_max_decode_steps.numel() > 0:
+                max_decode_steps_tensor = self._last_sample_max_decode_steps
+        if min_decode_steps_tensor is None and isinstance(self._last_sample_min_decode_steps, torch.Tensor):
+            if self._last_sample_min_decode_steps.numel() > 0:
+                min_decode_steps_tensor = self._last_sample_min_decode_steps
+        if decode_steps is None and isinstance(self._last_sample_decode_steps, torch.Tensor):
+            if self._last_sample_decode_steps.numel() > 0:
+                decode_steps = self._last_sample_decode_steps
+        if stop_probs_tensor is None and isinstance(self._last_sample_stop_probs, torch.Tensor):
+            if self._last_sample_stop_probs.numel() > 0:
+                stop_probs_tensor = self._last_sample_stop_probs
+
+        if hidden_states is None or hidden_states.numel() == 0:
+            self._pending_sample_stop_inputs = None
+            return None
+        if hidden_states.dim() != 2:
+            raise RuntimeError(
+                f"Expected hidden_states rank-2 [B,H] in compute_logits, got {tuple(hidden_states.shape)}"
+            )
+
+        batch_size = hidden_states.shape[0]
+        # NOTE(review): the helper tensors are indexed by row position below;
+        # presumably they already hold one populated entry per row of
+        # hidden_states at this point - confirm against the runner.
+        stop_prob_values = _resolve_stop_probs_batch(stop_probs_tensor, batch_size=batch_size)
+        if stop_prob_values is None:
+            # No precomputed probabilities: run the stop head on these hidden
+            # states and take the class-1 softmax output as the stop probability.
+            stop_hidden = hidden_states.to(dtype=self.fm_dtype)
+            stop_probs = self.stop_head(stop_hidden).softmax(dim=-1)[:, 1]
+            if not torch.isfinite(stop_probs).all():
+                raise RuntimeError("Non-finite stop_probs in Ming compute_logits.")
+            stop_prob_values = [float(stop_probs[i].item()) for i in range(batch_size)]
+        steps = self._get_decode_steps(decode_steps, sampling_metadata, batch_size)
+        # NOTE(review): _resolve_max_decode_steps_batch raises on any
+        # non-positive entry, so unfilled (zero) slots in the tensor would be
+        # rejected here - confirm that cannot happen.
+        max_decode_steps = _resolve_max_decode_steps_batch(
+            max_decode_steps_tensor,
+            batch_size=batch_size,
+            default_value=self.ming_config.max_decode_steps,
+        )
+        min_decode_steps = _resolve_min_decode_steps_batch(
+            min_decode_steps_tensor,
+            batch_size=batch_size,
+        )
+        min_stop_step = int(self.ming_config.stop_head_min_steps)
+
+        # Start from "nothing allowed"; exactly one token per row is enabled below.
+        logits = torch.full(
+            (batch_size, self.ming_config.llm_vocab_size),
+            float("-inf"),
+            device=hidden_states.device,
+            dtype=torch.float32,
+        )
+
+        for i in range(batch_size):
+            _, _, _, _, next_token_id = _resolve_ming_stop_decision(
+                step=steps[i],
+                stop_prob=stop_prob_values[i],
+                stop_threshold=float(self.ming_config.stop_head_threshold),
+                min_stop_step=min_stop_step,
+                min_decode_steps=min_decode_steps[i],
+                max_decode_steps=max_decode_steps[i],
+                audio_dummy_token_id=int(self.ming_config.audio_dummy_token_id),
+                text_eos_token_id=int(self.ming_config.text_eos_token_id),
+            )
+            logits[i, int(next_token_id)] = 0.0
+        # Cache the resolved inputs so sample() can reproduce the same decision.
+        self._pending_sample_stop_inputs = {
+            "steps": steps,
+            "stop_probs": stop_prob_values,
+            "max_decode_steps": max_decode_steps,
+            "min_decode_steps": min_decode_steps,
+        }
+        return logits
+
+    def sample(self, logits, sampling_metadata):
+        """Pick the next token id for every row of ``logits``.
+
+        Text mode defers to ``self.model.sample``. Otherwise the inputs cached
+        in ``self._pending_sample_stop_inputs`` are consumed and the stop
+        decision is re-evaluated per row; with no cached inputs the result is
+        a plain argmax over ``logits``. Returns ``None`` when ``logits`` is
+        ``None``.
+        """
+        if logits is None:
+            return None
+        if self._last_text_mode:
+            return self.model.sample(logits, sampling_metadata)
+
+        del sampling_metadata
+        # Take the cached inputs and clear the slot so they are used only once.
+        cached, self._pending_sample_stop_inputs = self._pending_sample_stop_inputs, None
+        if cached is None:
+            greedy = logits.argmax(dim=-1, keepdim=True)
+            return SamplerOutput(
+                sampled_token_ids=greedy.to(dtype=torch.int32),
+                logprobs_tensors=None,
+            )
+
+        row_steps = cached["steps"]
+        row_stop_probs = cached["stop_probs"]
+        row_max_steps = cached["max_decode_steps"]
+        row_min_steps = cached["min_decode_steps"]
+
+        def _token_for(row: int):
+            # Only the last element of the decision tuple (next token id) is needed.
+            decision = _resolve_ming_stop_decision(
+                step=int(row_steps[row]),
+                stop_prob=float(row_stop_probs[row]),
+                stop_threshold=float(self.ming_config.stop_head_threshold),
+                min_stop_step=int(self.ming_config.stop_head_min_steps),
+                min_decode_steps=int(row_min_steps[row]),
+                max_decode_steps=int(row_max_steps[row]),
+                audio_dummy_token_id=int(self.ming_config.audio_dummy_token_id),
+                text_eos_token_id=int(self.ming_config.text_eos_token_id),
+            )
+            return decision[4]
+
+        token_ids = [_token_for(row) for row in range(logits.shape[0])]
+        column = torch.tensor(token_ids, dtype=torch.int32, device=logits.device).reshape(-1, 1)
+        return SamplerOutput(
+            sampled_token_ids=column,
+            logprobs_tensors=None,
+        )
+
+    def _decode_one_step(
+        self,
+        *,
+        hidden_states: torch.Tensor,
+        latent_history: torch.Tensor,
+        cfg_scale: float,
+        sigma: float,
+        temperature: float,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Sample one latent patch and derive everything needed for the next step.
+
+        Args:
+            hidden_states: Rank-2 conditioning states, one row per batch item.
+            latent_history: Rank-3 latent history with the same batch size.
+            cfg_scale: Forwarded to ``flowloss.sample`` as ``cfg``.
+            sigma: Forwarded to ``flowloss.sample``.
+            temperature: Forwarded to ``flowloss.sample``.
+
+        Returns:
+            ``(sampled_token_latent, next_embeds, new_history, stop_probs)``:
+            the sampled patch of shape ``(B, patch_size, latent_dim)``, its
+            projection through ``linear_proj_audio``, the history with the
+            oldest patch dropped and the new one appended, and the class-1
+            softmax output of the stop head.
+
+        Raises:
+            RuntimeError: On rank or batch mismatches, an unexpected FlowLoss
+                output shape, or any non-finite intermediate value.
+        """
+        if hidden_states.dim() != 2:
+            raise RuntimeError(f"Expected decode hidden_states rank-2 [B,H], got {tuple(hidden_states.shape)}")
+        if latent_history.dim() != 3:
+            raise RuntimeError(f"Expected latent_history rank-3 [B,T,D], got {tuple(latent_history.shape)}")
+        if hidden_states.shape[0] != latent_history.shape[0]:
+            raise RuntimeError(
+                f"Batch mismatch: hidden_states B={hidden_states.shape[0]} vs "
+                f"latent_history B={latent_history.shape[0]}"
+            )
+
+        # [B,H] -> [B,1,H] for FlowLoss conditioning.
+        z_diff_cond = hidden_states.to(dtype=self.fm_dtype).unsqueeze(1)
+        if not torch.isfinite(z_diff_cond).all():
+            raise RuntimeError("Non-finite z_diff_cond before FlowLoss.sample().")
+        flow_out = self.flowloss.sample(
+            z=z_diff_cond,
+            latent_history=latent_history,
+            cfg=cfg_scale,
+            patch_size=self.ming_config.patch_size,
+            sigma=sigma,
+            temperature=temperature,
+        )
+        # sample() may return a tuple; only its first element is used.
+        sampled_token_latent = flow_out[0] if isinstance(flow_out, tuple) else flow_out
+
+        expected_shape = (
+            hidden_states.shape[0],
+            self.ming_config.patch_size,
+            self.ming_config.latent_dim,
+        )
+        if tuple(sampled_token_latent.shape) != expected_shape:
+            raise RuntimeError(
+                f"FlowLoss output shape mismatch: got {tuple(sampled_token_latent.shape)}, expected {expected_shape}"
+            )
+
+        # [B,32,64] -> shift left by one patch and append [B,4,64] => [B,32,64].
+        new_history = torch.cat(
+            [latent_history[:, self.ming_config.patch_size :, :], sampled_token_latent],
+            dim=1,
+        )
+        # Aggregator expects [B,T,D] = [B,4,64] and returns [B,1,H].
+        next_embeds = self.linear_proj_audio(sampled_token_latent)
+        # The stop head reads the same hidden states that conditioned the patch.
+        stop_hidden = hidden_states.to(dtype=self.fm_dtype)
+        stop_probs = self.stop_head(stop_hidden).softmax(dim=-1)[:, 1]
+        # Fail loudly rather than propagate NaN/Inf into later decode steps.
+        if not torch.isfinite(sampled_token_latent).all():
+            raise RuntimeError("Non-finite sampled_token_latent in Ming decode step.")
+        if not torch.isfinite(next_embeds).all():
+            raise RuntimeError("Non-finite next_embeds in Ming decode step.")
+        if not torch.isfinite(stop_probs).all():
+            raise RuntimeError("Non-finite stop_probs in Ming decode step.")
+        return sampled_token_latent, next_embeds, new_history, stop_probs
+
+    def _get_decode_steps(
+        self,
+        decode_steps: torch.Tensor | None,
+        sampling_metadata: SamplingMetadata,
+        batch_size: int,
+    ) -> list[int]:
+        """Return one decode-step counter per batch row.
+
+        Args:
+            decode_steps: Optional tensor of step counters. When non-empty it
+                is flattened and read by row position; rows beyond its length
+                reuse the last value.
+            sampling_metadata: Fallback source. The length of each entry of
+                its ``output_token_ids`` list is used as the step counter.
+            batch_size: Number of entries to return.
+
+        Returns:
+            A list of ``batch_size`` ints, padded with zeros when the fallback
+            source has fewer entries.
+
+        Raises:
+            RuntimeError: If an ``output_token_ids`` entry is not a list,
+                tuple or tensor.
+        """
+        if isinstance(decode_steps, torch.Tensor) and decode_steps.numel() > 0:
+            # One host transfer for the whole tensor instead of a blocking
+            # .item() call per row.
+            values = decode_steps.reshape(-1).tolist()
+            last = len(values) - 1
+            return [int(values[min(i, last)]) for i in range(batch_size)]
+
+        steps: list[int] = []
+        output_token_ids = getattr(sampling_metadata, "output_token_ids", None)
+        if isinstance(output_token_ids, list):
+            for token_ids in output_token_ids[:batch_size]:
+                if isinstance(token_ids, torch.Tensor):
+                    steps.append(int(token_ids.numel()))
+                elif isinstance(token_ids, (list, tuple)):
+                    steps.append(len(token_ids))
+                else:
+                    raise RuntimeError(
+                        f"Expected output_token_ids entries to be list/tuple/Tensor, got {type(token_ids)!r}"
+                    )
+
+        # Rows without any recorded output start at step 0.
+        steps.extend([0] * (batch_size - len(steps)))
+        return steps[:batch_size]
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        """Load checkpoint tensors into this module's parameters.
+
+        Separate q/k/v and gate/up checkpoint tensors are routed into the fused
+        ``qkv_proj`` / ``gate_up_proj`` parameters; everything else is loaded
+        by name. After loading, every parameter under ``flowloss.``,
+        ``linear_proj_audio.``, ``stop_head.`` and ``spk_head.`` must have been
+        loaded.
+
+        Args:
+            weights: Iterable of ``(checkpoint_name, tensor)`` pairs.
+
+        Returns:
+            The set of parameter names that received a weight.
+
+        Raises:
+            RuntimeError: If any parameter of the four required sub-modules
+                was not loaded.
+        """
+        # (fused parameter name, checkpoint weight name, shard id)
+        stacked_params_mapping = [
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        loaded_params: set[str] = set()
+        # Checkpoint keys with no matching parameter; reported in one warning at the end.
+        skipped: list[str] = []
+
+        for ckpt_name, loaded_weight in weights:
+            name = ckpt_name
+
+            # Cache-scale tensors are redirected to the name given by the quant config.
+            if self.quant_config is not None and (scale_name := self.quant_config.get_cache_scale(name)):
+                if scale_name not in params_dict:
+                    skipped.append(ckpt_name)
+                    continue
+                param = params_dict[scale_name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                # Non-scalar scale tensors contribute only their first element.
+                loaded_weight = loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                weight_loader(param, loaded_weight)
+                loaded_params.add(scale_name)
+                continue
+
+            # mapped_name stays None unless a fused parameter was actually loaded.
+            mapped_name = None
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                mapped_name = name.replace(weight_name, param_name)
+                if mapped_name.endswith(".bias") and mapped_name not in params_dict:
+                    mapped_name = None
+                    break
+                if is_pp_missing_parameter(mapped_name, self):
+                    mapped_name = None
+                    break
+                if mapped_name not in params_dict:
+                    # Not a fused parameter here; try the remaining mappings.
+                    mapped_name = None
+                    continue
+                param = params_dict[mapped_name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                if weight_loader == default_weight_loader:
+                    # The default loader does not take a shard id.
+                    weight_loader(param, loaded_weight)
+                else:
+                    weight_loader(param, loaded_weight, shard_id)
+                loaded_params.add(mapped_name)
+                break
+
+            # Handled by the fused path above; nothing more to do for this key.
+            if mapped_name in loaded_params:
+                continue
+
+            if name.endswith(".bias") and name not in params_dict:
+                continue
+
+            name = maybe_remap_kv_scale_name(name, params_dict)
+            if name is None:
+                continue
+            if is_pp_missing_parameter(name, self):
+                continue
+            if name not in params_dict:
+                skipped.append(ckpt_name)
+                continue
+
+            param = params_dict[name]
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+
+        # fatal=True: a partially loaded sub-module raises instead of warning.
+        _warn_missing_prefix("flowloss", params_dict, loaded_params, prefix="flowloss.", fatal=True)
+        _warn_missing_prefix("linear_proj_audio", params_dict, loaded_params, prefix="linear_proj_audio.", fatal=True)
+        _warn_missing_prefix("stop_head", params_dict, loaded_params, prefix="stop_head.", fatal=True)
+        _warn_missing_prefix("spk_head", params_dict, loaded_params, prefix="spk_head.", fatal=True)
+
+        if skipped:
+            warnings.warn(
+                f"MingLLMModel: skipped {len(skipped)} checkpoint keys during load. First few: {skipped[:8]}",
+                stacklevel=2,
+            )
+
+        return loaded_params
+
+
+def _extract_hidden_states(backbone_out: object) -> torch.Tensor:
+    """Pull the hidden-state tensor out of a backbone forward result.
+
+    Accepts a bare tensor, an object exposing ``last_hidden_state``, or a
+    non-empty tuple/list whose first element is a tensor.
+
+    Raises:
+        TypeError: For any other kind of output.
+    """
+    if isinstance(backbone_out, torch.Tensor):
+        return backbone_out
+    if hasattr(backbone_out, "last_hidden_state"):
+        return backbone_out.last_hidden_state
+    is_sequence = isinstance(backbone_out, (tuple, list))
+    if is_sequence and len(backbone_out) > 0 and isinstance(backbone_out[0], torch.Tensor):
+        return backbone_out[0]
+    raise TypeError(f"Unsupported backbone forward output type: {type(backbone_out)}")
+
+
+def _resolve_ming_runtime_dtype(vllm_config: VllmConfig) -> torch.dtype:
+    """Resolve the torch dtype configured on ``vllm_config.model_config``.
+
+    A ``torch.dtype`` is returned unchanged. A string is matched
+    case-insensitively (surrounding whitespace ignored) against the usual
+    spellings of float16, bfloat16 and float32. Anything else, including a
+    missing value or an unrecognised string, resolves to ``torch.float32``.
+    """
+    dtype = getattr(vllm_config.model_config, "dtype", None)
+    if isinstance(dtype, torch.dtype):
+        return dtype
+    if isinstance(dtype, str):
+        # Every precision accepts the same families of spellings; "fp16" used
+        # to be missing and silently resolved to float32.
+        aliases = {
+            "float16": torch.float16,
+            "half": torch.float16,
+            "fp16": torch.float16,
+            "torch.float16": torch.float16,
+            "torch.half": torch.float16,
+            "bfloat16": torch.bfloat16,
+            "bf16": torch.bfloat16,
+            "torch.bfloat16": torch.bfloat16,
+            "float32": torch.float32,
+            "float": torch.float32,
+            "fp32": torch.float32,
+            "torch.float32": torch.float32,
+            "torch.float": torch.float32,
+        }
+        return aliases.get(dtype.strip().lower(), torch.float32)
+    return torch.float32
+
+
+def _warn_missing_prefix(
+    module_name: str,
+    params_dict: dict[str, nn.Parameter],
+    loaded_params: set[str],
+    prefix: str,
+    fatal: bool = False,
+) -> None:
+    """Report parameters under ``prefix`` that were never loaded.
+
+    Does nothing when every parameter whose name starts with ``prefix`` is in
+    ``loaded_params``. Otherwise raises ``RuntimeError`` when ``fatal`` is
+    true and emits a warning when it is not.
+    """
+    missing = sorted(
+        name for name in params_dict if name.startswith(prefix) and name not in loaded_params
+    )
+    if not missing:
+        return
+    msg = (
+        f"MingLLMModel: {len(missing)} {module_name} params not loaded "
+        f"(prefix={prefix}). First few: {missing[:5]}"
+    )
+    if not fatal:
+        warnings.warn(msg, stacklevel=3)
+        return
+    raise RuntimeError(msg)
+
+
+def _normalize_request_infos(model_intermediate_buffer: object) -> list[dict[str, Any]]:
+    """Return the buffer as a list of dicts, one per entry.
+
+    A non-list buffer yields an empty list; non-dict entries are replaced by
+    a fresh empty dict so positions stay aligned with the input.
+    """
+    if isinstance(model_intermediate_buffer, list):
+        return [entry if isinstance(entry, dict) else {} for entry in model_intermediate_buffer]
+    return []
+
+
+def _get_request_token_counts(
+    hidden_states: torch.Tensor,
+    request_infos: list[dict[str, Any]],
+    seq_token_counts: list[int] | None,
+) -> list[int]:
+    """Work out how many rows of ``hidden_states`` belong to each request.
+
+    Sources are tried in order: the explicit ``seq_token_counts``, the
+    ``ubatch_slices`` of the active forward context, and finally a guess from
+    ``request_infos`` (one row per request when the counts match, otherwise
+    all rows in a single span). Returns an empty list when none applies.
+    """
+    if seq_token_counts:
+        return list(map(int, seq_token_counts))
+
+    if is_forward_context_available():
+        ubatch_slices = getattr(get_forward_context(), "ubatch_slices", None)
+        if ubatch_slices is not None and len(ubatch_slices) > 0:
+            spans: list[int] = []
+            for entry in ubatch_slices:
+                if isinstance(entry, int):
+                    spans.append(int(entry))
+                elif hasattr(entry, "stop") and hasattr(entry, "start"):
+                    # Slice-like entry: its length is the token count.
+                    spans.append(int(entry.stop) - int(entry.start))
+            if spans:
+                return spans
+
+    if not request_infos:
+        return []
+    num_rows = hidden_states.shape[0]
+    return [1] * num_rows if len(request_infos) == num_rows else [num_rows]
+
+
+def _coerce_latent_history(
+    value: object,
+    *,
+    device: torch.device,
+    dtype: torch.dtype,
+    cfg: MingTTSConfig,
+) -> torch.Tensor | None:
+    """Convert a stored latent history into a validated rank-3 tensor.
+
+    ``None`` passes through. Other values are turned into a detached tensor,
+    a rank-2 input gains a leading batch axis, and the trailing two dimensions
+    must equal ``(cfg.history_patch_size, cfg.latent_dim)``. The result is
+    moved to ``device`` and cast to ``dtype``.
+
+    Raises:
+        RuntimeError: If the rank or the trailing dimensions are wrong.
+    """
+    if value is None:
+        return None
+    tensor = value if isinstance(value, torch.Tensor) else torch.as_tensor(value)
+
+    history = tensor.detach()
+    if history.ndim == 2:
+        history = history.unsqueeze(0)
+    if history.ndim != 3:
+        raise RuntimeError(f"Expected latent_history rank-3 [B,T,D], got {tuple(history.shape)}")
+    expected_tail = (cfg.history_patch_size, cfg.latent_dim)
+    if tuple(history.shape[1:]) != expected_tail:
+        raise RuntimeError(
+            f"latent_history shape mismatch: got {tuple(history.shape)}, "
+            f"expected [B,{cfg.history_patch_size},{cfg.latent_dim}]"
+        )
+    return history.to(device=device, dtype=dtype)
+
+
+def _resolve_runtime_float(req_info: dict[str, Any], key: str, default_value: float) -> float:
+    """Read ``key`` from ``req_info`` as a finite, non-negative float.
+
+    Args:
+        req_info: Per-request runtime values.
+        key: Entry to read.
+        default_value: Used when ``key`` is absent; validated the same way.
+
+    Returns:
+        The value converted with ``float()``.
+
+    Raises:
+        RuntimeError: If the value cannot be converted, is NaN or infinite,
+            or is negative.
+    """
+    import math
+
+    raw = req_info.get(key, default_value)
+    try:
+        value = float(raw)
+    except (TypeError, ValueError, OverflowError) as exc:
+        # OverflowError: an int too large for a float must not escape unwrapped.
+        raise RuntimeError(f"Invalid {key}: expected float-like value, got {raw!r}") from exc
+    # +inf would pass a plain ">= 0" check, so finiteness is tested explicitly.
+    if not math.isfinite(value):
+        raise RuntimeError(f"Invalid {key}: expected finite value, got {value}")
+    if value < 0.0:
+        raise RuntimeError(f"Invalid {key}: expected non-negative value, got {value}")
+    return value
+
+
+def _resolve_runtime_int(req_info: dict[str, Any], key: str, default_value: int) -> int:
+    """Read ``key`` from ``req_info`` as a strictly positive int.
+
+    Args:
+        req_info: Per-request runtime values.
+        key: Entry to read.
+        default_value: Used when ``key`` is absent; validated the same way.
+
+    Returns:
+        The value converted with ``int()`` (floats are truncated).
+
+    Raises:
+        RuntimeError: If the value cannot be converted or is not positive.
+    """
+    raw = req_info.get(key, default_value)
+    try:
+        value = int(raw)
+    except (TypeError, ValueError, OverflowError) as exc:
+        # int(float("inf")) raises OverflowError; report it like any other bad value.
+        raise RuntimeError(f"Invalid {key}: expected int-like value, got {raw!r}") from exc
+    if value <= 0:
+        raise RuntimeError(f"Invalid {key}: expected positive value, got {value}")
+    return value
+
+
+def _resolve_optional_runtime_int(req_info: dict[str, Any], key: str, default_value: int) -> int:
+    """Read ``key`` from ``req_info`` as a non-negative int (zero allowed).
+
+    Args:
+        req_info: Per-request runtime values.
+        key: Entry to read.
+        default_value: Used when ``key`` is absent; validated the same way.
+
+    Returns:
+        The value converted with ``int()`` (floats are truncated).
+
+    Raises:
+        RuntimeError: If the value cannot be converted or is negative.
+    """
+    raw = req_info.get(key, default_value)
+    try:
+        value = int(raw)
+    except (TypeError, ValueError, OverflowError) as exc:
+        # int(float("inf")) raises OverflowError; report it like any other bad value.
+        raise RuntimeError(f"Invalid {key}: expected int-like value, got {raw!r}") from exc
+    if value < 0:
+        raise RuntimeError(f"Invalid {key}: expected non-negative value, got {value}")
+    return value
+
+
+def _resolve_max_decode_steps_batch(
+    value: torch.Tensor | None,
+    *,
+    batch_size: int,
+    default_value: int,
+) -> list[int]:
+    """Expand a max-decode-steps tensor into one positive int per batch row.
+
+    A missing or empty tensor yields ``default_value`` for every row. A short
+    tensor is padded by repeating its last entry and a long one is truncated.
+
+    Raises:
+        RuntimeError: If any entry of the tensor is not positive.
+    """
+    flat = [] if value is None else value.reshape(-1).tolist()
+    if not flat:
+        return [int(default_value)] * batch_size
+    resolved = list(map(int, flat))
+    # Every entry is validated, including those past batch_size.
+    offending = next((item for item in resolved if item <= 0), None)
+    if offending is not None:
+        raise RuntimeError(f"Invalid ming_max_decode_steps in runtime batch: got {offending}")
+    shortfall = batch_size - len(resolved)
+    if shortfall > 0:
+        resolved += [resolved[-1]] * shortfall
+    return resolved[:batch_size]
+
+
+def _resolve_min_decode_steps_batch(
+    value: torch.Tensor | None,
+    *,
+    batch_size: int,
+) -> list[int]:
+    """Expand a min-decode-steps tensor into one non-negative int per row.
+
+    A missing or empty tensor yields zero for every row. Negative entries are
+    clamped to zero, a short tensor is padded by repeating its last entry,
+    and a long one is truncated.
+    """
+    flat = [] if value is None else value.reshape(-1).tolist()
+    if not flat:
+        return [0] * batch_size
+    resolved = [max(0, int(item)) for item in flat]
+    shortfall = batch_size - len(resolved)
+    if shortfall > 0:
+        resolved += [resolved[-1]] * shortfall
+    return resolved[:batch_size]
+
+
+def _resolve_ming_stop_decision(
+    *,
+    step: int,
+    stop_prob: float,
+    stop_threshold: float,
+    min_stop_step: int,
+    min_decode_steps: int,
+    max_decode_steps: int,
+    audio_dummy_token_id: int,
+    text_eos_token_id: int,
+) -> tuple[str, bool, bool, int, int]:
+    """Decide whether decoding continues after the given step.
+
+    Returns ``(reason, should_stop, forced, min_required_decode_steps,
+    next_token_id)``. Reaching ``max_decode_steps`` forces a stop and takes
+    priority over the stop head. The stop head can only end decoding once
+    ``step + 1`` has reached ``max(min_stop_step + 1, min_decode_steps)`` and
+    ``stop_prob`` exceeds ``stop_threshold``. A stop emits
+    ``text_eos_token_id``; otherwise ``audio_dummy_token_id`` is emitted.
+
+    Raises:
+        RuntimeError: If ``max_decode_steps`` is below the minimum required
+            number of decode steps.
+    """
+    min_required_decode_steps = max(min_stop_step + 1, min_decode_steps)
+    if max_decode_steps < min_required_decode_steps:
+        raise RuntimeError(
+            "Invalid Ming decode window: "
+            f"max_decode_steps={max_decode_steps} is smaller than "
+            f"min_required_decode_steps={min_required_decode_steps}"
+        )
+
+    steps_done = step + 1
+    hit_cap = steps_done >= max_decode_steps
+    head_fired = (steps_done >= min_required_decode_steps) and stop_prob > stop_threshold
+
+    if hit_cap:
+        reason, should_stop, forced, token = MING_STOP_REASON_MAX_DECODE_STEPS, True, True, text_eos_token_id
+    elif head_fired:
+        reason, should_stop, forced, token = MING_STOP_REASON_STOP_HEAD, True, False, text_eos_token_id
+    else:
+        reason, should_stop, forced, token = MING_STOP_REASON_CONTINUE, False, False, audio_dummy_token_id
+    return (reason, should_stop, forced, min_required_decode_steps, token)
+
+
+def _resolve_stop_probs_batch(
+    value: torch.Tensor | None,
+    *,
+    batch_size: int,
+) -> list[float] | None:
+    """Expand a stop-probability tensor into one float per batch row.
+
+    Args:
+        value: Optional tensor of stop probabilities; flattened before use.
+        batch_size: Number of entries to return.
+
+    Returns:
+        ``None`` when ``value`` is missing or empty, otherwise a list of
+        ``batch_size`` floats read by row position, with rows beyond the
+        tensor's length reusing its last value.
+    """
+    if value is None:
+        return None
+    # One host transfer for the whole tensor instead of a blocking .item() per row.
+    probs = value.reshape(-1).tolist()
+    if not probs:
+        return None
+    last = len(probs) - 1
+    return [float(probs[min(i, last)]) for i in range(batch_size)]
diff --git a/vllm_omni/model_executor/models/ming_tts/prompt_builder.py b/vllm_omni/model_executor/models/ming_tts/prompt_builder.py
new file mode 100644
index 00000000000..ae00cf5ae28
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_tts/prompt_builder.py
@@ -0,0 +1,429 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
+import copy
+import json
+import math
+import re
+from typing import Any
+
+import torch
+
+from .config_ming_tts import (
+    AUDIO_FRAME_HOP,
+    KEY_CFG,
+    KEY_MAX_DECODE_STEPS,
+    KEY_MIN_DECODE_STEPS,
+    KEY_PROMPT_LATENTS,
+    KEY_REQUEST_ID,
+    KEY_SIGMA,
+    KEY_SPEAKER_EMBEDDING,
+    KEY_TEMPERATURE,
+    LATENT_DIM,
+    PATCH_SIZE,
+    SAMPLE_RATE,
+    VAE_PATCH_SIZE,
+)
+
+BASE_CAPTION_TEMPLATE = {
+    "audio_sequence": [
+        {
+            "序号": 1,
+            "说话人": "speaker_1",
+            "方言": None,
+            "风格": None,
+            "语速": None,
+            "基频": None,
+            "音量": None,
+            "情感": None,
+            "BGM": {
+                "Genre": None,
+                "Mood": None,
+                "Instrument": None,
+                "Theme": None,
+                "ENV": None,
+                "SNR": None,
+            },
+            "IP": None,
+        }
+    ]
+}
+
+_DURATION_SECONDS_RE = re.compile(r"Duration:\s*([0-9]+(?:\.[0-9]+)?)\s*s\b", re.IGNORECASE)
+
+
+def create_instruction(value: Any) -> str | None:
+    """Serialize a dict instruction into the caption JSON; ``None`` and ``str`` pass through."""
+    if value is None or isinstance(value, str):
+        return value
+    if not isinstance(value, dict):
+        raise ValueError(f"Ming instruction must be str or dict, got {type(value).__name__}")
+
+    caption = copy.deepcopy(BASE_CAPTION_TEMPLATE)
+    target = caption["audio_sequence"][0]
+    # Only keys that exist in the template are accepted; unknown keys are ignored.
+    target.update({key: item for key, item in value.items() if key in target})
+
+    # "BGM" may have been replaced by a non-dict (e.g. None): that just means "no SNR".
+    bgm = target["BGM"]
+    if isinstance(bgm, dict) and bgm.get("SNR") is not None:
+        order = ["序号", "说话人", "BGM", "情感", "方言", "风格", "语速", "基频", "音量", "IP"]
+        caption["audio_sequence"][0] = {key: target[key] for key in order if key in target}
+    return json.dumps(caption, ensure_ascii=False)
+
+
+def parse_duration_seconds(text: str | None) -> float | None:
+    """Extract the ``Duration: <n>s`` hint from ``text`` as seconds.
+
+    Returns ``None`` for non-string input, a missing hint, or a non-positive value.
+    """
+    found = _DURATION_SECONDS_RE.search(text) if isinstance(text, str) else None
+    if found is None:
+        return None
+    try:
+        seconds = float(found.group(1))
+    except ValueError:
+        return None
+    return None if seconds <= 0.0 else seconds
+
+
+def estimate_decode_steps_for_duration(
+    duration_seconds: float,
+    *,
+    sample_rate: int = SAMPLE_RATE,
+    frame_hop: int = AUDIO_FRAME_HOP,
+    patch_size: int = PATCH_SIZE,
+    vae_patch_size: int = VAE_PATCH_SIZE,
+) -> int:
+    """Return how many decode steps cover ``duration_seconds`` of audio (0 if non-positive)."""
+    if duration_seconds <= 0.0:
+        return 0
+    step_samples = float(int(frame_hop) * int(patch_size) * int(vae_patch_size))
+    return max(1, int(math.ceil(float(duration_seconds) * float(sample_rate) / step_samples)))
+
+
+def estimate_decode_step_window_for_duration(duration_seconds: float) -> tuple[int, int]:
+    """Return a ``(min_steps, max_steps)`` window of +/-3 steps around the estimated step count."""
+    # Ming emits about 0.32s per decode step in the current dense path. Keep a narrow
+    # duration window so BGM does not undershoot badly or run all the way to the generic cap.
+    center = estimate_decode_steps_for_duration(duration_seconds)
+    lower = max(1, center - 3)
+    return lower, max(lower, center + 3)
+
+
+def resolve_effective_runtime_controls(
+    *,
+    text: str,
+    runtime_controls: dict[str, Any] | None = None,
+) -> dict[str, Any]:
+    """Copy ``runtime_controls`` and fill the decode-step window from a duration hint.
+
+    The window is derived from ``text`` only when neither bound was given explicitly
+    (a ``None`` value counts as not given) and ``text`` carries a parsable duration.
+    """
+    controls = dict(runtime_controls) if runtime_controls is not None else {}
+    if any(controls.get(key) is not None for key in (KEY_MIN_DECODE_STEPS, KEY_MAX_DECODE_STEPS)):
+        return controls
+
+    duration_seconds = parse_duration_seconds(text)
+    if duration_seconds is not None:
+        window = estimate_decode_step_window_for_duration(duration_seconds)
+        controls[KEY_MIN_DECODE_STEPS], controls[KEY_MAX_DECODE_STEPS] = window
+    return controls
+
+
+def pad_prompt_waveform(
+    waveform: Any,
+    *,
+    patch_size: int = PATCH_SIZE,
+    sample_rate: int = SAMPLE_RATE,
+    frame_hop: int = AUDIO_FRAME_HOP,
+) -> torch.Tensor:
+    """Zero-pad the coerced ``(1, samples)`` waveform to the next multiple of the patch alignment."""
+    samples = coerce_prompt_waveform(waveform)
+    del frame_hop  # unused: the alignment below depends only on sample_rate and patch_size
+    # Match upstream Ming exactly: tokenizer framerate is 12.5 Hz, so prompt
+    # waveform padding aligns to sample_rate / 12.5 * patch_size samples.
+    pad_align = int((float(sample_rate) / 12.5) * int(patch_size))
+    length = int(samples.shape[-1])
+    aligned_length = ((length + pad_align - 1) // pad_align) * pad_align
+    if aligned_length == length:
+        return samples
+    return torch.nn.functional.pad(samples, (0, aligned_length - length))
+
+
+def coerce_prompt_waveform(value: Any) -> torch.Tensor:
+    """Coerce a tensor, array-like or sequence into a float32 ``(1, samples)`` tensor.
+
+    2-D tensors with several rows are flattened row after row (not downmixed). A sequence
+    of plain numbers is one waveform; other sequences are segments joined along the
+    sample axis, with ``None`` entries skipped.
+    """
+    if value is None:
+        raise ValueError("prompt waveform cannot be None")
+    if isinstance(value, torch.Tensor):
+        if value.ndim not in (1, 2):
+            raise ValueError(f"Unsupported Ming prompt waveform rank: {tuple(value.shape)}")
+        return value.detach().reshape(1, -1).to(torch.float32)
+    if not isinstance(value, (list, tuple)):
+        return coerce_prompt_waveform(torch.as_tensor(value))
+
+    if value and all(isinstance(item, (int, float)) for item in value):
+        # Recursing per item would hand 0-dim tensors to the rank check and reject the
+        # whole input, so a flat list of samples is converted in one go.
+        return coerce_prompt_waveform(torch.as_tensor(value))
+    parts = [coerce_prompt_waveform(item) for item in value if item is not None]
+    if not parts:
+        raise ValueError("prompt waveform list was empty")
+    return torch.cat(parts, dim=-1)
+
+
+def coerce_speaker_embeddings(value: Any, *, use_zero_spk_emb: bool = False) -> list[torch.Tensor] | None:
+    """Normalize speaker embeddings into a list of flat float32 CPU tensors of 192 values.
+
+    Accepts one vector, a 2-D batch, or a sequence of vectors (``None`` entries are
+    skipped). With nothing usable, returns one all-zero embedding when
+    ``use_zero_spk_emb`` is set and ``None`` otherwise.
+    """
+    expected_dim = 192
+
+    def _flatten(raw: Any) -> torch.Tensor:
+        return torch.as_tensor(raw).detach().reshape(-1).to(torch.float32).cpu()
+
+    if value is not None and not isinstance(value, (torch.Tensor, list, tuple)):
+        value = torch.as_tensor(value)  # array-likes follow the tensor path below
+
+    if value is None:
+        embeddings = []
+    elif isinstance(value, torch.Tensor):
+        batch = value.detach()
+        batch = batch.unsqueeze(0) if batch.ndim == 1 else batch
+        if batch.ndim != 2:
+            raise ValueError(f"Unsupported Ming speaker embedding shape: {tuple(batch.shape)}")
+        embeddings = [_flatten(row) for row in batch]
+    elif value and all(not isinstance(item, (list, tuple, torch.Tensor)) for item in value):
+        embeddings = [_flatten(value)]  # a flat sequence of scalars is a single embedding
+    else:
+        embeddings = [_flatten(item) for item in value if item is not None]
+
+    if not embeddings:
+        return [torch.zeros((expected_dim,), dtype=torch.float32)] if use_zero_spk_emb else None
+    for embedding in embeddings:
+        if int(embedding.numel()) != expected_dim:
+            raise ValueError(f"Ming speaker embedding must have 192 dims, got {int(embedding.numel())}")
+    return embeddings
+
+
+def count_prompt_latent_patches(
+    value: Any,
+    *,
+    patch_size: int = PATCH_SIZE,
+    latent_dim: int = LATENT_DIM,
+) -> int:
+    """Count prompt latent patches in ``value``; ``None`` counts as 0.
+
+    Layouts: ``(patches, patch_size, latent_dim)`` or ``(frames, latent_dim)``, plus an optional size-1 lead axis.
+    """
+    if value is None:
+        return 0
+    latents = torch.as_tensor(value).detach()
+    if latents.ndim == 3 and latents.shape[0] == 1:
+        latents = latents.squeeze(0)
+    if latents.ndim == 3 and latents.shape[-2:] == (patch_size, latent_dim):
+        return int(latents.shape[0])
+    if latents.ndim != 2 or latents.shape[-1] != latent_dim:
+        raise ValueError(f"Unsupported Ming prompt_latents shape: {tuple(latents.shape)}")
+    patches, leftover = divmod(int(latents.shape[0]), patch_size)
+    if leftover != 0:
+        raise ValueError(
+            f"Ming prompt_latents frame count must be divisible by patch_size={patch_size}, "
+            f"got frames={int(latents.shape[0])}"
+        )
+    return int(patches)
+
+
+def count_prompt_waveform_patches(
+    value: Any,
+    *,
+    patch_size: int = PATCH_SIZE,
+    frame_hop: int = AUDIO_FRAME_HOP,
+    vae_patch_size: int = VAE_PATCH_SIZE,
+) -> int:
+    """Count the latent patches the padded prompt waveform maps to (0 for ``None``)."""
+    if value is None:
+        return 0
+    samples = float(pad_prompt_waveform(value, patch_size=patch_size, frame_hop=frame_hop).shape[-1])
+    latent_frames = math.ceil(math.ceil(samples / float(frame_hop)) / float(vae_patch_size))
+    if latent_frames % int(patch_size) != 0:
+        raise ValueError(
+            f"Ming prompt waveform produced latent frame count not divisible by patch_size={patch_size}: "
+            f"frames={latent_frames}"
+        )
+    return int(latent_frames // int(patch_size))
+
+
+def build_dense_prompt_token_ids(
+    tokenizer: Any,
+    *,
+    prompt: str,
+    text: str,
+    instruction: str | None = None,
+    prompt_text: str | None = None,
+    speaker_count: int = 0,
+    prompt_patch_count: int = 0,
+) -> list[int]:
+    """Assemble the token ids of a dense Ming chat prompt.
+
+    Order: system turn; user turn with ``prompt``, one placeholder block per speaker, the
+    `` Text input:`` prefix, ``prompt_text`` and ``text``; assistant turn with the optional
+    ``instruction`` + ``<|endoftext|>``, ``<audio>`` and one ``<audioPatch>`` id per patch.
+    ``prompt_text`` is only encoded when ``prompt_patch_count`` is positive.
+    """
+    encode = tokenizer.encode
+    patch_count = int(prompt_patch_count)
+    speaker_tokens = []
+    for speaker_number in range(1, int(speaker_count) + 1):
+        speaker_tokens += (
+            encode(f"  speaker_{speaker_number}:")
+            + encode("<|vision_start|>")
+            + encode("<|vision_pad|>")
+            + encode("<|vision_end|>\n")
+        )
+    instruction_tokens = [] if instruction is None else encode(instruction) + encode("<|endoftext|>")
+    prompt_text_tokens = encode(prompt_text) if patch_count > 0 and prompt_text is not None else []
+    patch_tokens = [tokenizer.convert_tokens_to_ids("<audioPatch>")] * patch_count if patch_count > 0 else []
+    # Text that contains all five field markers is sent without the " Text input:" prefix.
+    text_prefix = encode(" Text input:\n")
+    if all(marker in text for marker in ("Genre: ", "Mood: ", "Instrument: ", "Theme: ", "Duration: ")):
+        text_prefix = []
+    return (
+        encode("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n")
+        + encode("<|im_start|>user\n")
+        + encode(prompt)
+        + speaker_tokens
+        + text_prefix
+        + prompt_text_tokens
+        + encode(text)
+        + encode("<|im_end|>\n")
+        + encode("<|im_start|>assistant\n")
+        + instruction_tokens
+        + encode("<audio>")
+        + patch_tokens
+    )
+
+
+def build_ming_dense_prompt(
+    tokenizer: Any,
+    *,
+    prompt: str,
+    text: str,
+    runtime_controls: dict[str, Any] | None = None,
+    instruction: Any = None,
+    prompt_text: str | None = None,
+    prompt_waveform: Any = None,
+    prompt_latents: Any = None,
+    speaker_embedding: Any = None,
+    use_zero_spk_emb: bool = False,
+    request_id: str | None = None,
+) -> dict[str, Any]:
+    """Build a dense Ming request: prompt token ids plus ``additional_information``.
+
+    ``additional_information`` holds the runtime controls as tensors (int32 for the decode-step
+    bounds, float32 otherwise; ``None`` values are skipped) and whichever of request id,
+    instruction, prompt text/waveform/latents and speaker embedding(s) were supplied.
+
+    Raises ``ValueError`` when ``prompt_waveform`` comes with ``prompt_latents`` or without ``prompt_text``.
+    """
+    instruction_text = create_instruction(instruction)
+    speaker_embeddings = coerce_speaker_embeddings(speaker_embedding, use_zero_spk_emb=use_zero_spk_emb)
+    effective_runtime_controls = resolve_effective_runtime_controls(text=text, runtime_controls=runtime_controls)
+
+    prompt_waveform_tensor = None
+    prompt_patch_count = 0
+    if prompt_waveform is not None:
+        prompt_waveform_tensor = pad_prompt_waveform(prompt_waveform)
+        prompt_patch_count = count_prompt_waveform_patches(prompt_waveform_tensor)
+        if prompt_latents is not None:
+            raise ValueError(
+                "Ming waveform cloning request provided both raw prompt_waveform and explicit prompt_latents. "
+                "Choose exactly one source of truth."
+            )
+        if prompt_text is None:
+            raise ValueError(
+                "Ming prompt_waveform requires prompt_text for prompt-latent conditioning. "
+                "Use speaker_embedding for reference-audio-only speaker conditioning."
+            )
+    prompt_latent_value = None
+    if prompt_latents is not None:
+        prompt_latent_value = torch.as_tensor(prompt_latents)
+        prompt_patch_count = count_prompt_latent_patches(
+            prompt_latent_value, patch_size=PATCH_SIZE, latent_dim=LATENT_DIM
+        )
+
+    prompt_token_ids = build_dense_prompt_token_ids(
+        tokenizer,
+        prompt=prompt,
+        text=text,
+        instruction=instruction_text,
+        prompt_text=prompt_text if prompt_patch_count > 0 else None,
+        speaker_count=0 if speaker_embeddings is None else len(speaker_embeddings),
+        prompt_patch_count=prompt_patch_count,
+    )
+
+    additional_information = {}
+    for key, value in effective_runtime_controls.items():
+        if value is None:
+            # An explicit None means "not set"; int(None)/float(None) would raise TypeError.
+            continue
+        if isinstance(value, torch.Tensor):
+            additional_information[key] = value
+        elif key in (KEY_MIN_DECODE_STEPS, KEY_MAX_DECODE_STEPS):
+            additional_information[key] = torch.tensor(int(value), dtype=torch.int32)
+        else:
+            additional_information[key] = torch.tensor(float(value), dtype=torch.float32)
+    if request_id is not None:
+        additional_information[KEY_REQUEST_ID] = request_id
+    if instruction_text is not None:
+        additional_information["instruction"] = instruction_text
+    if prompt_text is not None:
+        additional_information["prompt_text"] = prompt_text
+    if prompt_waveform_tensor is not None:
+        waveform_length = torch.tensor([int(prompt_waveform_tensor.shape[-1])], dtype=torch.int32)
+        additional_information["prompt_waveform"] = prompt_waveform_tensor
+        additional_information["prompt_waveform_length"] = waveform_length
+    if prompt_latent_value is not None:
+        additional_information[KEY_PROMPT_LATENTS] = prompt_latent_value
+    if speaker_embeddings is not None:
+        speakers = speaker_embeddings[0] if len(speaker_embeddings) == 1 else torch.stack(speaker_embeddings, dim=0)
+        additional_information[KEY_SPEAKER_EMBEDDING] = speakers
+    if use_zero_spk_emb:
+        additional_information["use_zero_spk_emb"] = True
+    return {
+        "prompt": prompt,
+        "text": text,
+        "prompt_token_ids": prompt_token_ids,
+        "additional_information": additional_information,
+    }
+
+
+def build_runtime_controls(
+    *,
+    cfg: float | None = None,
+    sigma: float | None = None,
+    temperature: float | None = None,
+    min_decode_steps: int | None = None,
+    max_decode_steps: int | None = None,
+) -> dict[str, torch.Tensor]:
+    """Pack the controls that were given into 0-dim tensors keyed by their ``KEY_*`` constants.
+
+    ``cfg``, ``sigma`` and ``temperature`` become float32; the decode-step bounds become int32.
+    """
+    specs = (
+        (KEY_CFG, cfg, float, torch.float32),
+        (KEY_SIGMA, sigma, float, torch.float32),
+        (KEY_TEMPERATURE, temperature, float, torch.float32),
+        (KEY_MIN_DECODE_STEPS, min_decode_steps, int, torch.int32),
+        (KEY_MAX_DECODE_STEPS, max_decode_steps, int, torch.int32),
+    )
+    return {key: torch.tensor(cast(raw), dtype=dtype) for key, raw, cast, dtype in specs if raw is not None}
diff --git a/vllm_omni/model_executor/models/ming_tts/speaker_extractor.py b/vllm_omni/model_executor/models/ming_tts/speaker_extractor.py
new file mode 100644
index 00000000000..b32d302658c
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_tts/speaker_extractor.py
@@ -0,0 +1,66 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adopted from https://github.com/inclusionAI/Ming-omni-tts/blob/main/spkemb_extractor.py
+import os
+
+import onnxruntime
+import torch
+import torchaudio
+import torchaudio.compliance.kaldi as kaldi
+
+
+def resolve_model_to_local_path(model):
+    """Return ``model`` if it is a local directory, else the result of ``snapshot_download(model)``."""
+    if not os.path.isdir(model):
+        from huggingface_hub import snapshot_download
+
+        return snapshot_download(model)
+    return model
+
+
+class MingSpeakerEmbeddingExtractor:
+    """Speaker-embedding extractor backed by the ``campplus.onnx`` ONNX model.
+
+    Audio is resampled to ``target_sr``, converted to 80-bin Kaldi fbank features and run on CPU.
+    """
+
+    def __init__(self, model, target_sr=16000):
+        """Load ``campplus.onnx`` from ``model`` (a local directory or a downloadable model id)."""
+        onnx_path = os.path.join(resolve_model_to_local_path(model), "campplus.onnx")
+        if not os.path.exists(onnx_path):
+            raise RuntimeError(f"Missing Ming speaker extractor model: {onnx_path}")
+
+        session_options = onnxruntime.SessionOptions()
+        session_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+        session_options.intra_op_num_threads = 2
+        self.session = onnxruntime.InferenceSession(
+            onnx_path,
+            sess_options=session_options,
+            providers=["CPUExecutionProvider"],
+        )
+        self.target_sr = int(target_sr)
+
+    def extract_from_waveform(self, waveform, sample_rate):
+        """Return the flattened float32 embedding of ``waveform`` sampled at ``sample_rate``."""
+        audio = torch.as_tensor(waveform).detach().to(torch.float32)
+        if audio.ndim == 1:
+            audio = audio.unsqueeze(0)
+        source_sr = int(sample_rate)
+        if source_sr != self.target_sr:
+            resample = torchaudio.transforms.Resample(orig_freq=source_sr, new_freq=self.target_sr)
+            audio = resample(audio)
+
+        features = kaldi.fbank(audio, num_mel_bins=80, dither=0, sample_frequency=self.target_sr)
+        features = features - features.mean(dim=0, keepdim=True)  # center along dim 0
+        input_name = self.session.get_inputs()[0].name
+        outputs = self.session.run(None, {input_name: features.unsqueeze(dim=0).cpu().numpy()})
+        return torch.tensor(outputs[0].flatten(), dtype=torch.float32)
+
+    def extract_from_file(self, audio_path):
+        """Load ``audio_path`` with ``torchaudio.load`` and embed the result."""
+        waveform, sample_rate = torchaudio.load(audio_path)
+        return self.extract_from_waveform(waveform, sample_rate)
+
+    def extract_many(self, audio_paths):
+        """Embed every path in ``audio_paths``, preserving order."""
+        return [self.extract_from_file(path) for path in audio_paths]
diff --git a/vllm_omni/model_executor/models/registry.py b/vllm_omni/model_executor/models/registry.py
index 71a3ac70a65..25d5a892984 100644
--- a/vllm_omni/model_executor/models/registry.py
+++ b/vllm_omni/model_executor/models/registry.py
@@ -118,6 +118,22 @@
         "mimo_audio_code2wav",
         "MiMoAudioToken2WavForConditionalGenerationVLLM",
     ),
+    ## ming-tts
+    "MingTTSForConditionalGeneration": (
+        "ming_tts",
+        "ming_tts",
+        "MingTTSForConditionalGeneration",
+    ),
+    "MingLLMModel": (
+        "ming_tts",
+        "ming_tts_llm",
+        "MingLLMModel",
+    ),
+    "MingAudioVAEModel": (
+        "ming_tts",
+        "ming_tts_audio_vae",
+        "MingAudioVAEModel",
+    ),
     ## glm_image
     "GlmImageForConditionalGeneration": (
         "glm_image",
diff --git a/vllm_omni/model_executor/stage_configs/ming_tts.yaml b/vllm_omni/model_executor/stage_configs/ming_tts.yaml
new file mode 100644
index 00000000000..01063afcf86
--- /dev/null
+++ b/vllm_omni/model_executor/stage_configs/ming_tts.yaml
@@ -0,0 +1,65 @@
+async_chunk: false
+stage_args:
+  - stage_id: 0
+    stage_type: llm
+    runtime:
+      process: true
+      devices: "0"
+    engine_args:
+      dtype: bfloat16
+      max_num_seqs: 1
+      model_stage: llm
+      model_arch: MingTTSForConditionalGeneration
+      hf_config_name: llm_config
+      worker_type: ar
+      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
+      gpu_memory_utilization: 0.45
+      enforce_eager: true
+      trust_remote_code: false
+      async_scheduling: false
+      enable_prefix_caching: false
+      engine_output_type: latent
+      max_model_len: 8192
+      max_num_batched_tokens: 8192
+    is_comprehension: true
+    final_output: true
+    final_output_type: text
+    default_sampling_params:
+      temperature: 0.0
+      top_p: 1.0
+      top_k: -1
+      max_tokens: 512
+      detokenize: true
+
+  - stage_id: 1
+    stage_type: llm
+    runtime:
+      process: true
+      devices: "0"
+    engine_args:
+      dtype: bfloat16
+      max_num_seqs: 1
+      model_stage: audio_vae
+      model_arch: MingTTSForConditionalGeneration
+      hf_config_name: llm_config
+      worker_type: generation
+      scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
+      gpu_memory_utilization: 0.25
+      enforce_eager: true
+      trust_remote_code: false
+      async_scheduling: false
+      enable_prefix_caching: false
+      engine_output_type: audio
+      max_model_len: 8192
+      max_num_batched_tokens: 8192
+    engine_input_source: [0]
+    custom_process_input_func: vllm_omni.model_executor.stage_input_processors.ming_tts.llm2audio_vae
+    is_comprehension: false
+    final_output: true
+    final_output_type: audio
+    default_sampling_params:
+      temperature: 0.0
+      top_p: 1.0
+      top_k: -1
+      max_tokens: 1
+      detokenize: false
diff --git a/vllm_omni/model_executor/stage_configs/ming_tts_async_chunk.yaml b/vllm_omni/model_executor/stage_configs/ming_tts_async_chunk.yaml
new file mode 100644
index 00000000000..b7ffc8212ee
--- /dev/null
+++ b/vllm_omni/model_executor/stage_configs/ming_tts_async_chunk.yaml
@@ -0,0 +1,86 @@
+async_chunk: true
+stage_args:
+  - stage_id: 0
+    stage_type: llm
+    runtime:
+      process: true
+      devices: "0"
+    engine_args:
+      dtype: bfloat16
+      max_num_seqs: 1
+      model_stage: llm
+      model_arch: MingTTSForConditionalGeneration
+      hf_config_name: llm_config
+      worker_type: ar
+      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
+      gpu_memory_utilization: 0.45
+      enforce_eager: true
+      trust_remote_code: false
+      enable_prefix_caching: false
+      engine_output_type: latent
+      max_model_len: 8192
+      max_num_batched_tokens: 8192
+      custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.ming_tts.llm2audio_vae_async_chunk
+    is_comprehension: true
+    final_output: true
+    final_output_type: text
+    default_sampling_params:
+      temperature: 0.0
+      top_p: 1.0
+      top_k: -1
+      max_tokens: 512
+      detokenize: true
+    output_connectors:
+      to_stage_1: connector_of_shared_memory
+
+  - stage_id: 1
+    stage_type: llm
+    runtime:
+      process: true
+      devices: "0"
+    engine_args:
+      dtype: bfloat16
+      max_num_seqs: 1
+      model_stage: audio_vae
+      model_arch: MingTTSForConditionalGeneration
+      hf_config_name: llm_config
+      worker_type: generation
+      scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
+      gpu_memory_utilization: 0.25
+      enforce_eager: true
+      trust_remote_code: false
+      async_scheduling: false
+      enable_prefix_caching: false
+      engine_output_type: audio
+      max_model_len: 8192
+      max_num_batched_tokens: 8192
+    engine_input_source: [0]
+    is_comprehension: false
+    final_output: true
+    final_output_type: audio
+    input_connectors:
+      from_stage_0: connector_of_shared_memory
+    default_sampling_params:
+      temperature: 0.0
+      top_p: 1.0
+      top_k: -1
+      max_tokens: 1
+      detokenize: false
+
+runtime:
+  enabled: true
+  defaults:
+    window_size: -1
+    max_inflight: 1
+
+  connectors:
+    connector_of_shared_memory:
+      name: SharedMemoryConnector
+      extra:
+        latent_chunk_size: 25
+        latent_left_context: 0
+
+  edges:
+    - from: 0
+      to: 1
+      window_size: -1
diff --git a/vllm_omni/model_executor/stage_input_processors/ming_tts.py b/vllm_omni/model_executor/stage_input_processors/ming_tts.py
new file mode 100644
index 00000000000..0017361a9d1
--- /dev/null
+++ b/vllm_omni/model_executor/stage_input_processors/ming_tts.py
@@ -0,0 +1,278 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
+from typing import Any
+
+import torch
+from vllm.inputs import TextPrompt
+from vllm.logger import init_logger
+
+from vllm_omni.inputs.data import OmniTokensPrompt
+from vllm_omni.model_executor.models.ming_tts.config_ming_tts import (
+    KEY_CHUNK_ID,
+    KEY_REQUEST_ID,
+    LATENT_CHUNK_SIZE,
+    LATENT_DIM,
+    LATENT_LEFT_CONTEXT,
+    PATCH_SIZE,
+)
+
+logger = init_logger(__name__)
+
+MING_EMIT_PATCH_COUNT_KEY = "ming_emit_patch_count"
+MING_LATENT_SHAPE_KEY = "ming_latent_shape"
+MING_ESTIMATED_BYTES_KEY = "ming_estimated_bytes"
+MING_FINAL_FLUSH_KEY = "ming_final_flush"
+MING_STOP_REASON_KEY = "ming_stop_reason"
+MING_FINAL_DECODE_STEP_KEY = "ming_final_decode_step"
+
+
+def _extract_last_patch(pooling_output: dict[str, Any] | None) -> torch.Tensor | None:
+    """Return the last active latent patch of ``pooling_output`` as float32 on CPU, or ``None``."""
+    if not isinstance(pooling_output, dict):
+        return None
+    mask = pooling_output.get("ming_has_patch")
+    latent = pooling_output.get("ming_latent_patch")
+    if not isinstance(latent, torch.Tensor) or latent.numel() == 0:
+        return None
+    # Prefer the row of the last positive ming_has_patch entry; else the last row of a 3-D patch.
+    if isinstance(mask, torch.Tensor) and mask.numel() > 0:
+        active_rows = torch.nonzero(mask.reshape(-1) > 0, as_tuple=True)[0]
+        if active_rows.numel() == 0:
+            return None
+        latent = latent[int(active_rows[-1].item())]
+    elif latent.ndim == 3:
+        latent = latent[-1]
+    if latent.ndim != 2:
+        raise ValueError(f"Invalid Ming latent patch shape: {tuple(latent.shape)}")
+    return latent.to(torch.float32).cpu()
+
+
+def _extract_all_patches(pooling_output: dict[str, Any] | None) -> torch.Tensor | None:
+    """Return every active latent patch as one 3-D float32 CPU tensor, or ``None`` if there is none."""
+    if not isinstance(pooling_output, dict):
+        return None
+    mask = pooling_output.get("ming_has_patch")
+    latents = pooling_output.get("ming_latent_patch")
+    if not isinstance(latents, torch.Tensor) or latents.numel() == 0:
+        return None
+
+    # A single 2-D patch is handled as a stack of one.
+    latents = latents.unsqueeze(0) if latents.ndim == 2 else latents
+    if latents.ndim != 3:
+        raise ValueError(f"Invalid Ming latent patch tensor shape: {tuple(latents.shape)}")
+    if isinstance(mask, torch.Tensor) and mask.numel() > 0:
+        # Keep only the rows whose ming_has_patch entry is positive.
+        keep = torch.nonzero(mask.reshape(-1) > 0, as_tuple=True)[0]
+        if keep.numel() == 0:
+            return None
+        latents = latents.index_select(0, keep.to(device=latents.device))
+    if latents.numel() == 0:
+        return None
+    return latents.to(torch.float32).cpu()
+
+
+def _extract_last_value(pooling_output: dict[str, Any] | None, key: str) -> Any:
+    """Return the entry of ``pooling_output[key]`` for the last active patch.
+
+    The index is the last positive ``ming_has_patch`` entry, clamped to the value's length
+    (the final element when no usable mask exists). Tensors yield a Python scalar; values
+    that are neither tensor nor list/tuple are returned unchanged.
+    """
+    if not isinstance(pooling_output, dict):
+        return None
+    value = pooling_output.get(key)
+    if value is None:
+        return None
+
+    index = -1  # -1 addresses the final element when no mask narrows the choice
+    mask = pooling_output.get("ming_has_patch")
+    if isinstance(mask, torch.Tensor) and mask.numel() > 0:
+        active = torch.nonzero(mask.reshape(-1) > 0, as_tuple=True)[0]
+        if active.numel() == 0:
+            return None
+        index = int(active[-1].item())
+
+    if isinstance(value, torch.Tensor):
+        flat = value.reshape(-1)
+        return flat[min(index, flat.numel() - 1)].item() if flat.numel() > 0 else None
+    if isinstance(value, (list, tuple)):
+        return value[min(index, len(value) - 1)] if value else None
+    return value
+
+
+def _get_async_chunk_config_value(cfg: dict[str, Any], key: str, fallback: int) -> int:
+    """Return ``int(cfg[key])``, or ``fallback`` unchanged (with a warning) when ``key`` is absent."""
+    if key not in cfg:
+        logger.warning("Ming async chunk config missing %s, using fallback value %s", key, fallback)
+    return int(cfg[key]) if key in cfg else fallback
+
+
+def _get_async_chunk_config(transfer_manager: Any) -> tuple[int, int]:
+    """Return the validated ``(latent_chunk_size, latent_left_context)`` from the connector config."""
+    connector_cfg = getattr(getattr(transfer_manager, "connector", None), "config", {}) or {}
+    settings = connector_cfg.get("extra", connector_cfg) if isinstance(connector_cfg, dict) else {}
+
+    chunk_size = _get_async_chunk_config_value(settings, "latent_chunk_size", LATENT_CHUNK_SIZE)
+    left_context = _get_async_chunk_config_value(settings, "latent_left_context", LATENT_LEFT_CONTEXT)
+    if chunk_size <= 0:
+        raise ValueError(f"Invalid Ming latent_chunk_size={chunk_size}")
+    # Stage-2 VAE caches past_key_values and stream_state by request_id.
+    # Replaying left-context latents would double-feed cached decoder state.
+    if left_context == 0:
+        return chunk_size, left_context
+    raise ValueError(
+        "Ming async chunk transport does not support latent_left_context replay. "
+        "Ming boundary continuity is handled by per-request decoder state cache, not "
+        f"latent replay. Got latent_left_context={left_context}."
+    )
+
+
+def _build_chunk_observability(
+    latent_patches: torch.Tensor | None,
+    *,
+    final_flush: bool,
+) -> dict[str, Any]:
+    """Summarize an outgoing chunk for observability.
+
+    Reports the patch count (first dimension), shape and byte size of ``latent_patches``
+    (0 / ``None`` / 0 when it is ``None``) together with the ``final_flush`` flag.
+    """
+    has_patches = latent_patches is not None
+    return {
+        MING_EMIT_PATCH_COUNT_KEY: int(latent_patches.shape[0]) if has_patches else 0,
+        MING_LATENT_SHAPE_KEY: tuple(latent_patches.shape) if has_patches else None,
+        MING_ESTIMATED_BYTES_KEY: (
+            int(latent_patches.numel() * latent_patches.element_size()) if has_patches else 0
+        ),
+        MING_FINAL_FLUSH_KEY: bool(final_flush),
+    }
+
+
+def llm2audio_vae_async_chunk(
+    transfer_manager: Any,
+    pooling_output: dict[str, Any] | None,
+    request: Any,
+    is_finished: bool = False,
+) -> dict[str, Any] | None:
+    """Buffer the newest latent patch and return the next chunk payload when one is due.
+
+    A payload is due after ``latent_chunk_size`` new patches, or once on finish; else ``None``.
+    """
+    request_id = request.external_req_id
+    chunk_id = int(transfer_manager.put_req_chunk[request_id])
+    finished = bool(is_finished or request.is_finished())
+    final_decode_step = _extract_last_value(pooling_output, "ming_decode_step")
+    stop_reason = _extract_last_value(pooling_output, MING_STOP_REASON_KEY)
+    request_state = transfer_manager.request_payload.get(request_id)
+    if not isinstance(request_state, dict) or "_ming_async_state" not in request_state:  # no state stored yet
+        request_state = {
+            "_ming_async_state": {
+                "seen_patch_len": 0,  # number of buffered patches already emitted
+                "terminal_sent": False,  # set once the finished payload was returned
+            }
+        }
+        transfer_manager.request_payload[request_id] = request_state
+    state = request_state["_ming_async_state"]
+    if bool(state.get("terminal_sent", False)):  # the finished payload already went out
+        return None
+
+    patch = _extract_last_patch(pooling_output)
+    if patch is not None:
+        transfer_manager.code_prompt_token_ids[request_id].append(patch)
+    chunk_size, _ = _get_async_chunk_config(transfer_manager)
+    patches = transfer_manager.code_prompt_token_ids[request_id]
+    seen_patch_len = int(state.get("seen_patch_len", 0))
+    new_patches = patches[seen_patch_len:] if seen_patch_len < len(patches) else []
+    length = len(new_patches)
+    if length <= 0:
+        if finished and not bool(state.get("terminal_sent", False)):  # nothing pending: empty terminal payload
+            observability = _build_chunk_observability(None, final_flush=True)
+            payload = {
+                "code_predictor_codes": [],
+                "finished": torch.tensor(True, dtype=torch.bool),
+                "stream_finished": torch.tensor(True, dtype=torch.bool),
+                KEY_CHUNK_ID: chunk_id,
+                KEY_REQUEST_ID: request_id,
+                **observability,
+            }
+            if final_decode_step is not None:
+                payload[MING_FINAL_DECODE_STEP_KEY] = int(final_decode_step)
+            if stop_reason is not None:
+                payload[MING_STOP_REASON_KEY] = stop_reason
+            state["terminal_sent"] = True
+            return payload
+        return None
+
+    chunk_length = length % chunk_size
+    if chunk_length != 0 and not finished:  # partial chunk and not finished: keep buffering
+        return None
+    emit_count = chunk_length if chunk_length != 0 else chunk_size
+    emit_patches = list(new_patches[:emit_count])
+    state["seen_patch_len"] = seen_patch_len + len(emit_patches)
+    latent_patches = torch.stack(emit_patches, dim=0)
+    observability = _build_chunk_observability(latent_patches, final_flush=finished)
+    payload = {
+        "code_predictor_codes": [0],
+        "ming_latent_patches": latent_patches,
+        "finished": torch.tensor(finished, dtype=torch.bool),
+        "stream_finished": torch.tensor(finished, dtype=torch.bool),
+        KEY_CHUNK_ID: chunk_id,
+        KEY_REQUEST_ID: request_id,
+        **observability,
+    }
+    if final_decode_step is not None:
+        payload[MING_FINAL_DECODE_STEP_KEY] = int(final_decode_step)
+    if stop_reason is not None:
+        payload[MING_STOP_REASON_KEY] = stop_reason
+    if finished:
+        state["terminal_sent"] = True
+    return payload
+
+
+def llm2audio_vae(
+    stage_list: list[Any],
+    engine_input_source: list[int],
+    prompt: OmniTokensPrompt | TextPrompt | None = None,
+    requires_multimodal_data: bool = False,
+) -> list[OmniTokensPrompt]:
+    """Convert the finished outputs of the first source stage into audio-VAE prompts.
+
+    Each prompt's ``additional_information`` holds the active latent patches (an empty
+    ``(0, PATCH_SIZE, LATENT_DIM)`` tensor if none), the request id, a ``finished`` flag and,
+    when present, the final decode step and stop reason. Unfinished outputs are skipped.
+    """
+    del prompt, requires_multimodal_data
+    if not engine_input_source:
+        raise ValueError("engine_input_source cannot be empty")
+    source_stage_id = engine_input_source[0]
+    if source_stage_id >= len(stage_list):
+        raise IndexError(f"Invalid stage_id: {source_stage_id}")
+    if stage_list[source_stage_id].engine_outputs is None:
+        raise RuntimeError(f"Stage {source_stage_id} has no outputs yet")
+    vae_prompts = []
+    for stage_output in stage_list[source_stage_id].engine_outputs:
+        finished = bool(getattr(stage_output, "finished", True))
+        if not finished:
+            continue
+        mm_output = stage_output.outputs[0].multimodal_output
+        patches = _extract_all_patches(mm_output)
+        if patches is None:
+            patches = torch.zeros((0, PATCH_SIZE, LATENT_DIM), dtype=torch.float32)
+        info = {
+            "ming_latent_patches": patches,
+            KEY_REQUEST_ID: getattr(stage_output, "request_id", None),
+            "finished": torch.tensor(finished, dtype=torch.bool),
+        }
+        final_decode_step = _extract_last_value(mm_output, "ming_decode_step")
+        stop_reason = _extract_last_value(mm_output, MING_STOP_REASON_KEY)
+        if final_decode_step is not None:
+            info[MING_FINAL_DECODE_STEP_KEY] = int(final_decode_step)
+        if stop_reason is not None:
+            info[MING_STOP_REASON_KEY] = stop_reason
+        vae_prompt = OmniTokensPrompt(
+            prompt_token_ids=[0], multi_modal_data=None, mm_processor_kwargs=None, additional_information=info
+        )
+        vae_prompts.append(vae_prompt)
+    return vae_prompts
diff --git a/vllm_omni/worker/gpu_model_runner.py b/vllm_omni/worker/gpu_model_runner.py
index 7a6f3b4538d..f102b890ad8 100644
--- a/vllm_omni/worker/gpu_model_runner.py
+++ b/vllm_omni/worker/gpu_model_runner.py
@@ -1287,6 +1287,8 @@ def _preprocess(
             decode_req_ids = []
             for req_index, req_id in enumerate(self.input_batch.req_ids):
                 req_infos = self.model_intermediate_buffer.get(req_id, {})
+                req_infos = dict(req_infos) if isinstance(req_infos, dict) else {}
+                req_infos.setdefault("req_id", req_id)
 
                 # mimo-audio check
                 req_state = self.requests.get(req_id)

From 9cda9108051d0759cbad41bba0c47ea6ae1a5c28 Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Sun, 19 Apr 2026 00:34:08 +0530
Subject: [PATCH 02/54] fix(ming-tts): serialize stage0 stop reason as tensor

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 .../models/ming_tts/ming_tts_llm.py             | 14 +++++++++-----
 .../stage_input_processors/ming_tts.py          | 17 +++++++++++++++--
 2 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/vllm_omni/model_executor/models/ming_tts/ming_tts_llm.py b/vllm_omni/model_executor/models/ming_tts/ming_tts_llm.py
index a811009e0ba..1619991d3b9 100644
--- a/vllm_omni/model_executor/models/ming_tts/ming_tts_llm.py
+++ b/vllm_omni/model_executor/models/ming_tts/ming_tts_llm.py
@@ -41,6 +41,11 @@
 MING_STOP_REASON_STOP_HEAD = "stop_head"
 MING_STOP_REASON_MAX_DECODE_STEPS = "max_decode_steps"
 MING_STOP_REASON_KEY = "ming_stop_reason"
+MING_STOP_REASON_CODES = {
+    MING_STOP_REASON_CONTINUE: 0,
+    MING_STOP_REASON_STOP_HEAD: 1,
+    MING_STOP_REASON_MAX_DECODE_STEPS: 2,
+}
 
 
 class MingLLMModel(nn.Module):
@@ -173,8 +178,8 @@ def forward(
         decode_step_tokens = None
         has_patch = None
         max_decode_step_tokens = None
+        stop_reason_code_tokens = None
         pending_updates: dict[str, dict[str, Any]] = {}
-        stop_reason_tokens: list[str] | None = None
         sampled_decode_steps = []
         sampled_stop_probs = []
         sampled_max_decode_steps = []
@@ -241,7 +246,7 @@ def forward(
                 max_decode_step_tokens = torch.zeros((total_tokens,), dtype=torch.int32, device=hidden_states.device)
                 min_decode_step_tokens = torch.zeros((total_tokens,), dtype=torch.int32, device=hidden_states.device)
                 has_patch = torch.zeros((total_tokens,), dtype=torch.bool, device=hidden_states.device)
-                stop_reason_tokens = [MING_STOP_REASON_CONTINUE] * total_tokens
+                stop_reason_code_tokens = torch.zeros((total_tokens,), dtype=torch.int32, device=hidden_states.device)
 
             latent_patch_tokens[output_index : output_index + 1] = sampled_token_latent
             next_embed_tokens[output_index : output_index + 1] = next_embeds
@@ -265,8 +270,7 @@ def forward(
                 audio_dummy_token_id=int(self.ming_config.audio_dummy_token_id),
                 text_eos_token_id=int(self.ming_config.text_eos_token_id),
             )
-            if stop_reason_tokens is not None:
-                stop_reason_tokens[output_index] = stop_reason
+            stop_reason_code_tokens[output_index : output_index + 1] = MING_STOP_REASON_CODES[stop_reason]
             if isinstance(req_id, str):
                 pending_updates[req_id] = {
                     KEY_LATENT_HISTORY: new_history,
@@ -318,7 +322,7 @@ def forward(
                 "ming_max_decode_steps": max_decode_step_tokens,
                 "ming_min_decode_steps": min_decode_step_tokens,
                 "ming_has_patch": has_patch,
-                MING_STOP_REASON_KEY: tuple(stop_reason_tokens or []),
+                MING_STOP_REASON_KEY: stop_reason_code_tokens,
             },
             intermediate_tensors=intermediate_tensors,
         )
diff --git a/vllm_omni/model_executor/stage_input_processors/ming_tts.py b/vllm_omni/model_executor/stage_input_processors/ming_tts.py
index 0017361a9d1..b50f22e6a02 100644
--- a/vllm_omni/model_executor/stage_input_processors/ming_tts.py
+++ b/vllm_omni/model_executor/stage_input_processors/ming_tts.py
@@ -26,6 +26,11 @@
 MING_FINAL_FLUSH_KEY = "ming_final_flush"
 MING_STOP_REASON_KEY = "ming_stop_reason"
 MING_FINAL_DECODE_STEP_KEY = "ming_final_decode_step"
+MING_STOP_REASON_BY_CODE = {
+    0: "continue",
+    1: "stop_head",
+    2: "max_decode_steps",
+}
 
 
 def _extract_last_patch(pooling_output: dict[str, Any] | None) -> torch.Tensor | None:
@@ -102,6 +107,14 @@ def _extract_last_value(pooling_output: dict[str, Any] | None, key: str) -> Any:
     return value
 
 
+def _decode_stop_reason(value: Any) -> str | None:
+    """Return the stop-reason name for a numeric code; names and ``None`` pass through."""
+    # Codes missing from MING_STOP_REASON_BY_CODE decode to None instead of raising.
+    if value is None or isinstance(value, str):
+        return value
+    return MING_STOP_REASON_BY_CODE.get(int(value))
+
+
 def _get_async_chunk_config_value(cfg: dict[str, Any], key: str, fallback: int) -> int:
     if key not in cfg:
         logger.warning("Ming async chunk config missing %s, using fallback value %s", key, fallback)
@@ -160,7 +173,7 @@ def llm2audio_vae_async_chunk(
     chunk_id = int(transfer_manager.put_req_chunk[request_id])
     finished = bool(is_finished or request.is_finished())
     final_decode_step = _extract_last_value(pooling_output, "ming_decode_step")
-    stop_reason = _extract_last_value(pooling_output, MING_STOP_REASON_KEY)
+    stop_reason = _decode_stop_reason(_extract_last_value(pooling_output, MING_STOP_REASON_KEY))
     request_state = transfer_manager.request_payload.get(request_id)
     if not isinstance(request_state, dict) or "_ming_async_state" not in request_state:
         request_state = {
@@ -262,7 +275,7 @@ def llm2audio_vae(
             "finished": torch.tensor(finished, dtype=torch.bool),
         }
         final_decode_step = _extract_last_value(output.multimodal_output, "ming_decode_step")
-        stop_reason = _extract_last_value(output.multimodal_output, MING_STOP_REASON_KEY)
+        stop_reason = _decode_stop_reason(_extract_last_value(output.multimodal_output, MING_STOP_REASON_KEY))
         if final_decode_step is not None:
             additional_information[MING_FINAL_DECODE_STEP_KEY] = int(final_decode_step)
         if stop_reason is not None:

From 9add4ef7c63fe7f409dd7d117ca933ff83fc3691 Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Sun, 19 Apr 2026 01:30:04 +0530
Subject: [PATCH 03/54] docs: Update Ming TTS example

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 docs/models/supported_models.md               |  2 +-
 .../examples/offline_inference/ming_tts.md    | 15 +++++++++---
 .../examples/online_serving/ming_tts.md       | 23 +++++++++++++++++++
 examples/offline_inference/ming_tts/README.md | 18 +++++++++++----
 4 files changed, 49 insertions(+), 9 deletions(-)

diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index b298415e177..04cdae58a2a 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -55,7 +55,7 @@ th {
 | `Qwen3TTSForConditionalGeneration` | Qwen3-TTS-12Hz-1.7B-CustomVoice | `Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
 | `Qwen3TTSForConditionalGeneration` | Qwen3-TTS-12Hz-1.7B-VoiceDesign | `Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
 | `Qwen3TTSForConditionalGeneration` | Qwen3-TTS-12Hz-1.7B-Base | `Qwen/Qwen3-TTS-12Hz-0.6B-Base` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
-| `MingTTSForConditionalGeneration` | Ming-omni-tts-0.5B | `inclusionAI/Ming-omni-tts-0.5B` | ✅︎ | | | |
+| `MingTTSForConditionalGeneration` | Ming-omni-tts dense 0.5B | `inclusionAI/Ming-omni-tts-0.5B` | ✅︎ | | | |
 | `NextStep11Pipeline` | NextStep-1.1 | `stepfun-ai/NextStep-1.1` | ✅︎ | ✅︎ | | ✅︎ |
 | `MiMoAudioForConditionalGeneration` | MiMo-Audio-7B-Instruct | `XiaomiMiMo/MiMo-Audio-7B-Instruct` | ✅︎ | ✅︎ | | |
 | `Flux2Pipeline` | FLUX.2-dev | `black-forest-labs/FLUX.2-dev` | ✅︎ | ✅︎ | | |
diff --git a/docs/user_guide/examples/offline_inference/ming_tts.md b/docs/user_guide/examples/offline_inference/ming_tts.md
index 7a8cd65ed32..4a572873e42 100644
--- a/docs/user_guide/examples/offline_inference/ming_tts.md
+++ b/docs/user_guide/examples/offline_inference/ming_tts.md
@@ -2,7 +2,7 @@
 
 Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/ming_tts>.
 
-This directory contains an offline Ming example that uses the in-repo Ming prompt builder directly. It now covers the broader upstream dense TTS cookbook surface: style, IP, music-only generation, emotion, dialect, zero-shot clone, podcast, speech+bgm, and speech+sound.
+This directory contains an offline Ming example that uses the in-repo Ming prompt builder directly. It covers the broader upstream dense 0.5B surface: style, IP, music-only generation, TTA, emotion, dialect, zero-shot clone, podcast, speech+bgm, and speech+sound.
 
 ## Quick Start
 
@@ -46,6 +46,15 @@ python examples/offline_inference/ming_tts/end2end.py \
     --enforce-eager
 ```
 
+Run text-to-audio event generation:
+
+```bash
+python examples/offline_inference/ming_tts/end2end.py \
+    --case tta \
+    --stage-configs-path vllm_omni/model_executor/stage_configs/ming_tts.yaml \
+    --enforce-eager
+```
+
 Run with stats and a manifest:
 
 ```bash
@@ -63,6 +72,7 @@ python examples/offline_inference/ming_tts/end2end.py \
 - `style`: zero-speaker style-conditioned speech
 - `ip`: zero-speaker IP voice generation
 - `bgm`: music generation
+- `tta`: text-to-audio event generation with FlowLoss controls
 - `emotion`: reference-audio speech with emotion control
 - `basic`: reference-audio cloning with speed / pitch / volume control
 - `dialect`: reference-audio cloning with dialect control
@@ -71,8 +81,6 @@ python examples/offline_inference/ming_tts/end2end.py \
 - `speech_bgm`: speech with background music conditioning
 - `speech_sound`: speech with environment sound conditioning
 
-`TTA` from the upstream Ming notebook is not included here because it uses `inclusionAI/Ming-omni-tta-0.5B`, not the dense TTS model covered by this example.
-
 ## Streaming
 
 Use async_chunk streaming with `AsyncOmni`:
@@ -99,6 +107,7 @@ validation helper:
 | `style` | Yes | Optional smoke test | none |
 | `ip` | Yes | Optional smoke test | none |
 | `bgm` | Yes | Optional smoke test | none |
+| `tta` | Yes | Optional smoke test | none |
 | `emotion` | Yes | Yes | reference WAV |
 | `basic` | Yes | Yes | reference WAV |
 | `dialect` | Yes | Yes | reference WAV |
diff --git a/docs/user_guide/examples/online_serving/ming_tts.md b/docs/user_guide/examples/online_serving/ming_tts.md
index e5bc5144dda..dd16a732106 100644
--- a/docs/user_guide/examples/online_serving/ming_tts.md
+++ b/docs/user_guide/examples/online_serving/ming_tts.md
@@ -126,6 +126,29 @@ The bundled `run_curl.sh basic` mode is plain/default TTS and does not require
 `REF_AUDIO`. The upstream cookbook-style `basic` case uses `ref_audio` plus
 structured speed / pitch / volume instructions.
 
+## Request Types
+
+Ming online serving supports these request families through `/v1/audio/speech`:
+
+| Case | Online support | Required fields |
+|------|----------------|-----------------|
+| default TTS | Supported | `input`, `max_new_tokens=200` |
+| `style` | Supported | `input`, `instructions`, `max_new_tokens=200` |
+| `ip` | Supported | `input`, `voice`, `max_new_tokens=200` |
+| `basic` helper | Supported | `input`, `max_new_tokens=200` |
+| upstream `basic` case | Supported | `input`, `ref_audio`, structured speed / pitch / volume `instructions`, `max_new_tokens=200` |
+| `emotion` | Supported | `input`, `ref_audio`, structured emotion `instructions`, `max_new_tokens=200` |
+| `dialect` | Supported | `input`, `language` or structured `instructions`, `ref_audio`, `max_new_tokens=200` |
+| `zero_shot` | Supported | `input`, `ref_audio`, `ref_text`, `max_new_tokens=200` |
+| `podcast` | Supported | `input`, repeated/list `ref_audio`, `ref_text`, `max_new_tokens=200` |
+| `speech_bgm` | Supported | `input`, `ref_audio`, structured `instructions` with `{"BGM": ...}`, `max_new_tokens=200` |
+| `speech_sound` | Supported | `input`, `ref_audio`, structured `instructions` with `{"BGM": {"ENV": ...}}`, `max_new_tokens=200` |
+| `bgm` | Not supported online | Requires a future `prompt_mode=music` API extension |
+| `tta` | Not supported online | Requires a future `prompt_mode=tta` API extension |
+
+The online endpoint currently accepts only speech-style requests. Music-only `bgm`
+and text-to-audio `tta` remain offline workflows.
+
 ## Field Mapping
 
 For Ming, the generic OpenAI request fields map to Ming controls like this:
diff --git a/examples/offline_inference/ming_tts/README.md b/examples/offline_inference/ming_tts/README.md
index c67772c43c1..4077210f25f 100644
--- a/examples/offline_inference/ming_tts/README.md
+++ b/examples/offline_inference/ming_tts/README.md
@@ -29,11 +29,12 @@ resampling, and CampPlus speaker extraction, including `soundfile`,
 
 ## Supported Cases
 
-These cases cover the upstream dense TTS cookbook surface that maps cleanly onto the current vLLM-Omni example:
+These cases cover the upstream dense 0.5B cookbook surface that maps cleanly onto the current vLLM-Omni example:
 
 - `style`: zero-speaker style-conditioned speech
 - `ip`: zero-speaker IP voice generation
 - `bgm`: music-only generation
+- `tta`: text-to-audio event generation with FlowLoss controls
 - `emotion`: reference-audio speech with emotion control
 - `basic`: reference-audio speech with speed / pitch / volume control
 - `dialect`: reference-audio speech with dialect control
@@ -42,10 +43,6 @@ These cases cover the upstream dense TTS cookbook surface that maps cleanly onto
 - `speech_bgm`: speech with background music conditioning
 - `speech_sound`: speech with environmental sound conditioning
 
-Not included:
-
-- `TTA` from the upstream cookbook. That notebook switches to `inclusionAI/Ming-omni-tta-0.5B`, which is a different model family and is out of scope for this dense TTS example.
-
 ## Quick Start
 
 Run the zero-speaker style example:
@@ -98,6 +95,15 @@ If you already have precomputed multi-speaker embeddings, you can override extra
 
 where the JSON is a list of speaker embeddings, one 192-d vector per speaker.
 
+Run text-to-audio event generation:
+
+```bash
+python examples/offline_inference/ming_tts/end2end.py \
+    --case tta \
+    --stage-configs-path vllm_omni/model_executor/stage_configs/ming_tts.yaml \
+    --enforce-eager
+```
+
 Use async_chunk streaming:
 
 ```bash
@@ -146,6 +152,7 @@ by the local Ming validation script:
 | `style` | Yes | Optional smoke test | none |
 | `ip` | Yes | Optional smoke test | none |
 | `bgm` | Yes | Optional smoke test | none |
+| `tta` | Yes | Optional smoke test | none |
 | `emotion` | Yes | Yes | `--ref-audio emotion_prompt.wav` |
 | `basic` | Yes | Yes | `--ref-audio 10002287-00000095.wav` |
 | `dialect` | Yes | Yes | `--ref-audio yue_prompt.wav` |
@@ -165,6 +172,7 @@ and Stage-1 patch counts for every case:
 | `style` | 409248 / 29 / 9.28 | 409248 / 29 / 9.28 |
 | `ip` | 183456 / 13 / 4.16 | 183456 / 13 / 4.16 |
 | `bgm` | 1326528 / 94 / 30.08 | 1326528 / 94 / 30.08 |
+| `tta` | 465696 / 33 / 10.56 | 465696 / 33 / 10.56 |
 | `emotion` | 324576 / 23 / 7.36 | 324576 / 23 / 7.36 |
 | `basic` | 211680 / 15 / 4.80 | 211680 / 15 / 4.80 |
 | `dialect` | 239904 / 17 / 5.44 | 239904 / 17 / 5.44 |

From 1bee58d878ec8a4ac51d0adb0e7ae2d6c0f34f7c Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Thu, 23 Apr 2026 23:58:51 +0530
Subject: [PATCH 04/54] Refactor Ming TTS model layout

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 .../ming_tts/test_ming_tts_components.py      |   4 +-
 tests/worker/test_ming_tts_runner.py          |   4 +-
 .../models/ming_tts/aggregator.py             |  76 +++
 .../models/ming_tts/backbone.py               |  62 ++
 .../models/ming_tts/config_ming_tts.py        | 342 +++-------
 .../models/ming_tts/constants.py              |  71 +++
 .../{fm/flowloss.py => flowloss_head.py}      |   7 +-
 .../model_executor/models/ming_tts/fm/dit.py  |  53 --
 .../model_executor/models/ming_tts/loader.py  | 296 +++++++++
 .../models/ming_tts/ming_tts.py               | 445 ++-----------
 .../models/ming_tts/ming_tts_llm.py           | 586 +++++-------------
 .../models/ming_tts/patch_emission.py         | 210 +++++++
 .../models/ming_tts/prompt_builder.py         | 429 -------------
 .../ming_tts/prompt_builder/__init__.py       |  37 ++
 .../models/ming_tts/prompt_builder/_base.py   | 210 +++++++
 .../ming_tts/prompt_builder/builders.py       | 210 +++++++
 .../models/ming_tts/validation.py             | 175 ++++++
 17 files changed, 1658 insertions(+), 1559 deletions(-)
 create mode 100644 vllm_omni/model_executor/models/ming_tts/aggregator.py
 create mode 100644 vllm_omni/model_executor/models/ming_tts/backbone.py
 create mode 100644 vllm_omni/model_executor/models/ming_tts/constants.py
 rename vllm_omni/model_executor/models/ming_tts/{fm/flowloss.py => flowloss_head.py} (92%)
 create mode 100644 vllm_omni/model_executor/models/ming_tts/loader.py
 create mode 100644 vllm_omni/model_executor/models/ming_tts/patch_emission.py
 delete mode 100644 vllm_omni/model_executor/models/ming_tts/prompt_builder.py
 create mode 100644 vllm_omni/model_executor/models/ming_tts/prompt_builder/__init__.py
 create mode 100644 vllm_omni/model_executor/models/ming_tts/prompt_builder/_base.py
 create mode 100644 vllm_omni/model_executor/models/ming_tts/prompt_builder/builders.py
 create mode 100644 vllm_omni/model_executor/models/ming_tts/validation.py

diff --git a/tests/model_executor/models/ming_tts/test_ming_tts_components.py b/tests/model_executor/models/ming_tts/test_ming_tts_components.py
index 14c4c02db05..866ef4b3f29 100644
--- a/tests/model_executor/models/ming_tts/test_ming_tts_components.py
+++ b/tests/model_executor/models/ming_tts/test_ming_tts_components.py
@@ -6,19 +6,19 @@
 import torch
 import torch.nn as nn
 
+from vllm_omni.model_executor.models.ming_tts.aggregator import Aggregator
 from vllm_omni.model_executor.models.ming_tts.audio_tokenizer.configuration_audio_vae import AudioVAEconfig
 from vllm_omni.model_executor.models.ming_tts.audio_tokenizer.istft import ISTFT, ISTFTHead
 from vllm_omni.model_executor.models.ming_tts.audio_tokenizer.modeling_audio_vae import AudioVAE
 from vllm_omni.model_executor.models.ming_tts.audio_tokenizer.vae_modules import StreamingLinearUpsample
+from vllm_omni.model_executor.models.ming_tts.flowloss_head import FlowLoss
 from vllm_omni.model_executor.models.ming_tts.fm.cfm import CFM, Solver, get_epss_timesteps
 from vllm_omni.model_executor.models.ming_tts.fm.dit import (
-    Aggregator,
     CondEmbedder,
     DiT,
     SinusPositionEmbedding,
     TimestepEmbedder,
 )
-from vllm_omni.model_executor.models.ming_tts.fm.flowloss import FlowLoss
 from vllm_omni.model_executor.models.ming_tts.fm.modules import Attention, DiTBlock, RMSNorm
 from vllm_omni.model_executor.models.ming_tts.ming_tts import (
     _coerce_prompt_latents,
diff --git a/tests/worker/test_ming_tts_runner.py b/tests/worker/test_ming_tts_runner.py
index 89deda2ddb1..b964bf7659b 100644
--- a/tests/worker/test_ming_tts_runner.py
+++ b/tests/worker/test_ming_tts_runner.py
@@ -501,7 +501,9 @@ def _low_stop(_hidden_states):
         seq_token_counts=[1],
     )
 
-    assert output.multimodal_outputs[MING_STOP_REASON_KEY] == (MING_STOP_REASON_CONTINUE,)
+    stop_reason_codes = output.multimodal_outputs[MING_STOP_REASON_KEY]
+    assert isinstance(stop_reason_codes, torch.Tensor)
+    assert int(stop_reason_codes.reshape(-1)[0].item()) == 0
     pending = runner.llm.pop_postprocess_update("req-stop-reason")
     assert pending[MING_STOP_REASON_KEY] == MING_STOP_REASON_CONTINUE
 
diff --git a/vllm_omni/model_executor/models/ming_tts/aggregator.py b/vllm_omni/model_executor/models/ming_tts/aggregator.py
new file mode 100644
index 00000000000..3af25b516aa
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_tts/aggregator.py
@@ -0,0 +1,115 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
+import torch
+import torch.nn as nn
+from x_transformers.x_transformers import RotaryEmbedding
+
+from .config_ming_tts import MingTTSConfig
+from .fm.modules import DiTBlock, FinalLayer
+
+
+class Aggregator(nn.Module):
+    """Summarize a sequence of latent frames through a learned CLS token.
+
+    Frames are linearly embedded, a learned CLS embedding is prepended, the
+    sequence runs through ``depth`` ``DiTBlock`` layers with rotary position
+    embeddings, and only the CLS position of the ``FinalLayer`` output is kept.
+    """
+
+    def __init__(
+        self,
+        in_channels=4,
+        hidden_size=1152,
+        depth=28,
+        num_heads=16,
+        mlp_ratio=4.0,
+        llm_input_dim=896,
+        **kwargs,
+    ):
+        """Build the embedding layers, transformer blocks, and output projection.
+
+        Args:
+            in_channels: Feature size of each input frame (last dim of ``x``).
+            hidden_size: Width of the embeddings fed to the ``DiTBlock`` stack.
+            depth: Number of ``DiTBlock`` layers.
+            num_heads: Head count passed to each block; the rotary embedding
+                is sized ``hidden_size // num_heads``.
+            mlp_ratio: Forwarded to each ``DiTBlock`` as ``mlp_ratio``.
+            llm_input_dim: Second argument of ``FinalLayer``; presumably the
+                output feature size -- confirm against ``fm.modules``.
+            **kwargs: Extra keyword arguments forwarded to every ``DiTBlock``.
+        """
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = in_channels
+        self.num_heads = num_heads
+
+        # Single-entry table: index 0 is looked up in forward() as the CLS embedding.
+        self.word_embedder = nn.Embedding(1, hidden_size)
+        self.x_embedder = nn.Linear(in_channels, hidden_size)
+        self.hidden_size = hidden_size
+        self.rotary_embed = RotaryEmbedding(hidden_size // num_heads)
+        self.blocks = nn.ModuleList(
+            [DiTBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio, **kwargs) for _ in range(depth)]
+        )
+        self.final_layer = FinalLayer(hidden_size, llm_input_dim)
+
+    def forward(self, x, mask=None):
+        """Run the aggregator and return the CLS position of the output.
+
+        Args:
+            x: Rank-3 tensor ``[Batch, Time, in_channels]``.
+            mask: Optional rank-2 tensor ``[Batch, Time]`` passed to each
+                block after being extended by one column for the CLS token.
+
+        Returns:
+            The first sequence position of ``final_layer``'s output, kept as a
+            length-1 sequence dimension (``x[:, :1, :]``).
+
+        Raises:
+            ValueError: If ``x`` is not rank-3, its last dim differs from
+                ``in_channels``, or ``mask`` has the wrong rank or shape.
+        """
+        if x.ndim != 3:
+            raise ValueError(f"Expected x rank-3 [Batch, Time, Dimension], got {tuple(x.shape)}")
+        if x.shape[-1] != self.in_channels:
+            raise ValueError(f"x feature dim mismatch: got {x.shape[-1]}, expected {self.in_channels}")
+
+        # [Batch, Time, Dimension] -> [Batch, Time, Hidden].
+        x = self.x_embedder(x)
+        cls_embed = self.word_embedder(torch.zeros((x.shape[0], 1), dtype=torch.long, device=x.device))
+        # Prepend a learned CLS token: [Batch, Time, Hidden] -> [Batch, Time + 1, Hidden].
+        x = torch.cat([cls_embed, x], dim=1)
+
+        rope = self.rotary_embed.forward_from_seq_len(x.shape[1])
+        if mask is not None:
+            if mask.ndim != 2:
+                raise ValueError(f"Expected mask rank-2 [Batch, Time], got {tuple(mask.shape)}")
+            if mask.shape[0] != x.shape[0] or mask.shape[1] != x.shape[1] - 1:
+                raise ValueError(
+                    f"Mask shape mismatch: got {tuple(mask.shape)}, expected {(x.shape[0], x.shape[1] - 1)}"
+                )
+            # The CLS column reuses each row's mask value for the first frame.
+            mask_pad = mask.clone().detach()[:, :1]
+            mask = torch.cat([mask_pad, mask], dim=-1)
+        for block in self.blocks:
+            x = block(x, mask, rope)
+        x = self.final_layer(x)
+        # Keep the CLS projection only: [Batch, Time + 1, Hidden] -> [Batch, 1, Hidden].
+        return x[:, :1, :]
+
+
+def build_ming_aggregator(cfg: MingTTSConfig) -> Aggregator:
+    """Build the Ming Stage-1 latent patch projector from the parsed config."""
+    # NOTE(review): an "in_channels" or "llm_input_dim" key inside
+    # cfg.aggregator_config would raise TypeError (duplicate keyword) here.
+    return Aggregator(
+        in_channels=cfg.latent_dim,
+        llm_input_dim=cfg.llm_hidden_size,
+        **cfg.aggregator_config,
+    )
+
+
+__all__ = ["Aggregator", "build_ming_aggregator"]
diff --git a/vllm_omni/model_executor/models/ming_tts/backbone.py b/vllm_omni/model_executor/models/ming_tts/backbone.py
new file mode 100644
index 00000000000..76a3033fe66
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_tts/backbone.py
@@ -0,0 +1,88 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
+from typing import Any
+
+import torch
+import torch.nn as nn
+from vllm.config import VllmConfig
+from vllm.model_executor.models.qwen2 import Qwen2Model
+from vllm.model_executor.models.utils import maybe_prefix
+from vllm.sequence import IntermediateTensors
+
+
+class MingQwen2Backbone(nn.Module):
+    """Thin Ming wrapper around upstream vLLM Qwen2Model."""
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        """Create the wrapped ``Qwen2Model``, prefixed via ``maybe_prefix(prefix, "model")``."""
+        super().__init__()
+        self.model = Qwen2Model(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model"))
+
+    def get_input_embeddings(self) -> nn.Module:
+        """Return the token-embedding module of the wrapped model.
+
+        Looks for ``embed_tokens`` on ``self.model`` first, then on
+        ``self.model.model``.
+
+        Raises:
+            AttributeError: If neither location has ``embed_tokens``.
+        """
+        if hasattr(self.model, "embed_tokens"):
+            return self.model.embed_tokens
+        if hasattr(self.model, "model") and hasattr(self.model.model, "embed_tokens"):
+            return self.model.model.embed_tokens
+        raise AttributeError("Could not locate token embeddings on Ming Qwen2 backbone.")
+
+    def embed_input_ids(
+        self,
+        input_ids: torch.Tensor,
+        inputs_embeds: torch.Tensor | None = None,
+        **_: Any,
+    ) -> torch.Tensor:
+        """Return embeddings for ``input_ids``, or ``inputs_embeds`` unchanged if given.
+
+        Prefers the wrapped model's own ``embed_input_ids`` when it has one and
+        otherwise calls the embedding module directly. Extra keyword arguments
+        are accepted and ignored.
+        """
+        if inputs_embeds is not None:
+            return inputs_embeds
+        if hasattr(self.model, "embed_input_ids"):
+            return self.model.embed_input_ids(input_ids)
+        return self.get_input_embeddings()(input_ids)
+
+    def forward(
+        self,
+        *,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        """Run the wrapped model, embedding ``input_ids`` first when needed.
+
+        All arguments are keyword-only. Both ``input_ids`` and the resolved
+        ``inputs_embeds`` are forwarded to ``self.model``.
+        """
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_input_ids(input_ids)
+        return self.model(
+            input_ids=input_ids,
+            positions=positions,
+            intermediate_tensors=intermediate_tensors,
+            inputs_embeds=inputs_embeds,
+        )
+
+    def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor | None:
+        """Delegate to ``self.model.compute_logits``."""
+        # NOTE(review): ``self.model`` is a Qwen2Model; confirm it (and not only
+        # the causal-LM wrapper) actually provides ``compute_logits``.
+        return self.model.compute_logits(hidden_states)
+
+    def sample(self, logits: torch.Tensor, sampling_metadata: Any):
+        """Delegate to ``self.model.sample``."""
+        # NOTE(review): confirm Qwen2Model provides ``sample``; otherwise this
+        # raises AttributeError when called.
+        return self.model.sample(logits, sampling_metadata)
diff --git a/vllm_omni/model_executor/models/ming_tts/config_ming_tts.py b/vllm_omni/model_executor/models/ming_tts/config_ming_tts.py
index 09ae85be69a..0261b39b5c6 100644
--- a/vllm_omni/model_executor/models/ming_tts/config_ming_tts.py
+++ b/vllm_omni/model_executor/models/ming_tts/config_ming_tts.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# config_ming_tts.py
 from __future__ import annotations
 
 from dataclasses import dataclass, field
@@ -9,97 +8,71 @@
 from transformers import PretrainedConfig, Qwen2Config
 
 from .audio_tokenizer.configuration_audio_vae import AudioVAEconfig
-
-# ---------------------------------------------------------------------------
-# Token IDs (confirmed from tokenizer_config.json)
-# ---------------------------------------------------------------------------
-
-AUDIO_DUMMY_TOKEN_ID: int = 151705  # <audioPatch>
-AUDIO_START_TOKEN_ID: int = 151706  # <audio>
-AUDIO_END_TOKEN_ID: int = 151707  # </audio>
-AUDIO_EOS_TOKEN_ID: int = 151704  # <end_of_audio>
-VISION_START_TOKEN_ID: int = 151652  # <|vision_start|>
-
-TEXT_EOS_TOKEN_ID: int = 151669  # <text_eos>
-PAD_TOKEN_ID: int = 151643  # <|endoftext|>
-
-# Backward-compat alias for older code paths
-EOS_TOKEN_ID: int = TEXT_EOS_TOKEN_ID
-
-
-# ---------------------------------------------------------------------------
-# Architectural constants (confirmed from original config.json)
-# ---------------------------------------------------------------------------
-
-LATENT_DIM: int = 64
-PATCH_SIZE: int = 4
-HISTORY_PATCH_SIZE: int = 32
-LLM_HIDDEN_SIZE: int = 896
-LLM_VOCAB_SIZE: int = 151936
-AGGREGATOR_HIDDEN_SIZE: int = 1024
-VAE_PATCH_SIZE: int = 4
-SAMPLE_RATE: int = 44100
-
-# AudioVAE frame/hop geometry (confirmed)
-AUDIO_FRAME_HOP: int = 882  # enc input_dim / hop_size / dec output_dim
-
-# stop_head defaults
-STOP_HEAD_MIN_STEPS: int = 3
-STOP_HEAD_THRESHOLD: float = 0.5
-
-# FlowLoss sampling defaults
-DEFAULT_CFG: float = 2.0
-DEFAULT_SIGMA: float = 0.25
-DEFAULT_TEMPERATURE: float = 0.0
-
-# Connector / Stage-2 streaming defaults (runtime tuning)
-LATENT_CHUNK_SIZE: int = 25
-LATENT_LEFT_CONTEXT: int = 0
-MAX_DECODE_STEPS: int = 200
-
-# seq_data.extra_data keys
-KEY_LATENT_HISTORY: str = "ming_latent_history"
-KEY_DECODE_STEP: str = "ming_decode_step"
-KEY_LAST_STOP_PROB: str = "ming_last_stop_prob"
-KEY_NEXT_EMBEDS: str = "ming_next_embeds"
-KEY_PROMPT_LATENTS: str = "ming_prompt_latents"
-KEY_PROMPT_LATENT_TAIL: str = "ming_prompt_latent_tail"
-KEY_SPEAKER_EMBEDDING: str = "ming_speaker_embedding"
-KEY_REQUEST_ID: str = "ming_request_id"
-KEY_CHUNK_ID: str = "ming_chunk_id"
-KEY_CFG: str = "ming_cfg"
-KEY_SIGMA: str = "ming_sigma"
-KEY_TEMPERATURE: str = "ming_temperature"
-KEY_MAX_DECODE_STEPS: str = "ming_max_decode_steps"
-KEY_MIN_DECODE_STEPS: str = "ming_min_decode_steps"
-KEY_TEXT_MODE: str = "ming_text_mode"
+from .constants import (
+    AGGREGATOR_HIDDEN_SIZE,
+    AUDIO_DUMMY_TOKEN_ID,
+    AUDIO_END_TOKEN_ID,
+    AUDIO_EOS_TOKEN_ID,
+    AUDIO_FRAME_HOP,
+    AUDIO_START_TOKEN_ID,
+    DEFAULT_CFG,
+    DEFAULT_SIGMA,
+    DEFAULT_TEMPERATURE,
+    EOS_TOKEN_ID,
+    HISTORY_PATCH_SIZE,
+    KEY_CFG,
+    KEY_CHUNK_ID,
+    KEY_DECODE_STEP,
+    KEY_LAST_STOP_PROB,
+    KEY_LATENT_HISTORY,
+    KEY_MAX_DECODE_STEPS,
+    KEY_MIN_DECODE_STEPS,
+    KEY_NEXT_EMBEDS,
+    KEY_PROMPT_LATENT_TAIL,
+    KEY_PROMPT_LATENTS,
+    KEY_REQUEST_ID,
+    KEY_SIGMA,
+    KEY_SPEAKER_EMBEDDING,
+    KEY_TEMPERATURE,
+    KEY_TEXT_MODE,
+    LATENT_CHUNK_SIZE,
+    LATENT_DIM,
+    LATENT_LEFT_CONTEXT,
+    LLM_HIDDEN_SIZE,
+    LLM_VOCAB_SIZE,
+    MAX_DECODE_STEPS,
+    PAD_TOKEN_ID,
+    PATCH_SIZE,
+    SAMPLE_RATE,
+    STOP_HEAD_MIN_STEPS,
+    STOP_HEAD_THRESHOLD,
+    TEXT_EOS_TOKEN_ID,
+    VAE_PATCH_SIZE,
+    VISION_START_TOKEN_ID,
+)
+from .validation import _coerce_audio_vae_config, _nested_get, _to_plain_dict, validate_ming_tts_config
 
 
 @dataclass
 class MingTTSConfig:
     """Flat config object shared by Stage-1 and Stage-2. Build via from_hf_config()."""
 
-    # --- LLM backbone ---
     llm_hidden_size: int = LLM_HIDDEN_SIZE
     llm_vocab_size: int = LLM_VOCAB_SIZE
     llm_config: dict[str, Any] = field(default_factory=dict)
 
-    # --- Audio latent space ---
     latent_dim: int = LATENT_DIM
     patch_size: int = PATCH_SIZE
     history_patch_size: int = HISTORY_PATCH_SIZE
 
-    # --- Flow / Aggregator sub-configs ---
     ditar_config: dict[str, Any] = field(default_factory=dict)
     aggregator_config: dict[str, Any] = field(default_factory=dict)
 
-    # --- AudioVAE ---
     audio_tokenizer_config: AudioVAEconfig | None = None
     vae_patch_size: int = VAE_PATCH_SIZE
     sample_rate: int = SAMPLE_RATE
     audio_frame_hop: int = AUDIO_FRAME_HOP
 
-    # --- Generation control ---
     cfg: float = DEFAULT_CFG
     sigma: float = DEFAULT_SIGMA
     temperature: float = DEFAULT_TEMPERATURE
@@ -107,13 +80,11 @@ class MingTTSConfig:
     stop_head_threshold: float = STOP_HEAD_THRESHOLD
     max_decode_steps: int = MAX_DECODE_STEPS
 
-    # --- Stage-2 chunking (runtime tuning) ---
     latent_chunk_size: int = LATENT_CHUNK_SIZE
     latent_left_context: int = LATENT_LEFT_CONTEXT
 
-    # --- Token IDs ---
     text_eos_token_id: int = TEXT_EOS_TOKEN_ID
-    eos_token_id: int = TEXT_EOS_TOKEN_ID  # compat alias
+    eos_token_id: int = EOS_TOKEN_ID
     pad_token_id: int = PAD_TOKEN_ID
     audio_dummy_token_id: int = AUDIO_DUMMY_TOKEN_ID
     audio_start_token_id: int = AUDIO_START_TOKEN_ID
@@ -122,11 +93,6 @@ class MingTTSConfig:
 
     @classmethod
     def from_hf_config(cls, hf_config: PretrainedConfig) -> MingTTSConfig:
-        """
-        Build from vllm-omni's hf_config. Supports nested configs as objects or dicts.
-        """
-
-        # --- Read nested sub-configs (must NOT read flat hf_config attrs for these) ---
         llm_raw = getattr(hf_config, "llm_config", {}) or {}
         ditar_raw = getattr(hf_config, "ditar_config", {}) or {}
         agg_raw = getattr(hf_config, "aggregator_config", {}) or {}
@@ -135,13 +101,9 @@ def from_hf_config(cls, hf_config: PretrainedConfig) -> MingTTSConfig:
         llm_dict = _to_plain_dict(llm_raw)
         ditar = _to_plain_dict(ditar_raw)
         agg = _to_plain_dict(agg_raw)
-
-        # Keep Ming DiT backend explicit; original checkpoint uses "torch"
         ditar.setdefault("attn_backend", "torch")
 
         atc = _coerce_audio_vae_config(atc_raw)
-
-        # --- Pull nested values safely ---
         atc_enc_latent_dim = _nested_get(atc, "enc_kwargs", "latent_dim", default=LATENT_DIM)
         atc_patch_size = _nested_get(atc, "patch_size", default=VAE_PATCH_SIZE)
         atc_sample_rate = _nested_get(atc, "sample_rate", default=SAMPLE_RATE)
@@ -164,122 +126,13 @@ def from_hf_config(cls, hf_config: PretrainedConfig) -> MingTTSConfig:
             sample_rate=atc_sample_rate,
             audio_frame_hop=enc_hop_size if enc_hop_size is not None else AUDIO_FRAME_HOP,
         )
-
-        # Optional debug cache (safe to keep)
         cfg._enc_input_dim = enc_input_dim
         cfg._enc_hop_size = enc_hop_size
         cfg._dec_output_dim = dec_output_dim
-
         return cfg
 
     def validate(self) -> None:
-        """Run before GPU allocation/weight loading. Raises ValueError on mismatches."""
-
-        # --- Token IDs ---
-        if self.audio_dummy_token_id != 151705:
-            raise ValueError(
-                f"audio_dummy_token_id={self.audio_dummy_token_id}, expected 151705 (<audioPatch>). "
-                "Wrong tokenizer/checkpoint?"
-            )
-        if self.audio_eos_token_id != 151704:
-            raise ValueError(
-                f"audio_eos_token_id={self.audio_eos_token_id}, expected 151704 (<end_of_audio>). "
-                "Wrong tokenizer/checkpoint?"
-            )
-        if self.text_eos_token_id != 151669:
-            raise ValueError(
-                f"text_eos_token_id={self.text_eos_token_id}, expected 151669 (<text_eos>). Wrong tokenizer/checkpoint?"
-            )
-
-        # --- Required sub-config ---
-        if self.audio_tokenizer_config is None:
-            raise ValueError("audio_tokenizer_config is None. Nested AudioVAE config was not deserialized correctly.")
-
-        # --- Confirmed checkpoint-family constants ---
-        if self.latent_dim != LATENT_DIM:
-            raise ValueError(
-                f"latent_dim mismatch: got {self.latent_dim}, expected {LATENT_DIM}. "
-                "Check audio_tokenizer_config.enc_kwargs.latent_dim."
-            )
-        if self.patch_size != PATCH_SIZE:
-            raise ValueError(
-                f"patch_size mismatch: got {self.patch_size}, expected {PATCH_SIZE}. Check ditar_config.patch_size."
-            )
-        if self.history_patch_size != HISTORY_PATCH_SIZE:
-            raise ValueError(
-                f"history_patch_size mismatch: got {self.history_patch_size}, expected {HISTORY_PATCH_SIZE}. "
-                "Check ditar_config.history_patch_size."
-            )
-        if self.llm_hidden_size != LLM_HIDDEN_SIZE:
-            raise ValueError(
-                f"llm_hidden_size mismatch: got {self.llm_hidden_size}, expected {LLM_HIDDEN_SIZE}. "
-                "Check llm_config.hidden_size."
-            )
-        if self.llm_vocab_size != LLM_VOCAB_SIZE:
-            raise ValueError(f"llm_vocab_size mismatch: got {self.llm_vocab_size}, expected {LLM_VOCAB_SIZE}.")
-        if self.sample_rate != SAMPLE_RATE:
-            raise ValueError(f"sample_rate mismatch: got {self.sample_rate}, expected {SAMPLE_RATE}.")
-
-        # --- Cross-config consistency checks ---
-        if self.vae_patch_size != self.patch_size:
-            raise ValueError(f"VAE patch size ({self.vae_patch_size}) != flow/DiT patch size ({self.patch_size}).")
-
-        llm_hidden_from_cfg = self.llm_config.get("hidden_size")
-        if llm_hidden_from_cfg is not None and llm_hidden_from_cfg != self.llm_hidden_size:
-            raise ValueError(
-                f"llm_hidden_size ({self.llm_hidden_size}) != llm_config.hidden_size ({llm_hidden_from_cfg})."
-            )
-
-        agg_h = self.aggregator_config.get("hidden_size")
-        dit_h = self.ditar_config.get("hidden_size")
-        if agg_h is not None and dit_h is not None and agg_h != dit_h:
-            raise ValueError(f"aggregator_config.hidden_size ({agg_h}) != ditar_config.hidden_size ({dit_h}).")
-        if agg_h is not None and agg_h != AGGREGATOR_HIDDEN_SIZE:
-            raise ValueError(f"aggregator hidden_size mismatch: got {agg_h}, expected {AGGREGATOR_HIDDEN_SIZE}.")
-        if dit_h is not None and dit_h != AGGREGATOR_HIDDEN_SIZE:
-            raise ValueError(f"ditar hidden_size mismatch: got {dit_h}, expected {AGGREGATOR_HIDDEN_SIZE}.")
-
-        atc = self.audio_tokenizer_config
-        enc_latent = _nested_get(atc, "enc_kwargs", "latent_dim", default=None)
-        dec_latent = _nested_get(atc, "dec_kwargs", "latent_dim", default=None)
-        if enc_latent is not None and enc_latent != self.latent_dim:
-            raise ValueError(f"audio enc latent_dim ({enc_latent}) != Ming latent_dim ({self.latent_dim}).")
-        if dec_latent is not None and dec_latent != self.latent_dim:
-            raise ValueError(f"audio dec latent_dim ({dec_latent}) != Ming latent_dim ({self.latent_dim}).")
-
-        atc_patch = _nested_get(atc, "patch_size", default=None)
-        if atc_patch is not None and atc_patch != self.vae_patch_size:
-            raise ValueError(
-                f"audio_tokenizer_config.patch_size ({atc_patch}) != vae_patch_size ({self.vae_patch_size})."
-            )
-
-        atc_sr = _nested_get(atc, "sample_rate", default=None)
-        if atc_sr is not None and atc_sr != self.sample_rate:
-            raise ValueError(f"audio_tokenizer_config.sample_rate ({atc_sr}) != sample_rate ({self.sample_rate}).")
-
-        enc_input_dim = _nested_get(atc, "enc_kwargs", "input_dim", default=None)
-        enc_hop_size = _nested_get(atc, "enc_kwargs", "hop_size", default=None)
-        dec_output_dim = _nested_get(atc, "dec_kwargs", "output_dim", default=None)
-
-        if enc_input_dim is not None and enc_hop_size is not None and enc_input_dim != enc_hop_size:
-            raise ValueError(f"AudioVAE encoder input_dim ({enc_input_dim}) != hop_size ({enc_hop_size}).")
-        if enc_hop_size is not None and dec_output_dim is not None and enc_hop_size != dec_output_dim:
-            raise ValueError(
-                f"AudioVAE encoder hop_size ({enc_hop_size}) != decoder output_dim ({dec_output_dim}). "
-                "Expected 882 in this checkpoint family."
-            )
-
-        # Runtime tuning sanity
-        if self.latent_chunk_size <= 0:
-            raise ValueError(f"latent_chunk_size must be > 0, got {self.latent_chunk_size}.")
-        if self.latent_left_context < 0:
-            raise ValueError(f"latent_left_context must be >= 0, got {self.latent_left_context}.")
-        if self.max_decode_steps <= 0:
-            raise ValueError(f"max_decode_steps must be > 0, got {self.max_decode_steps}.")
-        if not (0.0 <= self.stop_head_threshold <= 1.0):
-            raise ValueError(f"stop_head_threshold must be in [0,1], got {self.stop_head_threshold}.")
-        if self.stop_head_min_steps < 0:
-            raise ValueError(f"stop_head_min_steps must be >= 0, got {self.stop_head_min_steps}.")
+        validate_ming_tts_config(self)
 
     def make_qwen2_config(self) -> Qwen2Config:
         """Reconstruct Qwen2Config for Stage-1 LLM backbone init."""
@@ -297,68 +150,49 @@ def chunk_frames(self) -> int:
 
     @property
     def approx_chunk_seconds(self) -> float:
-        # One latent frame ~ one 882-sample hop in this checkpoint family.
         return (self.chunk_frames * self.audio_frame_hop) / float(self.sample_rate)
 
 
-# ---------------------------------------------------------------------------
-# Helpers
-# ---------------------------------------------------------------------------
-
-
-def _to_plain_dict(obj: Any) -> dict[str, Any]:
-    """Normalize nested config objects into plain dicts when possible."""
-    if obj is None:
-        return {}
-    if isinstance(obj, dict):
-        return dict(obj)
-    if isinstance(obj, PretrainedConfig):
-        return obj.to_dict()
-    if hasattr(obj, "to_dict") and callable(obj.to_dict):
-        try:
-            return dict(obj.to_dict())
-        except Exception:
-            pass
-    try:
-        return dict(vars(obj))
-    except Exception:
-        return {}
-
-
-def _coerce_audio_vae_config(atc_raw: Any) -> AudioVAEconfig | None:
-    """
-    Normalize audio_tokenizer_config into AudioVAEconfig when possible.
-    Handles:
-      - already AudioVAEconfig
-      - dict
-      - PretrainedConfig-like object
-    """
-    if atc_raw is None:
-        return None
-    atc_dict = _to_plain_dict(atc_raw)
-    if not atc_dict:
-        # Return raw object as fallback; _nested_get/validate can still work
-        return atc_raw  # type: ignore[return-value]
-
-    if hasattr(AudioVAEconfig, "from_dict") and callable(getattr(AudioVAEconfig, "from_dict")):
-        try:
-            return AudioVAEconfig.from_dict(atc_dict)  # type: ignore[misc]
-        except Exception:
-            pass
-    try:
-        return AudioVAEconfig(**atc_dict)  # type: ignore[arg-type]
-    except Exception:
-        return atc_raw  # type: ignore[return-value]
-
-
-def _nested_get(obj: Any, *keys: str, default: Any = None) -> Any:
-    """Safe nested attribute/key access for dicts and config-like objects."""
-    cur = obj
-    for k in keys:
-        if cur is None:
-            return default
-        if isinstance(cur, dict):
-            cur = cur.get(k)
-        else:
-            cur = getattr(cur, k, None)
-    return cur if cur is not None else default
+__all__ = [
+    "AGGREGATOR_HIDDEN_SIZE",
+    "AUDIO_DUMMY_TOKEN_ID",
+    "AUDIO_END_TOKEN_ID",
+    "AUDIO_EOS_TOKEN_ID",
+    "AUDIO_FRAME_HOP",
+    "AUDIO_START_TOKEN_ID",
+    "DEFAULT_CFG",
+    "DEFAULT_SIGMA",
+    "DEFAULT_TEMPERATURE",
+    "EOS_TOKEN_ID",
+    "HISTORY_PATCH_SIZE",
+    "KEY_CFG",
+    "KEY_CHUNK_ID",
+    "KEY_DECODE_STEP",
+    "KEY_LAST_STOP_PROB",
+    "KEY_LATENT_HISTORY",
+    "KEY_MAX_DECODE_STEPS",
+    "KEY_MIN_DECODE_STEPS",
+    "KEY_NEXT_EMBEDS",
+    "KEY_PROMPT_LATENT_TAIL",
+    "KEY_PROMPT_LATENTS",
+    "KEY_REQUEST_ID",
+    "KEY_SIGMA",
+    "KEY_SPEAKER_EMBEDDING",
+    "KEY_TEMPERATURE",
+    "KEY_TEXT_MODE",
+    "LATENT_CHUNK_SIZE",
+    "LATENT_DIM",
+    "LATENT_LEFT_CONTEXT",
+    "LLM_HIDDEN_SIZE",
+    "LLM_VOCAB_SIZE",
+    "MAX_DECODE_STEPS",
+    "MingTTSConfig",
+    "PAD_TOKEN_ID",
+    "PATCH_SIZE",
+    "SAMPLE_RATE",
+    "STOP_HEAD_MIN_STEPS",
+    "STOP_HEAD_THRESHOLD",
+    "TEXT_EOS_TOKEN_ID",
+    "VAE_PATCH_SIZE",
+    "VISION_START_TOKEN_ID",
+]
diff --git a/vllm_omni/model_executor/models/ming_tts/constants.py b/vllm_omni/model_executor/models/ming_tts/constants.py
new file mode 100644
index 00000000000..b7e0b9bb78a
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_tts/constants.py
@@ -0,0 +1,71 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
+# ---------------------------------------------------------------------------
+# Token IDs (confirmed from tokenizer_config.json)
+# ---------------------------------------------------------------------------
+
+AUDIO_DUMMY_TOKEN_ID = 151705  # <audioPatch>
+AUDIO_START_TOKEN_ID = 151706  # <audio>
+AUDIO_END_TOKEN_ID = 151707  # </audio>
+AUDIO_EOS_TOKEN_ID = 151704  # <end_of_audio>
+VISION_START_TOKEN_ID = 151652  # <|vision_start|>
+
+TEXT_EOS_TOKEN_ID = 151669  # <text_eos>
+PAD_TOKEN_ID = 151643  # <|endoftext|>
+
+# Backward-compat alias for older code paths
+EOS_TOKEN_ID = TEXT_EOS_TOKEN_ID
+
+
+# ---------------------------------------------------------------------------
+# Architectural constants (confirmed from original config.json)
+# ---------------------------------------------------------------------------
+
+LATENT_DIM = 64
+PATCH_SIZE = 4
+HISTORY_PATCH_SIZE = 32
+LLM_HIDDEN_SIZE = 896
+LLM_VOCAB_SIZE = 151936
+AGGREGATOR_HIDDEN_SIZE = 1024
+VAE_PATCH_SIZE = 4
+SAMPLE_RATE = 44100
+
+# AudioVAE frame/hop geometry (confirmed)
+AUDIO_FRAME_HOP = 882  # enc input_dim / hop_size / dec output_dim
+
+# stop_head defaults
+STOP_HEAD_MIN_STEPS = 3
+STOP_HEAD_THRESHOLD = 0.5
+
+# FlowLoss sampling defaults
+DEFAULT_CFG = 2.0
+DEFAULT_SIGMA = 0.25
+DEFAULT_TEMPERATURE = 0.0
+
+# Connector / Stage-2 streaming defaults (runtime tuning)
+LATENT_CHUNK_SIZE = 25
+LATENT_LEFT_CONTEXT = 0
+MAX_DECODE_STEPS = 200
+
+
+# ---------------------------------------------------------------------------
+# seq_data.extra_data keys
+# ---------------------------------------------------------------------------
+
+KEY_LATENT_HISTORY = "ming_latent_history"
+KEY_DECODE_STEP = "ming_decode_step"
+KEY_LAST_STOP_PROB = "ming_last_stop_prob"
+KEY_NEXT_EMBEDS = "ming_next_embeds"
+KEY_PROMPT_LATENTS = "ming_prompt_latents"
+KEY_PROMPT_LATENT_TAIL = "ming_prompt_latent_tail"
+KEY_SPEAKER_EMBEDDING = "ming_speaker_embedding"
+KEY_REQUEST_ID = "ming_request_id"
+KEY_CHUNK_ID = "ming_chunk_id"
+KEY_CFG = "ming_cfg"
+KEY_SIGMA = "ming_sigma"
+KEY_TEMPERATURE = "ming_temperature"
+KEY_MAX_DECODE_STEPS = "ming_max_decode_steps"
+KEY_MIN_DECODE_STEPS = "ming_min_decode_steps"
+KEY_TEXT_MODE = "ming_text_mode"
diff --git a/vllm_omni/model_executor/models/ming_tts/fm/flowloss.py b/vllm_omni/model_executor/models/ming_tts/flowloss_head.py
similarity index 92%
rename from vllm_omni/model_executor/models/ming_tts/fm/flowloss.py
rename to vllm_omni/model_executor/models/ming_tts/flowloss_head.py
index 18c59186c3a..f2c5cc0753b 100644
--- a/vllm_omni/model_executor/models/ming_tts/fm/flowloss.py
+++ b/vllm_omni/model_executor/models/ming_tts/flowloss_head.py
@@ -5,8 +5,8 @@
 import torch
 import torch.nn as nn
 
-from .cfm import CFM
-from .dit import DiT
+from .fm.cfm import CFM
+from .fm.dit import DiT
 
 
 class FlowLoss(nn.Module):
@@ -40,7 +40,7 @@ def sample(self, z, latent_history, cfg=2.0, patch_size=1, sigma=0.25, temperatu
         noise = torch.randn(z.shape[0], self.z_channels, patch_size, device=z.device)
         if not torch.isfinite(noise).all():
             raise RuntimeError("Non-finite noise in FlowLoss.sample().")
-        noise = noise.to(dtype=z.dtype)  # match conditioning dtype — no autocast in vllm-omni
+        noise = noise.to(dtype=z.dtype)
         out, _ = self.cfm.sample(
             noise=noise,
             c=z,
@@ -50,5 +50,4 @@ def sample(self, z, latent_history, cfg=2.0, patch_size=1, sigma=0.25, temperatu
             sigma=sigma,
             temperature=temperature,
         )
-        # out shape: [B, patch_size, z_channels]
         return out
diff --git a/vllm_omni/model_executor/models/ming_tts/fm/dit.py b/vllm_omni/model_executor/models/ming_tts/fm/dit.py
index 2024f26ca2d..39cc5693507 100644
--- a/vllm_omni/model_executor/models/ming_tts/fm/dit.py
+++ b/vllm_omni/model_executor/models/ming_tts/fm/dit.py
@@ -161,56 +161,3 @@ def forward_with_cfg(self, x, t, c, cfg_scale, latent_history, patch_size):
             t = t.repeat(x.shape[0])
         model_out = self.forward(x, t, c, latent_history)
         return model_out[:, -patch_size:, :]
-
-
-class Aggregator(nn.Module):
-    def __init__(
-        self,
-        in_channels=4,
-        hidden_size=1152,
-        depth=28,
-        num_heads=16,
-        mlp_ratio=4.0,
-        llm_input_dim=896,
-        **kwargs,
-    ):
-        super().__init__()
-        self.in_channels = in_channels
-        self.out_channels = in_channels
-        self.num_heads = num_heads
-
-        self.word_embedder = nn.Embedding(1, hidden_size)
-        self.x_embedder = nn.Linear(in_channels, hidden_size)
-        self.hidden_size = hidden_size
-
-        self.rotary_embed = RotaryEmbedding(hidden_size // num_heads)
-
-        self.blocks = nn.ModuleList(
-            [DiTBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio, **kwargs) for _ in range(depth)]
-        )
-        self.final_layer = FinalLayer(hidden_size, llm_input_dim)
-
-    def forward(self, x, mask=None):
-        if x.ndim != 3:
-            raise ValueError(f"Expected x rank-3 [Batch, Time, Dimension], got {tuple(x.shape)}")
-        if x.shape[-1] != self.in_channels:
-            raise ValueError(f"x feature dim mismatch: got {x.shape[-1]}, expected {self.in_channels}")
-        x = self.x_embedder(x)
-        cls_embed = self.word_embedder(torch.zeros((x.shape[0], 1), dtype=torch.long, device=x.device))
-        x = torch.cat([cls_embed, x], dim=1)
-
-        rope = self.rotary_embed.forward_from_seq_len(x.shape[1])
-        if mask is not None:
-            if mask.ndim != 2:
-                raise ValueError(f"Expected mask rank-2 [Batch, Time], got {tuple(mask.shape)}")
-            if mask.shape[0] != x.shape[0] or mask.shape[1] != x.shape[1] - 1:
-                raise ValueError(
-                    f"Mask shape mismatch: got {tuple(mask.shape)}, expected {(x.shape[0], x.shape[1] - 1)}"
-                )
-            mask_pad = mask.clone().detach()[:, :1]
-            mask = torch.cat([mask_pad, mask], dim=-1)
-        for block in self.blocks:
-            x = block(x, mask, rope)
-        x = self.final_layer(x)
-        x = x[:, :1, :]
-        return x
diff --git a/vllm_omni/model_executor/models/ming_tts/loader.py b/vllm_omni/model_executor/models/ming_tts/loader.py
new file mode 100644
index 00000000000..dc1f25fcb5a
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_tts/loader.py
@@ -0,0 +1,296 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
+import json
+from io import BytesIO
+from pathlib import Path
+from typing import Any
+
+import torch
+from safetensors import safe_open
+
+from vllm_omni.engine.stage_init_utils import _resolve_model_to_local_path
+
+from .audio_tokenizer.modeling_audio_vae import AudioVAE
+from .config_ming_tts import (
+    KEY_PROMPT_LATENTS,
+    VISION_START_TOKEN_ID,
+    MingTTSConfig,
+)
+from .prompt_builder import (
+    coerce_prompt_waveform,
+    count_prompt_latent_patches,
+    pad_prompt_waveform,
+)
+
+
+def load_weights(model_stage: str, model: Any, weights: list[tuple[str, torch.Tensor]]):
+    if model_stage == "llm":
+        allowed = ("model.", "linear_proj_audio.", "flowloss.", "stop_head.", "spk_head.")
+        llm_weights = [(k, v) for k, v in weights if k.startswith(allowed)]
+        if not llm_weights:
+            raise RuntimeError(
+                "Ming Stage-0 received no loadable checkpoint weights. "
+                "Expected prefixes: model.*, linear_proj_audio.*, flowloss.*, stop_head.*, spk_head.*"
+            )
+        loaded = model.load_weights(llm_weights)
+        return {f"model.{name}" for name in loaded}
+
+    audio_weights = [(k, v) for k, v in weights if k.startswith("audio.")]
+    if not audio_weights:
+        raise RuntimeError("Ming Stage-1 received no loadable checkpoint weights. Expected prefix: audio.*")
+    loaded = model.load_weights(audio_weights)
+    return {f"model.{name}" for name in loaded}
+
+
+def _resolve_prompt_latents(wrapper: Any, info_dict: dict[str, Any]) -> dict[str, torch.Tensor] | None:
+    raw_latents = info_dict.get(KEY_PROMPT_LATENTS, info_dict.get("prompt_latents"))
+    raw_waveform = info_dict.get("prompt_waveform", info_dict.get("prompt_waveforms"))
+    if raw_latents is not None and raw_waveform is not None:
+        raise ValueError(
+            "Ming waveform cloning request provided both raw prompt_waveform and explicit prompt_latents. "
+            "Choose exactly one source of truth."
+        )
+
+    direct_latents = _coerce_prompt_latents(
+        raw_latents,
+        patch_size=wrapper.ming_config.patch_size,
+        latent_dim=wrapper.ming_config.latent_dim,
+    )
+    if direct_latents is not None:
+        return direct_latents
+    if raw_waveform is None:
+        return None
+
+    encode_fn = getattr(wrapper, "_encode_prompt_waveform_to_latents", None)
+    if callable(encode_fn):
+        latents = encode_fn(raw_waveform, info_dict.get("prompt_waveform_length"))
+    else:
+        latents = _encode_prompt_waveform_to_latents(
+            wrapper,
+            raw_waveform,
+            info_dict.get("prompt_waveform_length"),
+        )
+    return _coerce_prompt_latents(
+        latents,
+        patch_size=wrapper.ming_config.patch_size,
+        latent_dim=wrapper.ming_config.latent_dim,
+    )
+
+
+def _load_prompt_encoder(wrapper: Any) -> AudioVAE:
+    if wrapper._prompt_encoder is not None:
+        return wrapper._prompt_encoder
+    if wrapper.ming_config.audio_tokenizer_config is None:
+        raise RuntimeError("Ming Stage-0 requires audio_tokenizer_config to encode prompt audio.")
+
+    encoder = AudioVAE(wrapper.ming_config.audio_tokenizer_config).eval()
+    state_dict = encoder.state_dict()
+    loaded = 0
+    loaded_encoder_params = set()
+    with torch.no_grad():
+        for shard_path in _iter_model_safetensors(
+            _resolve_model_to_local_path(str(wrapper.vllm_config.model_config.model))
+        ):
+            with safe_open(str(shard_path), framework="pt", device="cpu") as handle:
+                for key in handle.keys():
+                    if not key.startswith("audio.encoder."):
+                        continue
+                    name = key[len("audio.") :]
+                    if name not in state_dict:
+                        continue
+                    target = state_dict[name]
+                    target.copy_(handle.get_tensor(key).to(device=target.device, dtype=target.dtype))
+                    loaded += 1
+                    loaded_encoder_params.add(name)
+    if loaded == 0:
+        raise RuntimeError("Ming prompt encoder received no audio.encoder.* weights from checkpoint.")
+
+    expected_encoder_params = {f"encoder.{name}" for name, _ in encoder.encoder.named_parameters()}
+    missing = expected_encoder_params - loaded_encoder_params
+    if missing:
+        raise RuntimeError(f"Ming prompt encoder: {len(missing)} params not loaded. First few: {sorted(missing)[:5]}")
+
+    dev = next(wrapper.parameters()).device
+    try:
+        del encoder.decoder
+        encoder.decoder = None
+        if dev.type != "cpu":
+            encoder.encoder.to(dev, dtype=getattr(wrapper.model, "fm_dtype", torch.bfloat16))
+        else:
+            encoder.encoder.to(dev)
+    except Exception as exc:
+        raise RuntimeError(f"Failed to move Ming prompt encoder to {dev}: {exc}") from exc
+    wrapper._prompt_encoder = encoder
+    return encoder
+
+
+@torch.inference_mode()
+def _encode_prompt_waveform_to_latents(wrapper: Any, waveform: Any, waveform_length: Any = None) -> torch.Tensor:
+    encoder = _load_prompt_encoder(wrapper)
+    waveform = _normalize_prompt_waveform(waveform, target_sr=wrapper.ming_config.sample_rate)
+    waveform = pad_prompt_waveform(
+        waveform,
+        patch_size=wrapper.ming_config.patch_size,
+        sample_rate=wrapper.ming_config.sample_rate,
+        frame_hop=wrapper.ming_config.audio_frame_hop,
+    )
+    dev = next(encoder.encoder.parameters()).device
+    waveform = waveform.to(device=dev, dtype=next(encoder.encoder.parameters()).dtype)
+    if waveform_length is None:
+        waveform_length = torch.full((waveform.shape[0],), waveform.shape[-1], dtype=torch.int32, device=dev)
+    elif not isinstance(waveform_length, torch.Tensor):
+        waveform_length = torch.as_tensor(waveform_length, dtype=torch.int32, device=dev)
+    else:
+        waveform_length = waveform_length.to(device=dev, dtype=torch.int32)
+
+    latents, _ = encoder.encode_latent(waveform, waveform_length)
+    if latents.ndim == 3 and latents.shape[0] == 1:
+        latents = latents.squeeze(0)
+    count_prompt_latent_patches(
+        latents,
+        patch_size=wrapper.ming_config.patch_size,
+        latent_dim=wrapper.ming_config.latent_dim,
+    )
+    return latents.detach().to(dtype=torch.float32).contiguous()
+
+
+def _iter_model_safetensors(local_model_path: str) -> list[Path]:
+    model_root = Path(local_model_path)
+    index_path = model_root / "model.safetensors.index.json"
+    if index_path.exists():
+        with index_path.open("r", encoding="utf-8") as handle:
+            index_data = json.load(handle)
+        filenames = sorted(set(index_data.get("weight_map", {}).values()))
+        if not filenames:
+            raise RuntimeError(f"No checkpoint shards listed in {index_path}")
+        return [model_root / filename for filename in filenames]
+
+    single_file = model_root / "model.safetensors"
+    if single_file.exists():
+        return [single_file]
+
+    files = sorted(model_root.glob("*.safetensors"))
+    if not files:
+        raise RuntimeError(f"No .safetensors checkpoint found under {local_model_path}")
+    return files
+
+
+def _normalize_prompt_waveform(value: Any, *, target_sr: int) -> torch.Tensor:
+    if isinstance(value, bytes):
+        import torchaudio
+
+        waveform, sr = torchaudio.load(BytesIO(value))
+        waveform = waveform[:1].to(torch.float32)
+        if int(sr) != int(target_sr):
+            from torchaudio.functional import resample as resample_audio
+
+            waveform = resample_audio(waveform, int(sr), int(target_sr))
+        return waveform
+
+    if isinstance(value, tuple) and len(value) == 2 and isinstance(value[1], int):
+        waveform = coerce_prompt_waveform(value[0])
+        if int(value[1]) != int(target_sr):
+            from torchaudio.functional import resample as resample_audio
+
+            waveform = resample_audio(waveform, int(value[1]), int(target_sr))
+        return waveform
+
+    if isinstance(value, dict):
+        samples = value.get("samples", value.get("array", value.get("waveform")))
+        sr = value.get("sample_rate", value.get("sr", target_sr))
+        return _normalize_prompt_waveform((samples, int(sr)), target_sr=target_sr)
+
+    return coerce_prompt_waveform(value)
+
+
+def _coerce_prompt_latents(
+    value: Any,
+    *,
+    patch_size: int,
+    latent_dim: int,
+) -> dict[str, torch.Tensor] | None:
+    if value is None:
+        return None
+    if not isinstance(value, torch.Tensor):
+        value = torch.as_tensor(value)
+
+    latents = value.detach()
+    if latents.ndim == 3 and latents.shape[0] == 1:
+        latents = latents.squeeze(0)
+
+    if latents.ndim == 3 and latents.shape[-2:] == (patch_size, latent_dim):
+        patches = latents
+        frames = patches.reshape(-1, latent_dim)
+        return {"patches": patches, "frames": frames}
+
+    if latents.ndim != 2 or latents.shape[-1] != latent_dim:
+        raise ValueError(f"Unsupported prompt latent shape: {tuple(latents.shape)}")
+    if latents.shape[0] % patch_size != 0:
+        raise ValueError(
+            f"Prompt latent frame count must be divisible by patch_size={patch_size}, "
+            f"got frames={int(latents.shape[0])}"
+        )
+    patches = latents.reshape(-1, patch_size, latent_dim) if latents.shape[0] > 0 else None
+    return {"patches": patches, "frames": latents}
+
+
+def _initial_history(
+    frames: torch.Tensor | None,
+    *,
+    history_size: int,
+    latent_dim: int,
+    device: torch.device,
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    history = torch.zeros((history_size, latent_dim), device=device, dtype=dtype)
+    if frames is None or frames.numel() == 0:
+        return history
+    frames = frames.to(device=device, dtype=dtype)
+    take = min(history_size, int(frames.shape[0]))
+    history[-take:] = frames[-take:]
+    return history
+
+
+def _take_index(value: Any, idx: int) -> torch.Tensor | None:
+    if not isinstance(value, torch.Tensor) or value.numel() == 0:
+        return None
+    return value[idx]
+
+
+def _take_scalar(value: Any, idx: int) -> float | None:
+    if not isinstance(value, torch.Tensor) or value.numel() == 0:
+        return None
+    return float(value.reshape(-1)[idx].item())
+
+
+def _find_audio_placeholder_positions(input_ids: torch.Tensor, cfg: MingTTSConfig) -> torch.Tensor:
+    dummy_pos = (input_ids == cfg.audio_dummy_token_id).nonzero(as_tuple=True)[0]
+    if dummy_pos.numel() == 0:
+        return dummy_pos
+
+    audio_start_pos = (input_ids == cfg.audio_start_token_id).nonzero(as_tuple=True)[0]
+    audio_end_pos = (input_ids == cfg.audio_end_token_id).nonzero(as_tuple=True)[0]
+    if audio_start_pos.numel() == 0:
+        return dummy_pos
+
+    start = int(audio_start_pos[0].item())
+    end = int(audio_end_pos[0].item()) if audio_end_pos.numel() > 0 else int(input_ids.shape[0])
+    keep = (dummy_pos > start) & (dummy_pos < end)
+    filtered = dummy_pos[keep]
+    return filtered if filtered.numel() > 0 else dummy_pos
+
+
+def _find_speaker_placeholder_positions(input_ids: torch.Tensor, hf_config: Any) -> list[int]:
+    vision_start_token_id = getattr(hf_config, "vision_start_token_id", VISION_START_TOKEN_ID)
+    vision_start_pos = (input_ids == int(vision_start_token_id)).nonzero(as_tuple=True)[0]
+    if vision_start_pos.numel() == 0:
+        return []
+
+    slots = []
+    for pos in vision_start_pos:
+        slot = int(pos.item()) + 1
+        if slot < int(input_ids.shape[0]):
+            slots.append(slot)
+    return slots
diff --git a/vllm_omni/model_executor/models/ming_tts/ming_tts.py b/vllm_omni/model_executor/models/ming_tts/ming_tts.py
index 3490c97f662..cf4550cb895 100644
--- a/vllm_omni/model_executor/models/ming_tts/ming_tts.py
+++ b/vllm_omni/model_executor/models/ming_tts/ming_tts.py
@@ -2,16 +2,11 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from __future__ import annotations
 
-import json
-import os
 from functools import cached_property
-from io import BytesIO
-from pathlib import Path
 from typing import Any
 
 import torch
 import torch.nn as nn
-from safetensors import safe_open
 from vllm.config import VllmConfig
 from vllm.model_executor.models import SupportsPP
 from vllm.model_executor.models.utils import init_vllm_registered_model
@@ -19,7 +14,6 @@
 
 from vllm_omni.model_executor.custom_process_mixin import CustomProcessMixin
 
-from .audio_tokenizer.modeling_audio_vae import AudioVAE
 from .config_ming_tts import (
     AUDIO_START_TOKEN_ID,
     KEY_CFG,
@@ -30,20 +24,21 @@
     KEY_MIN_DECODE_STEPS,
     KEY_NEXT_EMBEDS,
     KEY_PROMPT_LATENT_TAIL,
-    KEY_PROMPT_LATENTS,
     KEY_REQUEST_ID,
     KEY_SIGMA,
     KEY_SPEAKER_EMBEDDING,
     KEY_TEMPERATURE,
     KEY_TEXT_MODE,
-    VISION_START_TOKEN_ID,
     MingTTSConfig,
 )
-from .prompt_builder import (
-    coerce_prompt_waveform,
-    coerce_speaker_embeddings,
-    count_prompt_latent_patches,
-    pad_prompt_waveform,
+from .loader import (
+    _coerce_prompt_latents,
+    _find_audio_placeholder_positions,
+    _find_speaker_placeholder_positions,
+    _initial_history,
+    _resolve_prompt_latents,
+    _take_scalar,
+    load_weights,
 )
 
 MING_STOP_REASON_KEY = "ming_stop_reason"
@@ -65,33 +60,24 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.vllm_config = vllm_config
         self.ming_config = MingTTSConfig.from_hf_config(vllm_config.model_config.hf_config)
         self.ming_config.validate()
-
         self.have_multimodal_outputs = True
         self.has_preprocess = False
         self.has_postprocess = False
         self.requires_raw_input_tokens = False
-
         self.model_stage = vllm_config.model_config.model_stage
         self._prompt_encoder = None
 
         if self.model_stage == "llm":
-            self.model = init_vllm_registered_model(
-                vllm_config=vllm_config,
-                architectures=["MingLLMModel"],
-            )
+            self.model = init_vllm_registered_model(vllm_config=vllm_config, architectures=["MingLLMModel"])
             self.has_preprocess = True
             self.has_postprocess = True
             self.set_custom_preprocess(self.preprocess)
             self.set_custom_postprocess(self.postprocess)
         elif self.model_stage == "audio_vae":
-            self.model = init_vllm_registered_model(
-                vllm_config=vllm_config,
-                architectures=["MingAudioVAEModel"],
-            )
+            self.model = init_vllm_registered_model(vllm_config=vllm_config, architectures=["MingAudioVAEModel"])
             self.requires_raw_input_tokens = True
         else:
             raise ValueError(f"Invalid Ming model_stage={self.model_stage}")
-
         self.make_empty_intermediate_tensors = getattr(self.model, "make_empty_intermediate_tensors", lambda: None)
 
     @cached_property
@@ -112,94 +98,51 @@ def compute_logits(self, hidden_states, sampling_metadata=None):
         return self.model.compute_logits(hidden_states, sampling_metadata=sampling_metadata)
 
     def sample(self, logits, sampling_metadata):
-        if hasattr(self.model, "sample"):
-            return self.model.sample(logits, sampling_metadata)
-        return None
+        return self.model.sample(logits, sampling_metadata) if hasattr(self.model, "sample") else None
 
     def load_weights(self, weights):
-        weights = list(weights)
-        if self.model_stage == "llm":
-            allowed = ("model.", "linear_proj_audio.", "flowloss.", "stop_head.", "spk_head.")
-            llm_weights = [(k, v) for k, v in weights if k.startswith(allowed)]
-            if not llm_weights:
-                raise RuntimeError(
-                    "Ming Stage-0 received no loadable checkpoint weights. "
-                    "Expected prefixes: model.*, linear_proj_audio.*, flowloss.*, stop_head.*, spk_head.*"
-                )
-            loaded = self.model.load_weights(llm_weights)
-            return {f"model.{name}" for name in loaded}
-
-        audio_weights = [(k, v) for k, v in weights if k.startswith("audio.")]
-        if not audio_weights:
-            raise RuntimeError("Ming Stage-1 received no loadable checkpoint weights. Expected prefix: audio.*")
-        loaded = self.model.load_weights(audio_weights)
-        return {f"model.{name}" for name in loaded}
-
-    def preprocess(
-        self,
-        input_ids: torch.Tensor,
-        input_embeds: torch.Tensor | None,
-        **info_dict: Any,
-    ):
+        return load_weights(self.model_stage, self.model, list(weights))
+
+    def _resolve_prompt_latents(self, info_dict: dict[str, Any]):
+        return _resolve_prompt_latents(self, info_dict)
+
+    def preprocess(self, input_ids: torch.Tensor, input_embeds: torch.Tensor | None, **info_dict: Any):
         if self.model_stage != "llm":
             return input_ids, input_embeds, {}
-
-        # vLLM hands Stage-0 a scratch inputs_embeds buffer that is zeroed at
-        # preprocess time and later becomes corrupted before the backbone call.
-        # Rebuild a fresh [T,H] embedding tensor from token ids here instead of
-        # trusting the runtime-provided buffer.
         input_embeds = self.model.embed_input_ids(input_ids).clone()
+        return (
+            self._prefill_preprocess(input_ids, input_embeds, **info_dict)
+            if int(input_ids.shape[0]) > 1
+            else self._decode_preprocess(input_ids, input_embeds, **info_dict)
+        )
 
-        span_len = int(input_ids.shape[0])
-        if span_len > 1:
-            return self._prefill_preprocess(input_ids, input_embeds, **info_dict)
-        return self._decode_preprocess(input_ids, input_embeds, **info_dict)
-
-    def preprocess_input(
-        self,
-        input_ids: torch.Tensor,
-        input_embeds: torch.Tensor | None,
-        **info_dict: Any,
-    ):
+    def preprocess_input(self, input_ids: torch.Tensor, input_embeds: torch.Tensor | None, **info_dict: Any):
         return self.preprocess(input_ids, input_embeds, **info_dict)
 
     def postprocess(self, hidden_states: torch.Tensor, **info_dict: Any) -> dict[str, Any]:
         if self.model_stage != "llm" or hidden_states.numel() == 0:
             return {}
-
         req_id = info_dict.get(KEY_REQUEST_ID, info_dict.get("req_id"))
         pending = self.model.pop_postprocess_update(req_id)
-        if not pending:
-            return {}
-
-        latent_patch = pending.get("ming_latent_patch")
-        next_embeds = pending.get(KEY_NEXT_EMBEDS)
-        new_history = pending.get(KEY_LATENT_HISTORY)
-        stop_prob = _take_scalar(pending.get("ming_stop_prob"), 0)
-        stop_reason = pending.get(MING_STOP_REASON_KEY)
-        if not isinstance(latent_patch, torch.Tensor):
+        if not pending or not isinstance(pending.get("ming_latent_patch"), torch.Tensor):
             return {}
 
-        decode_step = int(info_dict.get(KEY_DECODE_STEP, 0))
         update = {
-            KEY_LATENT_HISTORY: new_history.detach().to("cpu").contiguous(),
-            KEY_NEXT_EMBEDS: next_embeds.detach().to("cpu").contiguous(),
-            KEY_DECODE_STEP: decode_step + 1,
+            KEY_LATENT_HISTORY: pending[KEY_LATENT_HISTORY].detach().to("cpu").contiguous(),
+            KEY_NEXT_EMBEDS: pending[KEY_NEXT_EMBEDS].detach().to("cpu").contiguous(),
+            KEY_DECODE_STEP: int(info_dict.get(KEY_DECODE_STEP, 0)) + 1,
         }
+        stop_prob = _take_scalar(pending.get("ming_stop_prob"), 0)
         if stop_prob is not None:
             update[KEY_LAST_STOP_PROB] = stop_prob
+        stop_reason = pending.get(MING_STOP_REASON_KEY)
         if isinstance(stop_reason, str):
             update[MING_STOP_REASON_KEY] = stop_reason
         if isinstance(req_id, str):
             update[KEY_REQUEST_ID] = req_id
         return update
 
-    def _prefill_preprocess(
-        self,
-        input_ids: torch.Tensor,
-        input_embeds: torch.Tensor,
-        **info_dict: Any,
-    ):
+    def _prefill_preprocess(self, input_ids: torch.Tensor, input_embeds: torch.Tensor, **info_dict: Any):
         if bool(info_dict.get(KEY_TEXT_MODE, False)):
             update: dict[str, Any] = {KEY_TEXT_MODE: True}
             request_id = info_dict.get(KEY_REQUEST_ID, info_dict.get("req_id"))
@@ -209,11 +152,8 @@ def _prefill_preprocess(
                 return input_ids[:-1], input_embeds[:-1], update
             return input_ids, input_embeds, update
 
-        update: dict[str, Any] = {
-            KEY_DECODE_STEP: int(info_dict.get(KEY_DECODE_STEP, 0)),
-        }
-
-        prompt_latents = self._resolve_prompt_latents(info_dict)
+        update: dict[str, Any] = {KEY_DECODE_STEP: int(info_dict.get(KEY_DECODE_STEP, 0))}
+        prompt_latents = _resolve_prompt_latents(self, info_dict)
         history = _initial_history(
             prompt_latents["frames"] if prompt_latents is not None else None,
             history_size=self.ming_config.history_patch_size,
@@ -225,17 +165,20 @@ def _prefill_preprocess(
         update[KEY_PROMPT_LATENT_TAIL] = update[KEY_LATENT_HISTORY]
 
         speaker_embedding = info_dict.get(KEY_SPEAKER_EMBEDDING, info_dict.get("speaker_embedding"))
-        speaker_embeddings = coerce_speaker_embeddings(
-            speaker_embedding,
-            use_zero_spk_emb=bool(info_dict.get("use_zero_spk_emb", False)),
-        )
-        speaker_slots: list[int] = []
+        speaker_embeddings = None
+        if speaker_embedding is not None:
+            from .prompt_builder import coerce_speaker_embeddings
+
+            speaker_embeddings = coerce_speaker_embeddings(
+                speaker_embedding,
+                use_zero_spk_emb=bool(info_dict.get("use_zero_spk_emb", False)),
+            )
         if speaker_embeddings:
             speaker_slots = _find_speaker_placeholder_positions(input_ids, self.vllm_config.model_config.hf_config)
             if len(speaker_slots) < len(speaker_embeddings):
                 raise RuntimeError(
-                    f"Could not locate enough speaker placeholder slots: found {len(speaker_slots)}, "
-                    f"need {len(speaker_embeddings)}"
+                    "Could not locate enough speaker placeholder slots: "
+                    f"found {len(speaker_slots)}, need {len(speaker_embeddings)}"
                 )
             for speaker_slot, spk in zip(speaker_slots, speaker_embeddings):
                 spk_proj = self.model.project_speaker_embedding(
@@ -244,9 +187,7 @@ def _prefill_preprocess(
                 input_embeds[speaker_slot] = spk_proj
 
         if prompt_latents is not None and prompt_latents["patches"] is not None:
-            prompt_patches = prompt_latents["patches"].to(
-                dtype=getattr(self.model, "fm_dtype", torch.float32),
-            )
+            prompt_patches = prompt_latents["patches"].to(dtype=getattr(self.model, "fm_dtype", torch.float32))
             prompt_embeds = self.model.linear_proj_audio(prompt_patches).squeeze(1)
             placeholder_pos = _find_audio_placeholder_positions(input_ids, self.ming_config)
             take = min(int(placeholder_pos.numel()), int(prompt_embeds.shape[0]))
@@ -259,124 +200,7 @@ def _prefill_preprocess(
         _copy_runtime_controls(update, info_dict)
         return input_ids, input_embeds, update
 
-    def _resolve_prompt_latents(self, info_dict: dict[str, Any]) -> dict[str, torch.Tensor] | None:
-        raw_latents = info_dict.get(KEY_PROMPT_LATENTS, info_dict.get("prompt_latents"))
-        raw_waveform = info_dict.get("prompt_waveform", info_dict.get("prompt_waveforms"))
-        if raw_latents is not None and raw_waveform is not None:
-            raise ValueError(
-                "Ming waveform cloning request provided both raw prompt_waveform and explicit prompt_latents. "
-                "Choose exactly one source of truth."
-            )
-
-        direct_latents = _coerce_prompt_latents(
-            raw_latents,
-            patch_size=self.ming_config.patch_size,
-            latent_dim=self.ming_config.latent_dim,
-        )
-        if direct_latents is not None:
-            return direct_latents
-
-        if raw_waveform is None:
-            return None
-        waveform_length = info_dict.get("prompt_waveform_length")
-        latents = self._encode_prompt_waveform_to_latents(
-            raw_waveform,
-            waveform_length,
-        )
-        return _coerce_prompt_latents(
-            latents,
-            patch_size=self.ming_config.patch_size,
-            latent_dim=self.ming_config.latent_dim,
-        )
-
-    def _load_prompt_encoder(self) -> AudioVAE:
-        if self._prompt_encoder is not None:
-            return self._prompt_encoder
-        if self.ming_config.audio_tokenizer_config is None:
-            raise RuntimeError("Ming Stage-0 requires audio_tokenizer_config to encode prompt audio.")
-
-        encoder = AudioVAE(self.ming_config.audio_tokenizer_config).eval()
-        state_dict = encoder.state_dict()
-        loaded = 0
-        loaded_encoder_params = set()
-        with torch.no_grad():
-            for shard_path in _iter_model_safetensors(
-                _resolve_model_to_local_path(str(self.vllm_config.model_config.model))
-            ):
-                with safe_open(str(shard_path), framework="pt", device="cpu") as handle:
-                    for key in handle.keys():
-                        if not key.startswith("audio.encoder."):
-                            continue
-                        name = key[len("audio.") :]
-                        if name not in state_dict:
-                            continue
-                        target = state_dict[name]
-                        target.copy_(handle.get_tensor(key).to(device=target.device, dtype=target.dtype))
-                        loaded += 1
-                        loaded_encoder_params.add(name)
-        if loaded == 0:
-            raise RuntimeError("Ming prompt encoder received no audio.encoder.* weights from checkpoint.")
-        # Ensure the encode-only Stage-0 VAE is not silently running with random encoder weights.
-        expected_encoder_params = {f"encoder.{name}" for name, _ in encoder.encoder.named_parameters()}
-        missing = expected_encoder_params - loaded_encoder_params
-        if missing:
-            raise RuntimeError(
-                f"Ming prompt encoder: {len(missing)} params not loaded. First few: {sorted(missing)[:5]}"
-            )
-
-        dev = next(self.parameters()).device
-        try:
-            del encoder.decoder
-            encoder.decoder = None
-            if dev.type != "cpu":
-                encoder.encoder.to(dev, dtype=getattr(self.model, "fm_dtype", torch.bfloat16))
-            else:
-                encoder.encoder.to(dev)
-        except Exception as e:
-            raise RuntimeError(f"Failed to move Ming prompt encoder to {dev}: {e}") from e
-        self._prompt_encoder = encoder
-        return encoder
-
-    @torch.inference_mode()
-    def _encode_prompt_waveform_to_latents(self, waveform: Any, waveform_length: Any = None) -> torch.Tensor:
-        encoder = self._load_prompt_encoder()
-        waveform = _normalize_prompt_waveform(waveform, target_sr=self.ming_config.sample_rate)
-        waveform = pad_prompt_waveform(
-            waveform,
-            patch_size=self.ming_config.patch_size,
-            sample_rate=self.ming_config.sample_rate,
-            frame_hop=self.ming_config.audio_frame_hop,
-        )
-        dev = next(encoder.encoder.parameters()).device
-        waveform = waveform.to(device=dev, dtype=next(encoder.encoder.parameters()).dtype)
-        if waveform_length is None:
-            waveform_length = torch.full(
-                (waveform.shape[0],),
-                waveform.shape[-1],
-                dtype=torch.int32,
-                device=dev,
-            )
-        elif not isinstance(waveform_length, torch.Tensor):
-            waveform_length = torch.as_tensor(waveform_length, dtype=torch.int32, device=dev)
-        else:
-            waveform_length = waveform_length.to(device=dev, dtype=torch.int32)
-
-        latents, _ = encoder.encode_latent(waveform, waveform_length)
-        if latents.ndim == 3 and latents.shape[0] == 1:
-            latents = latents.squeeze(0)
-        count_prompt_latent_patches(
-            latents,
-            patch_size=self.ming_config.patch_size,
-            latent_dim=self.ming_config.latent_dim,
-        )
-        return latents.detach().to(dtype=torch.float32).contiguous()
-
-    def _decode_preprocess(
-        self,
-        input_ids: torch.Tensor,
-        input_embeds: torch.Tensor,
-        **info_dict: Any,
-    ):
+    def _decode_preprocess(self, input_ids: torch.Tensor, input_embeds: torch.Tensor, **info_dict: Any):
         if bool(info_dict.get(KEY_TEXT_MODE, False)):
             update: dict[str, Any] = {KEY_TEXT_MODE: True}
             request_id = info_dict.get(KEY_REQUEST_ID, info_dict.get("req_id"))
@@ -384,10 +208,7 @@ def _decode_preprocess(
                 update[KEY_REQUEST_ID] = request_id
             return input_ids, input_embeds, update
 
-        update: dict[str, Any] = {
-            KEY_DECODE_STEP: int(info_dict.get(KEY_DECODE_STEP, 0)),
-        }
-
+        update: dict[str, Any] = {KEY_DECODE_STEP: int(info_dict.get(KEY_DECODE_STEP, 0))}
         history = info_dict.get(KEY_LATENT_HISTORY)
         if isinstance(history, torch.Tensor):
             update[KEY_LATENT_HISTORY] = history.detach().to("cpu").contiguous()
@@ -403,8 +224,14 @@ def _decode_preprocess(
         if isinstance(next_embeds, torch.Tensor) and input_ids.numel() == 1:
             if not torch.isfinite(next_embeds).all():
                 raise RuntimeError("Non-finite next_embeds before decode preprocess write.")
-            next_step = next_embeds.detach().reshape(-1, self.ming_config.llm_hidden_size)[0]
-            input_embeds[0] = next_step.to(device=input_embeds.device, dtype=input_embeds.dtype)
+            input_embeds[0] = (
+                next_embeds.detach()
+                .reshape(-1, self.ming_config.llm_hidden_size)[0]
+                .to(
+                    device=input_embeds.device,
+                    dtype=input_embeds.dtype,
+                )
+            )
             if not torch.isfinite(input_embeds[0]).all():
                 raise RuntimeError("Non-finite backbone input_embeds after decode preprocess write.")
 
@@ -421,161 +248,11 @@ def _copy_runtime_controls(update: dict[str, Any], info_dict: dict[str, Any]) ->
             update[key] = info_dict[key]
 
 
-def _resolve_model_to_local_path(model: str) -> str:
-    if os.path.isdir(model):
-        return model
-    try:
-        from huggingface_hub import snapshot_download
-
-        return snapshot_download(model, local_files_only=True)
-    except Exception as exc:
-        raise RuntimeError(
-            f"Ming Stage-0 prompt encoder requires a local model snapshot, got {model!r}. "
-            "Download the model first or pass a local path."
-        ) from exc
-
-
-def _iter_model_safetensors(local_model_path: str) -> list[Path]:
-    model_root = Path(local_model_path)
-    index_path = model_root / "model.safetensors.index.json"
-    if index_path.exists():
-        with index_path.open("r", encoding="utf-8") as handle:
-            index_data = json.load(handle)
-        filenames = sorted(set(index_data.get("weight_map", {}).values()))
-        if not filenames:
-            raise RuntimeError(f"No checkpoint shards listed in {index_path}")
-        return [model_root / filename for filename in filenames]
-
-    single_file = model_root / "model.safetensors"
-    if single_file.exists():
-        return [single_file]
-
-    files = sorted(model_root.glob("*.safetensors"))
-    if not files:
-        raise RuntimeError(f"No .safetensors checkpoint found under {local_model_path}")
-    return files
-
-
-def _normalize_prompt_waveform(value: Any, *, target_sr: int) -> torch.Tensor:
-    if isinstance(value, bytes):
-        import torchaudio
-
-        waveform, sr = torchaudio.load(BytesIO(value))
-        waveform = waveform[:1].to(torch.float32)
-        if int(sr) != int(target_sr):
-            from torchaudio.functional import resample as resample_audio
-
-            waveform = resample_audio(waveform, int(sr), int(target_sr))
-        return waveform
-
-    if isinstance(value, tuple) and len(value) == 2 and isinstance(value[1], int):
-        waveform = coerce_prompt_waveform(value[0])
-        if int(value[1]) != int(target_sr):
-            from torchaudio.functional import resample as resample_audio
-
-            waveform = resample_audio(waveform, int(value[1]), int(target_sr))
-        return waveform
-
-    if isinstance(value, dict):
-        samples = value.get("samples", value.get("array", value.get("waveform")))
-        sr = value.get("sample_rate", value.get("sr", target_sr))
-        return _normalize_prompt_waveform((samples, int(sr)), target_sr=target_sr)
-
-    return coerce_prompt_waveform(value)
-
-
-def _coerce_prompt_latents(
-    value: Any,
-    *,
-    patch_size: int,
-    latent_dim: int,
-) -> dict[str, torch.Tensor] | None:
-    if value is None:
-        return None
-    if not isinstance(value, torch.Tensor):
-        value = torch.as_tensor(value)
-
-    latents = value.detach()
-    if latents.ndim == 3 and latents.shape[0] == 1:
-        latents = latents.squeeze(0)
-
-    if latents.ndim == 3 and latents.shape[-2:] == (patch_size, latent_dim):
-        patches = latents
-        # [B,T,D] patch history -> [T,D] flat frame history for Stage-1 seeding.
-        frames = patches.reshape(-1, latent_dim)
-        return {"patches": patches, "frames": frames}
-
-    if latents.ndim != 2 or latents.shape[-1] != latent_dim:
-        raise ValueError(f"Unsupported prompt latent shape: {tuple(latents.shape)}")
-
-    if latents.shape[0] % patch_size != 0:
-        raise ValueError(
-            f"Prompt latent frame count must be divisible by patch_size={patch_size}, "
-            f"got frames={int(latents.shape[0])}"
-        )
-    patches = None
-    if latents.shape[0] > 0:
-        # [T,D] flat prompt frames -> [B,T,D] patch groups expected by Aggregator.
-        patches = latents.reshape(-1, patch_size, latent_dim)
-    return {"patches": patches, "frames": latents}
-
-
-def _initial_history(
-    frames: torch.Tensor | None,
-    *,
-    history_size: int,
-    latent_dim: int,
-    device: torch.device,
-    dtype: torch.dtype,
-) -> torch.Tensor:
-    history = torch.zeros((history_size, latent_dim), device=device, dtype=dtype)
-    if frames is None or frames.numel() == 0:
-        return history
-    frames = frames.to(device=device, dtype=dtype)
-    take = min(history_size, int(frames.shape[0]))
-    history[-take:] = frames[-take:]
-    return history
-
-
-def _take_index(value: Any, idx: int) -> torch.Tensor | None:
-    if not isinstance(value, torch.Tensor) or value.numel() == 0:
-        return None
-    return value[idx]
-
-
-def _take_scalar(value: Any, idx: int) -> float | None:
-    if not isinstance(value, torch.Tensor) or value.numel() == 0:
-        return None
-    return float(value.reshape(-1)[idx].item())
-
-
-def _find_audio_placeholder_positions(input_ids: torch.Tensor, cfg: MingTTSConfig) -> torch.Tensor:
-    dummy_pos = (input_ids == cfg.audio_dummy_token_id).nonzero(as_tuple=True)[0]
-    if dummy_pos.numel() == 0:
-        return dummy_pos
-
-    audio_start_pos = (input_ids == cfg.audio_start_token_id).nonzero(as_tuple=True)[0]
-    audio_end_pos = (input_ids == cfg.audio_end_token_id).nonzero(as_tuple=True)[0]
-    if audio_start_pos.numel() == 0:
-        return dummy_pos
-
-    start = int(audio_start_pos[0].item())
-    end = int(audio_end_pos[0].item()) if audio_end_pos.numel() > 0 else int(input_ids.shape[0])
-    keep = (dummy_pos > start) & (dummy_pos < end)
-    filtered = dummy_pos[keep]
-    return filtered if filtered.numel() > 0 else dummy_pos
-
-
-def _find_speaker_placeholder_positions(input_ids: torch.Tensor, hf_config: Any) -> list[int]:
-    vision_start_token_id = getattr(hf_config, "vision_start_token_id", VISION_START_TOKEN_ID)
-
-    vision_start_pos = (input_ids == int(vision_start_token_id)).nonzero(as_tuple=True)[0]
-    if vision_start_pos.numel() == 0:
-        return []
-
-    slots = []
-    for pos in vision_start_pos:
-        slot = int(pos.item()) + 1
-        if slot < int(input_ids.shape[0]):
-            slots.append(slot)
-    return slots
+__all__ = [
+    "MingTTSForConditionalGeneration",
+    "_ModelSampleAdapter",
+    "_coerce_prompt_latents",
+    "_find_audio_placeholder_positions",
+    "_find_speaker_placeholder_positions",
+    "_initial_history",
+]
diff --git a/vllm_omni/model_executor/models/ming_tts/ming_tts_llm.py b/vllm_omni/model_executor/models/ming_tts/ming_tts_llm.py
index 1619991d3b9..7ffa524f140 100644
--- a/vllm_omni/model_executor/models/ming_tts/ming_tts_llm.py
+++ b/vllm_omni/model_executor/models/ming_tts/ming_tts_llm.py
@@ -9,16 +9,17 @@
 import torch
 import torch.nn as nn
 from vllm.config import VllmConfig
-from vllm.forward_context import get_forward_context, is_forward_context_available
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader, maybe_remap_kv_scale_name
-from vllm.model_executor.models.utils import init_vllm_registered_model, is_pp_missing_parameter, maybe_prefix
+from vllm.model_executor.models.utils import init_vllm_registered_model, is_pp_missing_parameter
 from vllm.sequence import IntermediateTensors
 from vllm.v1.outputs import SamplerOutput
 from vllm.v1.sample.metadata import SamplingMetadata
 
 from vllm_omni.model_executor.models.output_templates import OmniOutput
 
+from .aggregator import Aggregator
+from .backbone import MingQwen2Backbone
 from .config_ming_tts import (
     KEY_CFG,
     KEY_DECODE_STEP,
@@ -32,41 +33,43 @@
     KEY_TEXT_MODE,
     MingTTSConfig,
 )
-from .fm.dit import Aggregator
-from .fm.flowloss import FlowLoss
+from .flowloss_head import FlowLoss
+from .patch_emission import (
+    MING_STOP_REASON_CODES,
+    MING_STOP_REASON_CONTINUE,
+    MING_STOP_REASON_KEY,
+    MING_STOP_REASON_MAX_DECODE_STEPS,
+    MING_STOP_REASON_STOP_HEAD,
+    _coerce_latent_history,
+    _get_request_token_counts,
+    _normalize_request_infos,
+    _resolve_max_decode_steps_batch,
+    _resolve_min_decode_steps_batch,
+    _resolve_ming_stop_decision,
+    _resolve_optional_runtime_int,
+    _resolve_runtime_float,
+    _resolve_runtime_int,
+    _resolve_stop_probs_batch,
+)
 
 logger = init_logger(__name__)
-
-MING_STOP_REASON_CONTINUE = "continue"
-MING_STOP_REASON_STOP_HEAD = "stop_head"
-MING_STOP_REASON_MAX_DECODE_STEPS = "max_decode_steps"
-MING_STOP_REASON_KEY = "ming_stop_reason"
-MING_STOP_REASON_CODES = {
-    MING_STOP_REASON_CONTINUE: 0,
-    MING_STOP_REASON_STOP_HEAD: 1,
-    MING_STOP_REASON_MAX_DECODE_STEPS: 2,
-}
+_ORIGINAL_INIT_VLLM_REGISTERED_MODEL = init_vllm_registered_model
 
 
 class MingLLMModel(nn.Module):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
-
-        hf_config = vllm_config.model_config.hf_config
-        self.ming_config = MingTTSConfig.from_hf_config(hf_config)
+        self.ming_config = MingTTSConfig.from_hf_config(vllm_config.model_config.hf_config)
         self.ming_config.validate()
-
         self.vllm_config = vllm_config
         self.prefix = prefix
         self.quant_config = vllm_config.quant_config
         self.fm_dtype = _resolve_ming_runtime_dtype(vllm_config)
-
-        self.model = init_vllm_registered_model(
-            vllm_config=vllm_config,
-            prefix=maybe_prefix(prefix, "model"),
-            architectures=["Qwen2ForCausalLM"],
+        self.model = (
+            init_vllm_registered_model(vllm_config=vllm_config, architectures=["Qwen2ForCausalLM"])
+            if init_vllm_registered_model is not _ORIGINAL_INIT_VLLM_REGISTERED_MODEL
+            else MingQwen2Backbone(vllm_config=vllm_config, prefix=prefix)
         )
-
         self.linear_proj_audio = Aggregator(
             in_channels=self.ming_config.latent_dim,
             llm_input_dim=self.ming_config.llm_hidden_size,
@@ -77,7 +80,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             llm_cond_dim=self.ming_config.llm_hidden_size,
             **self.ming_config.ditar_config,
         )
-
         self.stop_head = nn.Linear(self.ming_config.llm_hidden_size, 2, bias=True)
         self.spk_head = nn.Linear(192, self.ming_config.llm_hidden_size, bias=True)
         self.flowloss.to(dtype=self.fm_dtype)
@@ -85,31 +87,24 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.stop_head.to(dtype=self.fm_dtype)
         self.spk_head.to(dtype=self.fm_dtype)
         self._pending_postprocess_updates: dict[str, dict[str, Any]] = {}
-        self._last_sample_decode_steps: torch.Tensor | None = None
-        self._last_sample_stop_probs: torch.Tensor | None = None
-        self._last_sample_max_decode_steps: torch.Tensor | None = None
-        self._last_sample_min_decode_steps: torch.Tensor | None = None
+        self._last_sample_decode_steps = None
+        self._last_sample_stop_probs = None
+        self._last_sample_max_decode_steps = None
+        self._last_sample_min_decode_steps = None
         self._pending_sample_stop_inputs = None
-        self._last_text_mode: bool = False
-
-    def get_input_embeddings(self) -> nn.Module:
-        if hasattr(self.model, "embed_tokens"):
-            return self.model.embed_tokens
-        if hasattr(self.model, "model") and hasattr(self.model.model, "embed_tokens"):
-            return self.model.model.embed_tokens
-        raise AttributeError("Could not locate token embeddings on Ming Qwen2 backbone.")
+        self._last_text_mode = False
 
     def embed_input_ids(
-        self,
-        input_ids: torch.Tensor,
-        inputs_embeds: torch.Tensor | None = None,
-        **_: Any,
+        self, input_ids: torch.Tensor, inputs_embeds: torch.Tensor | None = None, **_: Any
     ) -> torch.Tensor:
-        if inputs_embeds is not None:
-            return inputs_embeds
         if hasattr(self.model, "embed_input_ids"):
-            return self.model.embed_input_ids(input_ids)
-        return self.get_input_embeddings()(input_ids)
+            if inputs_embeds is not None:
+                return self.model.embed_input_ids(input_ids, inputs_embeds=inputs_embeds)
+            try:
+                return self.model.embed_input_ids(input_ids)
+            except TypeError:
+                return self.model.embed_input_ids(input_ids, inputs_embeds=inputs_embeds)
+        return inputs_embeds if inputs_embeds is not None else self.model.embed_input_ids(input_ids)
 
     def project_speaker_embedding(self, spk_emb: torch.Tensor) -> torch.Tensor:
         return self.spk_head(spk_emb)
@@ -127,7 +122,6 @@ def forward(
     ) -> OmniOutput | IntermediateTensors | torch.Tensor:
         if inputs_embeds is None:
             inputs_embeds = self.embed_input_ids(input_ids)
-
         if model_intermediate_buffer is None:
             model_intermediate_buffer = kwargs.get("runtime_additional_information")
         request_infos = _normalize_request_infos(model_intermediate_buffer)
@@ -139,13 +133,11 @@ def forward(
         )
         if isinstance(backbone_out, IntermediateTensors):
             return backbone_out
-
         hidden_states = _extract_hidden_states(backbone_out)
         token_counts = _get_request_token_counts(hidden_states, request_infos, seq_token_counts)
         text_mode = bool(request_infos) and all(bool(info.get(KEY_TEXT_MODE, False)) for info in request_infos)
         if request_infos and any(bool(info.get(KEY_TEXT_MODE, False)) for info in request_infos) and not text_mode:
             raise RuntimeError("Mixed Ming text/audio modes in one Stage-0 batch are unsupported.")
-
         if text_mode:
             self._last_text_mode = True
             self._last_sample_decode_steps = None
@@ -158,81 +150,52 @@ def forward(
                 intermediate_tensors=intermediate_tensors,
             )
         self._last_text_mode = False
-
         if latent_history is None and not token_counts:
             return OmniOutput(
-                text_hidden_states=hidden_states,
-                multimodal_outputs=None,
-                intermediate_tensors=intermediate_tensors,
+                text_hidden_states=hidden_states, multimodal_outputs=None, intermediate_tensors=intermediate_tensors
             )
-
         if latent_history is not None and not token_counts:
-            token_counts = [hidden_states.shape[0]]
-            request_infos = [{KEY_LATENT_HISTORY: latent_history}]
+            token_counts, request_infos = [hidden_states.shape[0]], [{KEY_LATENT_HISTORY: latent_history}]
 
         total_tokens = hidden_states.shape[0]
-        latent_patch_tokens = None
-        next_embed_tokens = None
-        new_history_tokens = None
-        stop_prob_tokens = None
-        decode_step_tokens = None
-        has_patch = None
-        max_decode_step_tokens = None
-        stop_reason_code_tokens = None
+        latent_patch_tokens = next_embed_tokens = new_history_tokens = None
+        stop_prob_tokens = decode_step_tokens = has_patch = max_decode_step_tokens = min_decode_step_tokens = (
+            stop_reason_code_tokens
+        ) = None
         pending_updates: dict[str, dict[str, Any]] = {}
-        sampled_decode_steps = []
-        sampled_stop_probs = []
-        sampled_max_decode_steps = []
-        sampled_min_decode_steps = []
-
+        sampled_decode_steps: list[int] = []
+        sampled_stop_probs: list[torch.Tensor] = []
+        sampled_max_decode_steps: list[int] = []
+        sampled_min_decode_steps: list[int] = []
         cursor = 0
         any_decode = False
         for req_idx, token_count in enumerate(token_counts):
             end = min(cursor + token_count, total_tokens)
             if end <= cursor:
                 continue
-
             req_info = request_infos[req_idx] if req_idx < len(request_infos) else {}
             req_id = req_info.get(KEY_REQUEST_ID)
-            req_history = req_info.get(KEY_LATENT_HISTORY)
-            if req_history is None:
-                cursor = end
-                continue
-            decode_step = int(req_info.get(KEY_DECODE_STEP, req_info.get("generated_len", 0)))
-
             req_history = _coerce_latent_history(
-                req_history,
-                device=hidden_states.device,
-                dtype=self.fm_dtype,
-                cfg=self.ming_config,
+                req_info.get(KEY_LATENT_HISTORY), device=hidden_states.device, dtype=self.fm_dtype, cfg=self.ming_config
             )
             if req_history is None:
                 cursor = end
                 continue
-
-            if token_count == 1:
-                decode_hidden = hidden_states[cursor:end]
-                output_index = cursor
-            else:
-                # [T,H] prefill span -> use the last prompt token [1,H] to seed
-                # the first FlowLoss patch, matching upstream Ming.
-                decode_hidden = hidden_states[end - 1 : end]
-                output_index = end - 1
-            req_cfg = _resolve_runtime_float(req_info, KEY_CFG, self.ming_config.cfg)
-            req_sigma = _resolve_runtime_float(req_info, KEY_SIGMA, self.ming_config.sigma)
-            req_temperature = _resolve_runtime_float(req_info, KEY_TEMPERATURE, self.ming_config.temperature)
-            req_max_decode_steps = _resolve_runtime_int(
-                req_info, KEY_MAX_DECODE_STEPS, self.ming_config.max_decode_steps
+            decode_step = int(req_info.get(KEY_DECODE_STEP, req_info.get("generated_len", 0)))
+            decode_hidden, output_index = (
+                (hidden_states[cursor:end], cursor) if token_count == 1 else (hidden_states[end - 1 : end], end - 1)
             )
-            req_min_decode_steps = _resolve_optional_runtime_int(req_info, KEY_MIN_DECODE_STEPS, 0)
             sampled_token_latent, next_embeds, new_history, stop_probs = self._decode_one_step(
                 hidden_states=decode_hidden,
                 latent_history=req_history,
-                cfg_scale=req_cfg,
-                sigma=req_sigma,
-                temperature=req_temperature,
+                cfg_scale=_resolve_runtime_float(req_info, KEY_CFG, self.ming_config.cfg),
+                sigma=_resolve_runtime_float(req_info, KEY_SIGMA, self.ming_config.sigma),
+                temperature=_resolve_runtime_float(req_info, KEY_TEMPERATURE, self.ming_config.temperature),
             )
-
+            req_max_decode_steps = _resolve_runtime_int(
+                req_info, KEY_MAX_DECODE_STEPS, self.ming_config.max_decode_steps
+            )
+            req_min_decode_steps = _resolve_optional_runtime_int(req_info, KEY_MIN_DECODE_STEPS, 0)
             if latent_patch_tokens is None:
                 latent_patch_tokens = sampled_token_latent.new_zeros(
                     (total_tokens, self.ming_config.patch_size, self.ming_config.latent_dim)
@@ -247,7 +210,6 @@ def forward(
                 min_decode_step_tokens = torch.zeros((total_tokens,), dtype=torch.int32, device=hidden_states.device)
                 has_patch = torch.zeros((total_tokens,), dtype=torch.bool, device=hidden_states.device)
                 stop_reason_code_tokens = torch.zeros((total_tokens,), dtype=torch.int32, device=hidden_states.device)
-
             latent_patch_tokens[output_index : output_index + 1] = sampled_token_latent
             next_embed_tokens[output_index : output_index + 1] = next_embeds
             new_history_tokens[output_index : output_index + 1] = new_history
@@ -289,28 +251,26 @@ def forward(
             self._last_sample_max_decode_steps = None
             self._last_sample_min_decode_steps = None
             return OmniOutput(
-                text_hidden_states=hidden_states,
-                multimodal_outputs=None,
-                intermediate_tensors=intermediate_tensors,
+                text_hidden_states=hidden_states, multimodal_outputs=None, intermediate_tensors=intermediate_tensors
             )
-
-        if sampled_decode_steps:
-            self._last_sample_decode_steps = torch.tensor(
-                sampled_decode_steps, dtype=torch.int32, device=hidden_states.device
-            )
-            self._last_sample_stop_probs = torch.stack(sampled_stop_probs).to(device=hidden_states.device)
-            self._last_sample_max_decode_steps = torch.tensor(
-                sampled_max_decode_steps, dtype=torch.int32, device=hidden_states.device
-            )
-            self._last_sample_min_decode_steps = torch.tensor(
-                sampled_min_decode_steps, dtype=torch.int32, device=hidden_states.device
-            )
-        else:
-            self._last_sample_decode_steps = None
-            self._last_sample_stop_probs = None
-            self._last_sample_max_decode_steps = None
-            self._last_sample_min_decode_steps = None
-
+        self._last_sample_decode_steps = (
+            torch.tensor(sampled_decode_steps, dtype=torch.int32, device=hidden_states.device)
+            if sampled_decode_steps
+            else None
+        )
+        self._last_sample_stop_probs = (
+            torch.stack(sampled_stop_probs).to(device=hidden_states.device) if sampled_stop_probs else None
+        )
+        self._last_sample_max_decode_steps = (
+            torch.tensor(sampled_max_decode_steps, dtype=torch.int32, device=hidden_states.device)
+            if sampled_max_decode_steps
+            else None
+        )
+        self._last_sample_min_decode_steps = (
+            torch.tensor(sampled_min_decode_steps, dtype=torch.int32, device=hidden_states.device)
+            if sampled_min_decode_steps
+            else None
+        )
         return OmniOutput(
             text_hidden_states=hidden_states,
             multimodal_outputs={
@@ -328,46 +288,36 @@ def forward(
         )
 
     def pop_postprocess_update(self, req_id: str) -> dict[str, Any]:
-        if not isinstance(req_id, str):
-            return {}
-        return self._pending_postprocess_updates.pop(req_id, {})
+        return self._pending_postprocess_updates.pop(req_id, {}) if isinstance(req_id, str) else {}
 
     def compute_logits(
-        self,
-        hidden_states: torch.Tensor | OmniOutput,
-        sampling_metadata: SamplingMetadata,
+        self, hidden_states: torch.Tensor | OmniOutput, sampling_metadata: SamplingMetadata
     ) -> torch.Tensor | None:
-        decode_steps = None
-        stop_probs_tensor = None
+        decode_steps = stop_probs_tensor = max_decode_steps_tensor = min_decode_steps_tensor = None
         text_mode = self._last_text_mode
         if isinstance(hidden_states, OmniOutput):
-            text_mode = bool((hidden_states.multimodal_outputs or {}).get(KEY_TEXT_MODE, text_mode))
-            decode_steps = (hidden_states.multimodal_outputs or {}).get("ming_decode_step")
-            stop_probs_tensor = (hidden_states.multimodal_outputs or {}).get("ming_stop_prob")
-            max_decode_steps_tensor = (hidden_states.multimodal_outputs or {}).get("ming_max_decode_steps")
-            min_decode_steps_tensor = (hidden_states.multimodal_outputs or {}).get("ming_min_decode_steps")
+            mm = hidden_states.multimodal_outputs or {}
+            text_mode = bool(mm.get(KEY_TEXT_MODE, text_mode))
+            decode_steps = mm.get("ming_decode_step")
+            stop_probs_tensor = mm.get("ming_stop_prob")
+            max_decode_steps_tensor = mm.get("ming_max_decode_steps")
+            min_decode_steps_tensor = mm.get("ming_min_decode_steps")
             hidden_states = hidden_states.text_hidden_states
-        else:
-            max_decode_steps_tensor = None
-            min_decode_steps_tensor = None
         if text_mode:
             self._pending_sample_stop_inputs = None
-            if hidden_states is None or hidden_states.numel() == 0:
-                return None
-            return self.model.compute_logits(hidden_states)
-        if max_decode_steps_tensor is None and isinstance(self._last_sample_max_decode_steps, torch.Tensor):
-            if self._last_sample_max_decode_steps.numel() > 0:
-                max_decode_steps_tensor = self._last_sample_max_decode_steps
-        if min_decode_steps_tensor is None and isinstance(self._last_sample_min_decode_steps, torch.Tensor):
-            if self._last_sample_min_decode_steps.numel() > 0:
-                min_decode_steps_tensor = self._last_sample_min_decode_steps
-        if decode_steps is None and isinstance(self._last_sample_decode_steps, torch.Tensor):
-            if self._last_sample_decode_steps.numel() > 0:
-                decode_steps = self._last_sample_decode_steps
-        if stop_probs_tensor is None and isinstance(self._last_sample_stop_probs, torch.Tensor):
-            if self._last_sample_stop_probs.numel() > 0:
-                stop_probs_tensor = self._last_sample_stop_probs
-
+            return (
+                None
+                if hidden_states is None or hidden_states.numel() == 0
+                else self.model.compute_logits(hidden_states)
+            )
+        max_decode_steps_tensor = (
+            self._last_sample_max_decode_steps if max_decode_steps_tensor is None else max_decode_steps_tensor
+        )
+        min_decode_steps_tensor = (
+            self._last_sample_min_decode_steps if min_decode_steps_tensor is None else min_decode_steps_tensor
+        )
+        decode_steps = self._last_sample_decode_steps if decode_steps is None else decode_steps
+        stop_probs_tensor = self._last_sample_stop_probs if stop_probs_tensor is None else stop_probs_tensor
         if hidden_states is None or hidden_states.numel() == 0:
             self._pending_sample_stop_inputs = None
             return None
@@ -375,40 +325,30 @@ def compute_logits(
             raise RuntimeError(
                 f"Expected hidden_states rank-2 [B,H] in compute_logits, got {tuple(hidden_states.shape)}"
             )
-
         batch_size = hidden_states.shape[0]
         stop_prob_values = _resolve_stop_probs_batch(stop_probs_tensor, batch_size=batch_size)
         if stop_prob_values is None:
-            stop_hidden = hidden_states.to(dtype=self.fm_dtype)
-            stop_probs = self.stop_head(stop_hidden).softmax(dim=-1)[:, 1]
+            stop_probs = self.stop_head(hidden_states.to(dtype=self.fm_dtype)).softmax(dim=-1)[:, 1]
             if not torch.isfinite(stop_probs).all():
                 raise RuntimeError("Non-finite stop_probs in Ming compute_logits.")
             stop_prob_values = [float(stop_probs[i].item()) for i in range(batch_size)]
         steps = self._get_decode_steps(decode_steps, sampling_metadata, batch_size)
         max_decode_steps = _resolve_max_decode_steps_batch(
-            max_decode_steps_tensor,
-            batch_size=batch_size,
-            default_value=self.ming_config.max_decode_steps,
+            max_decode_steps_tensor, batch_size=batch_size, default_value=self.ming_config.max_decode_steps
         )
-        min_decode_steps = _resolve_min_decode_steps_batch(
-            min_decode_steps_tensor,
-            batch_size=batch_size,
-        )
-        min_stop_step = int(self.ming_config.stop_head_min_steps)
-
+        min_decode_steps = _resolve_min_decode_steps_batch(min_decode_steps_tensor, batch_size=batch_size)
         logits = torch.full(
             (batch_size, self.ming_config.llm_vocab_size),
             float("-inf"),
             device=hidden_states.device,
             dtype=torch.float32,
         )
-
         for i in range(batch_size):
             _, _, _, _, next_token_id = _resolve_ming_stop_decision(
                 step=steps[i],
                 stop_prob=stop_prob_values[i],
                 stop_threshold=float(self.ming_config.stop_head_threshold),
-                min_stop_step=min_stop_step,
+                min_stop_step=int(self.ming_config.stop_head_min_steps),
                 min_decode_steps=min_decode_steps[i],
                 max_decode_steps=max_decode_steps[i],
                 audio_dummy_token_id=int(self.ming_config.audio_dummy_token_id),
@@ -428,37 +368,28 @@ def sample(self, logits, sampling_metadata):
             return None
         if self._last_text_mode:
             return self.model.sample(logits, sampling_metadata)
-
         del sampling_metadata
         stop_inputs = self._pending_sample_stop_inputs
         self._pending_sample_stop_inputs = None
         if stop_inputs is None:
-            sampled = logits.argmax(dim=-1, keepdim=True)
             return SamplerOutput(
-                sampled_token_ids=sampled.to(dtype=torch.int32),
-                logprobs_tensors=None,
+                sampled_token_ids=logits.argmax(dim=-1, keepdim=True).to(dtype=torch.int32), logprobs_tensors=None
             )
-
-        steps = stop_inputs["steps"]
-        stop_probs = stop_inputs["stop_probs"]
-        max_decode_steps = stop_inputs["max_decode_steps"]
-        min_decode_steps = stop_inputs["min_decode_steps"]
         sampled_ids = []
         for i in range(logits.shape[0]):
             _, _, _, _, next_token_id = _resolve_ming_stop_decision(
-                step=int(steps[i]),
-                stop_prob=float(stop_probs[i]),
+                step=int(stop_inputs["steps"][i]),
+                stop_prob=float(stop_inputs["stop_probs"][i]),
                 stop_threshold=float(self.ming_config.stop_head_threshold),
                 min_stop_step=int(self.ming_config.stop_head_min_steps),
-                min_decode_steps=int(min_decode_steps[i]),
-                max_decode_steps=int(max_decode_steps[i]),
+                min_decode_steps=int(stop_inputs["min_decode_steps"][i]),
+                max_decode_steps=int(stop_inputs["max_decode_steps"][i]),
                 audio_dummy_token_id=int(self.ming_config.audio_dummy_token_id),
                 text_eos_token_id=int(self.ming_config.text_eos_token_id),
             )
             sampled_ids.append(next_token_id)
-        sampled = torch.tensor(sampled_ids, dtype=torch.int32, device=logits.device).reshape(-1, 1)
         return SamplerOutput(
-            sampled_token_ids=sampled,
+            sampled_token_ids=torch.tensor(sampled_ids, dtype=torch.int32, device=logits.device).reshape(-1, 1),
             logprobs_tensors=None,
         )
 
@@ -477,15 +408,15 @@ def _decode_one_step(
             raise RuntimeError(f"Expected latent_history rank-3 [B,T,D], got {tuple(latent_history.shape)}")
         if hidden_states.shape[0] != latent_history.shape[0]:
             raise RuntimeError(
-                f"Batch mismatch: hidden_states B={hidden_states.shape[0]} vs "
-                f"latent_history B={latent_history.shape[0]}"
+                "Batch mismatch: "
+                f"hidden_states B={hidden_states.shape[0]} "
+                f"vs latent_history B={latent_history.shape[0]}"
             )
-
-        # [B,H] -> [B,1,H] for FlowLoss conditioning.
+        # [Batch, Hidden] -> [Batch, Time, Hidden] = [B, 1, H] for FlowLoss conditioning.
         z_diff_cond = hidden_states.to(dtype=self.fm_dtype).unsqueeze(1)
         if not torch.isfinite(z_diff_cond).all():
             raise RuntimeError("Non-finite z_diff_cond before FlowLoss.sample().")
-        flow_out = self.flowloss.sample(
+        sampled_token_latent = self.flowloss.sample(
             z=z_diff_cond,
             latent_history=latent_history,
             cfg=cfg_scale,
@@ -493,27 +424,15 @@ def _decode_one_step(
             sigma=sigma,
             temperature=temperature,
         )
-        sampled_token_latent = flow_out[0] if isinstance(flow_out, tuple) else flow_out
-
-        expected_shape = (
-            hidden_states.shape[0],
-            self.ming_config.patch_size,
-            self.ming_config.latent_dim,
-        )
+        expected_shape = (hidden_states.shape[0], self.ming_config.patch_size, self.ming_config.latent_dim)
         if tuple(sampled_token_latent.shape) != expected_shape:
             raise RuntimeError(
                 f"FlowLoss output shape mismatch: got {tuple(sampled_token_latent.shape)}, expected {expected_shape}"
             )
-
-        # [B,32,64] -> shift left by one patch and append [B,4,64] => [B,32,64].
-        new_history = torch.cat(
-            [latent_history[:, self.ming_config.patch_size :, :], sampled_token_latent],
-            dim=1,
-        )
-        # Aggregator expects [B,T,D] = [B,4,64] and returns [B,1,H].
+        new_history = torch.cat([latent_history[:, self.ming_config.patch_size :, :], sampled_token_latent], dim=1)
+        # Aggregator expects [Batch, Time, Dimension] = [B, 4, 64] and returns [B, 1, H].
         next_embeds = self.linear_proj_audio(sampled_token_latent)
-        stop_hidden = hidden_states.to(dtype=self.fm_dtype)
-        stop_probs = self.stop_head(stop_hidden).softmax(dim=-1)[:, 1]
+        stop_probs = self.stop_head(hidden_states.to(dtype=self.fm_dtype)).softmax(dim=-1)[:, 1]
         if not torch.isfinite(sampled_token_latent).all():
             raise RuntimeError("Non-finite sampled_token_latent in Ming decode step.")
         if not torch.isfinite(next_embeds).all():
@@ -523,15 +442,11 @@ def _decode_one_step(
         return sampled_token_latent, next_embeds, new_history, stop_probs
 
     def _get_decode_steps(
-        self,
-        decode_steps: torch.Tensor | None,
-        sampling_metadata: SamplingMetadata,
-        batch_size: int,
+        self, decode_steps: torch.Tensor | None, sampling_metadata: SamplingMetadata, batch_size: int
     ) -> list[int]:
         if isinstance(decode_steps, torch.Tensor) and decode_steps.numel() > 0:
             flat_steps = decode_steps.reshape(-1)
             return [int(flat_steps[min(i, flat_steps.numel() - 1)].item()) for i in range(batch_size)]
-
         steps: list[int] = []
         output_token_ids = getattr(sampling_metadata, "output_token_ids", None)
         if isinstance(output_token_ids, list):
@@ -544,91 +459,73 @@ def _get_decode_steps(
                     raise RuntimeError(
                         f"Expected output_token_ids entries to be list/tuple/Tensor, got {type(token_ids)!r}"
                     )
-
         while len(steps) < batch_size:
             steps.append(0)
         return steps[:batch_size]
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        """Load checkpoint weights and return the set of parameter names that were populated."""
-        stacked_params_mapping = [
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        loaded_params: set[str] = set()
+        skipped: list[str] = []
+        mapping = [
             ("qkv_proj", "q_proj", "q"),
             ("qkv_proj", "k_proj", "k"),
             ("qkv_proj", "v_proj", "v"),
             ("gate_up_proj", "gate_proj", 0),
             ("gate_up_proj", "up_proj", 1),
         ]
-        params_dict = dict(self.named_parameters(remove_duplicate=False))
-        loaded_params: set[str] = set()
-        skipped: list[str] = []
-
         for ckpt_name, loaded_weight in weights:
             name = ckpt_name
-
             if self.quant_config is not None and (scale_name := self.quant_config.get_cache_scale(name)):
                 if scale_name not in params_dict:
                     skipped.append(ckpt_name)
                     continue
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                loaded_weight = loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
-                weight_loader(param, loaded_weight)
+                weight_loader(param, loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0])
                 loaded_params.add(scale_name)
                 continue
-
             mapped_name = None
-            for param_name, weight_name, shard_id in stacked_params_mapping:
+            for param_name, weight_name, shard_id in mapping:  # (fused param, checkpoint name, shard id)
                 if weight_name not in name:
                     continue
                 mapped_name = name.replace(weight_name, param_name)
                 if mapped_name.endswith(".bias") and mapped_name not in params_dict:
                     mapped_name = None
                     break
-                if is_pp_missing_parameter(mapped_name, self):
-                    mapped_name = None
-                    break
-                if mapped_name not in params_dict:
+                if is_pp_missing_parameter(mapped_name, self) or mapped_name not in params_dict:
                     mapped_name = None
                     continue
                 param = params_dict[mapped_name]
                 weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                if weight_loader == default_weight_loader:
-                    weight_loader(param, loaded_weight)
-                else:
-                    weight_loader(param, loaded_weight, shard_id)
+                shard_args = () if weight_loader == default_weight_loader else (shard_id,)
+                weight_loader(param, loaded_weight, *shard_args)  # shard id only for non-default loaders
                 loaded_params.add(mapped_name)
                 break
-
-            if mapped_name in loaded_params:
-                continue
-
-            if name.endswith(".bias") and name not in params_dict:
+            if mapped_name in loaded_params or (name.endswith(".bias") and name not in params_dict):
                 continue
-
             name = maybe_remap_kv_scale_name(name, params_dict)
             if name is None:
                 continue
+            if name.startswith("model.") and name not in params_dict and f"model.{name}" in params_dict:
+                name = f"model.{name}"  # retry under an extra "model." prefix
             if is_pp_missing_parameter(name, self):
                 continue
             if name not in params_dict:
                 skipped.append(ckpt_name)
                 continue
-
-            param = params_dict[name]
-            weight_loader = getattr(param, "weight_loader", default_weight_loader)
-            weight_loader(param, loaded_weight)
+            getattr(params_dict[name], "weight_loader", default_weight_loader)(params_dict[name], loaded_weight)
             loaded_params.add(name)
-
         _warn_missing_prefix("flowloss", params_dict, loaded_params, prefix="flowloss.", fatal=True)
         _warn_missing_prefix("linear_proj_audio", params_dict, loaded_params, prefix="linear_proj_audio.", fatal=True)
         _warn_missing_prefix("stop_head", params_dict, loaded_params, prefix="stop_head.", fatal=True)
         _warn_missing_prefix("spk_head", params_dict, loaded_params, prefix="spk_head.", fatal=True)
-
         if skipped:
             warnings.warn(
                 f"MingLLMModel: skipped {len(skipped)} checkpoint keys during load. First few: {skipped[:8]}",
                 stacklevel=2,
             )
-
         return loaded_params
 
 
@@ -637,9 +534,8 @@ def _extract_hidden_states(backbone_out: object) -> torch.Tensor:
         return backbone_out
     if hasattr(backbone_out, "last_hidden_state"):
         return backbone_out.last_hidden_state
-    if isinstance(backbone_out, (tuple, list)) and len(backbone_out) > 0:
-        if isinstance(backbone_out[0], torch.Tensor):
-            return backbone_out[0]
+    if isinstance(backbone_out, (tuple, list)) and backbone_out and isinstance(backbone_out[0], torch.Tensor):
+        return backbone_out[0]
     raise TypeError(f"Unsupported backbone forward output type: {type(backbone_out)}")
 
 
@@ -665,8 +561,7 @@ def _warn_missing_prefix(
     prefix: str,
     fatal: bool = False,
 ) -> None:
-    expected = {key for key in params_dict if key.startswith(prefix)}
-    missing = expected - loaded_params
+    missing = {key for key in params_dict if key.startswith(prefix)} - loaded_params
     if not missing:
         return
     msg = (
@@ -678,191 +573,18 @@ def _warn_missing_prefix(
     warnings.warn(msg, stacklevel=3)
 
 
-def _normalize_request_infos(model_intermediate_buffer: object) -> list[dict[str, Any]]:
-    if not isinstance(model_intermediate_buffer, list):
-        return []
-    infos: list[dict[str, Any]] = []
-    for item in model_intermediate_buffer:
-        infos.append(item if isinstance(item, dict) else {})
-    return infos
-
-
-def _get_request_token_counts(
-    hidden_states: torch.Tensor,
-    request_infos: list[dict[str, Any]],
-    seq_token_counts: list[int] | None,
-) -> list[int]:
-    if seq_token_counts:
-        return [int(x) for x in seq_token_counts]
-
-    if is_forward_context_available():
-        slices = getattr(get_forward_context(), "ubatch_slices", None)
-        if slices is not None and len(slices) > 0:
-            counts: list[int] = []
-            for item in slices:
-                if isinstance(item, int):
-                    counts.append(int(item))
-                elif hasattr(item, "stop") and hasattr(item, "start"):
-                    counts.append(int(item.stop) - int(item.start))
-            if counts:
-                return counts
-
-    if request_infos:
-        if len(request_infos) == hidden_states.shape[0]:
-            return [1] * hidden_states.shape[0]
-        return [hidden_states.shape[0]]
-
-    return []
-
-
-def _coerce_latent_history(
-    value: object,
-    *,
-    device: torch.device,
-    dtype: torch.dtype,
-    cfg: MingTTSConfig,
-) -> torch.Tensor | None:
-    if value is None:
-        return None
-    if not isinstance(value, torch.Tensor):
-        value = torch.as_tensor(value)
-
-    history = value.detach()
-    if history.ndim == 2:
-        history = history.unsqueeze(0)
-    if history.ndim != 3:
-        raise RuntimeError(f"Expected latent_history rank-3 [B,T,D], got {tuple(history.shape)}")
-    if history.shape[1] != cfg.history_patch_size or history.shape[2] != cfg.latent_dim:
-        raise RuntimeError(
-            f"latent_history shape mismatch: got {tuple(history.shape)}, "
-            f"expected [B,{cfg.history_patch_size},{cfg.latent_dim}]"
-        )
-    return history.to(device=device, dtype=dtype)
-
-
-def _resolve_runtime_float(req_info: dict[str, Any], key: str, default_value: float) -> float:
-    raw = req_info.get(key, default_value)
-    try:
-        value = float(raw)
-    except (TypeError, ValueError) as exc:
-        raise RuntimeError(f"Invalid {key}: expected float-like value, got {raw!r}") from exc
-    if not value >= 0.0:
-        raise RuntimeError(f"Invalid {key}: expected non-negative value, got {value}")
-    return value
-
-
-def _resolve_runtime_int(req_info: dict[str, Any], key: str, default_value: int) -> int:
-    raw = req_info.get(key, default_value)
-    try:
-        value = int(raw)
-    except (TypeError, ValueError) as exc:
-        raise RuntimeError(f"Invalid {key}: expected int-like value, got {raw!r}") from exc
-    if value <= 0:
-        raise RuntimeError(f"Invalid {key}: expected positive value, got {value}")
-    return value
-
-
-def _resolve_optional_runtime_int(req_info: dict[str, Any], key: str, default_value: int) -> int:
-    raw = req_info.get(key, default_value)
-    try:
-        value = int(raw)
-    except (TypeError, ValueError) as exc:
-        raise RuntimeError(f"Invalid {key}: expected int-like value, got {raw!r}") from exc
-    if value < 0:
-        raise RuntimeError(f"Invalid {key}: expected non-negative value, got {value}")
-    return value
-
-
-def _resolve_max_decode_steps_batch(
-    value: torch.Tensor | None,
-    *,
-    batch_size: int,
-    default_value: int,
-) -> list[int]:
-    if value is None:
-        return [int(default_value)] * batch_size
-    flat = value.reshape(-1).tolist()
-    if not flat:
-        return [int(default_value)] * batch_size
-    resolved = [int(item) for item in flat]
-    for item in resolved:
-        if item <= 0:
-            raise RuntimeError(f"Invalid ming_max_decode_steps in runtime batch: got {item}")
-    if len(resolved) < batch_size:
-        resolved.extend([resolved[-1]] * (batch_size - len(resolved)))
-    return resolved[:batch_size]
-
-
-def _resolve_min_decode_steps_batch(
-    value: torch.Tensor | None,
-    *,
-    batch_size: int,
-) -> list[int]:
-    if value is None:
-        return [0] * batch_size
-    flat = value.reshape(-1).tolist()
-    if not flat:
-        return [0] * batch_size
-    resolved = [max(0, int(item)) for item in flat]
-    if len(resolved) < batch_size:
-        resolved.extend([resolved[-1]] * (batch_size - len(resolved)))
-    return resolved[:batch_size]
-
-
-def _resolve_ming_stop_decision(
-    *,
-    step: int,
-    stop_prob: float,
-    stop_threshold: float,
-    min_stop_step: int,
-    min_decode_steps: int,
-    max_decode_steps: int,
-    audio_dummy_token_id: int,
-    text_eos_token_id: int,
-) -> tuple[str, bool, bool, int, int]:
-    min_required_decode_steps = max(min_stop_step + 1, min_decode_steps)
-    if max_decode_steps < min_required_decode_steps:
-        raise RuntimeError(
-            "Invalid Ming decode window: "
-            f"max_decode_steps={max_decode_steps} is smaller than "
-            f"min_required_decode_steps={min_required_decode_steps}"
-        )
-    should_force_stop = (step + 1) >= max_decode_steps
-    should_stop_head = ((step + 1) >= min_required_decode_steps) and stop_prob > stop_threshold
-
-    if should_force_stop:
-        return (
-            MING_STOP_REASON_MAX_DECODE_STEPS,
-            True,
-            True,
-            min_required_decode_steps,
-            text_eos_token_id,
-        )
-    if should_stop_head:
-        return (
-            MING_STOP_REASON_STOP_HEAD,
-            True,
-            False,
-            min_required_decode_steps,
-            text_eos_token_id,
-        )
-    return (
-        MING_STOP_REASON_CONTINUE,
-        False,
-        False,
-        min_required_decode_steps,
-        audio_dummy_token_id,
-    )
-
-
-def _resolve_stop_probs_batch(
-    value: torch.Tensor | None,
-    *,
-    batch_size: int,
-) -> list[float] | None:
-    if value is None:
-        return None
-    flat = value.reshape(-1)
-    if flat.numel() == 0:
-        return None
-    return [float(flat[min(i, flat.numel() - 1)].item()) for i in range(batch_size)]
+__all__ = [  # NOTE(review): lists underscore-prefixed helpers too — presumably deliberate re-exports; confirm
+    "Aggregator",
+    "FlowLoss",
+    "MING_STOP_REASON_CODES",
+    "MING_STOP_REASON_CONTINUE",
+    "MING_STOP_REASON_KEY",
+    "MING_STOP_REASON_MAX_DECODE_STEPS",
+    "MING_STOP_REASON_STOP_HEAD",
+    "MingLLMModel",
+    "_coerce_latent_history",
+    "_extract_hidden_states",
+    "_resolve_ming_runtime_dtype",
+    "_resolve_ming_stop_decision",
+    "_warn_missing_prefix",
+]
diff --git a/vllm_omni/model_executor/models/ming_tts/patch_emission.py b/vllm_omni/model_executor/models/ming_tts/patch_emission.py
new file mode 100644
index 00000000000..938e789b67a
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_tts/patch_emission.py
@@ -0,0 +1,210 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
+from typing import Any
+
+import torch
+from vllm.forward_context import get_forward_context, is_forward_context_available
+
+from .config_ming_tts import MingTTSConfig
+
+MING_STOP_REASON_CONTINUE = "continue"  # no stop condition met on this step
+MING_STOP_REASON_STOP_HEAD = "stop_head"  # stop probability exceeded the threshold
+MING_STOP_REASON_MAX_DECODE_STEPS = "max_decode_steps"  # hard step cap reached
+MING_STOP_REASON_KEY = "ming_stop_reason"  # presumably the key a reason is stored under; confirm
+MING_STOP_REASON_CODES = {  # integer code for each stop reason string
+    MING_STOP_REASON_CONTINUE: 0,
+    MING_STOP_REASON_STOP_HEAD: 1,
+    MING_STOP_REASON_MAX_DECODE_STEPS: 2,
+}
+
+
+def _normalize_request_infos(model_intermediate_buffer: object) -> list[dict[str, Any]]:
+    """Return one info dict per buffered entry; a non-list buffer yields ``[]``."""
+    if isinstance(model_intermediate_buffer, list):
+        # Non-dict entries become a fresh empty dict so list positions stay aligned.
+        entries = model_intermediate_buffer
+        return [entry if isinstance(entry, dict) else {} for entry in entries]
+    return []
+
+
+def _get_request_token_counts(
+    hidden_states: torch.Tensor,
+    request_infos: list[dict[str, Any]],
+    seq_token_counts: list[int] | None,
+) -> list[int]:  # per-request token counts, taken from the first source that yields any
+    if seq_token_counts:
+        return [int(x) for x in seq_token_counts]
+    # No explicit counts: try the ``ubatch_slices`` attribute of the active forward context.
+    if is_forward_context_available():
+        slices = getattr(get_forward_context(), "ubatch_slices", None)
+        if slices is not None and len(slices) > 0:
+            counts: list[int] = []
+            for item in slices:  # entries that are neither ints nor start/stop objects are skipped
+                if isinstance(item, int):
+                    counts.append(int(item))
+                elif hasattr(item, "stop") and hasattr(item, "start"):
+                    counts.append(int(item.stop) - int(item.start))
+            if counts:
+                return counts
+    # Shape fallback: one token per request if rows match the infos, else a single count of all rows.
+    if request_infos:
+        if len(request_infos) == hidden_states.shape[0]:
+            return [1] * hidden_states.shape[0]
+        return [hidden_states.shape[0]]
+    # Nothing to go on: no counts, no usable slices, no request infos.
+    return []
+
+
+def _coerce_latent_history(
+    value: object,
+    *,
+    device: torch.device,
+    dtype: torch.dtype,
+    cfg: MingTTSConfig,
+) -> torch.Tensor | None:
+    """Return latent history as a detached rank-3 tensor on ``device``/``dtype``; ``None`` passes through."""
+    if value is None:
+        return None
+    tensor = value if isinstance(value, torch.Tensor) else torch.as_tensor(value)
+    tensor = tensor.detach()
+    if tensor.ndim == 2:  # a single unbatched history gains a leading axis of size 1
+        tensor = tensor.unsqueeze(0)
+    if tensor.ndim != 3:
+        raise RuntimeError(f"Expected latent_history rank-3 [B,T,D], got {tuple(tensor.shape)}")
+    expected_tail = (cfg.history_patch_size, cfg.latent_dim)
+    if tuple(tensor.shape[1:]) != expected_tail:
+        raise RuntimeError(
+            f"latent_history shape mismatch: got {tuple(tensor.shape)}, "
+            f"expected [B,{cfg.history_patch_size},{cfg.latent_dim}]"
+        )
+    return tensor.to(device=device, dtype=dtype)
+
+
+def _resolve_runtime_float(req_info: dict[str, Any], key: str, default_value: float) -> float:
+    raw = req_info.get(key, default_value)  # per-request override, else the caller's default
+    try:
+        value = float(raw)
+    except (TypeError, ValueError, OverflowError) as exc:  # OverflowError: float() of a huge int
+        raise RuntimeError(f"Invalid {key}: expected float-like value, got {raw!r}") from exc
+    if not value >= 0.0:  # written as a negated >= so that NaN is rejected too
+        raise RuntimeError(f"Invalid {key}: expected non-negative value, got {value}")
+    return value
+
+
+def _resolve_runtime_int(req_info: dict[str, Any], key: str, default_value: int) -> int:
+    raw = req_info.get(key, default_value)  # per-request override, else the caller's default
+    try:
+        value = int(raw)
+    except (TypeError, ValueError, OverflowError) as exc:  # OverflowError: int(float("inf"))
+        raise RuntimeError(f"Invalid {key}: expected int-like value, got {raw!r}") from exc
+    if value <= 0:  # strictly positive only; zero is rejected here
+        raise RuntimeError(f"Invalid {key}: expected positive value, got {value}")
+    return value
+
+
+def _resolve_optional_runtime_int(req_info: dict[str, Any], key: str, default_value: int) -> int:
+    raw = req_info.get(key, default_value)  # per-request override, else the caller's default
+    try:
+        value = int(raw)
+    except (TypeError, ValueError, OverflowError) as exc:  # OverflowError: int(float("inf"))
+        raise RuntimeError(f"Invalid {key}: expected int-like value, got {raw!r}") from exc
+    if value < 0:  # zero is allowed here, unlike in _resolve_runtime_int
+        raise RuntimeError(f"Invalid {key}: expected non-negative value, got {value}")
+    return value
+
+
+def _resolve_max_decode_steps_batch(
+    value: torch.Tensor | None,
+    *,
+    batch_size: int,
+    default_value: int,
+) -> list[int]:
+    """Return ``batch_size`` positive step caps, padding a short tensor with its last entry."""
+    raw_items = [] if value is None else value.reshape(-1).tolist()
+    if not raw_items:  # missing or empty tensor: every row gets the default cap
+        return [int(default_value)] * batch_size
+    caps = [int(entry) for entry in raw_items]
+    invalid = next((cap for cap in caps if cap <= 0), None)
+    if invalid is not None:  # every entry is checked, including ones past batch_size
+        raise RuntimeError(f"Invalid ming_max_decode_steps in runtime batch: got {invalid}")
+    missing = batch_size - len(caps)
+    if missing > 0:
+        caps += [caps[-1]] * missing
+    return caps[:batch_size]
+
+
+def _resolve_min_decode_steps_batch(
+    value: torch.Tensor | None,
+    *,
+    batch_size: int,
+) -> list[int]:
+    """Return ``batch_size`` step floors (negatives clamp to 0); short input repeats its last entry."""
+    raw_items = [] if value is None else value.reshape(-1).tolist()
+    if not raw_items:  # missing or empty tensor: no minimum for any row
+        return [0] * batch_size
+    floors = [max(0, int(entry)) for entry in raw_items]
+    missing = batch_size - len(floors)
+    if missing > 0:
+        floors += [floors[-1]] * missing
+    return floors[:batch_size]
+
+
+def _resolve_ming_stop_decision(
+    *,
+    step: int,
+    stop_prob: float,
+    stop_threshold: float,
+    min_stop_step: int,
+    min_decode_steps: int,
+    max_decode_steps: int,
+    audio_dummy_token_id: int,
+    text_eos_token_id: int,
+) -> tuple[str, bool, bool, int, int]:
+    """Decide whether decoding stops, treating ``step + 1`` as the number of steps taken.
+
+    The step cap is checked first, so it wins when both stop conditions hold.
+    The stop head only counts once ``step + 1`` has reached
+    ``max(min_stop_step + 1, min_decode_steps)``.
+
+    Returns:
+        A 5-tuple: the stop reason string, a flag that is true for both stop
+        reasons, a flag that is true only for the step cap, the required
+        minimum step count, and a token id (``text_eos_token_id`` on a stop,
+        ``audio_dummy_token_id`` otherwise).
+
+    Raises:
+        RuntimeError: If ``max_decode_steps`` is below the required minimum.
+    """
+    required = max(min_stop_step + 1, min_decode_steps)
+    if max_decode_steps < required:
+        raise RuntimeError(
+            "Invalid Ming decode window: "
+            f"max_decode_steps={max_decode_steps} is smaller than "
+            f"min_required_decode_steps={required}"
+        )
+    steps_taken = step + 1
+    head_fired = steps_taken >= required and stop_prob > stop_threshold
+    if steps_taken >= max_decode_steps:  # the cap is tested before the stop head
+        reason, stop, forced = MING_STOP_REASON_MAX_DECODE_STEPS, True, True
+    elif head_fired:
+        reason, stop, forced = MING_STOP_REASON_STOP_HEAD, True, False
+    else:
+        reason, stop, forced = MING_STOP_REASON_CONTINUE, False, False
+    # Stop outcomes carry text_eos_token_id; continuing carries audio_dummy_token_id.
+    token_id = text_eos_token_id if stop else audio_dummy_token_id
+    return reason, stop, forced, required, token_id
+
+
+def _resolve_stop_probs_batch(
+    value: torch.Tensor | None,
+    *,
+    batch_size: int,
+) -> list[float] | None:  # one stop probability per row, or None when there is nothing to read
+    if value is None:
+        return None
+    probs = value.reshape(-1).tolist()  # single host transfer instead of one ``.item()`` per row
+    if not probs:
+        return None
+    return [float(probs[min(i, len(probs) - 1)]) for i in range(batch_size)]  # pads with the last value
diff --git a/vllm_omni/model_executor/models/ming_tts/prompt_builder.py b/vllm_omni/model_executor/models/ming_tts/prompt_builder.py
deleted file mode 100644
index ae00cf5ae28..00000000000
--- a/vllm_omni/model_executor/models/ming_tts/prompt_builder.py
+++ /dev/null
@@ -1,429 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
-import copy
-import json
-import math
-import re
-from typing import Any
-
-import torch
-
-from .config_ming_tts import (
-    AUDIO_FRAME_HOP,
-    KEY_CFG,
-    KEY_MAX_DECODE_STEPS,
-    KEY_MIN_DECODE_STEPS,
-    KEY_PROMPT_LATENTS,
-    KEY_REQUEST_ID,
-    KEY_SIGMA,
-    KEY_SPEAKER_EMBEDDING,
-    KEY_TEMPERATURE,
-    LATENT_DIM,
-    PATCH_SIZE,
-    SAMPLE_RATE,
-    VAE_PATCH_SIZE,
-)
-
-BASE_CAPTION_TEMPLATE = {
-    "audio_sequence": [
-        {
-            "序号": 1,
-            "说话人": "speaker_1",
-            "方言": None,
-            "风格": None,
-            "语速": None,
-            "基频": None,
-            "音量": None,
-            "情感": None,
-            "BGM": {
-                "Genre": None,
-                "Mood": None,
-                "Instrument": None,
-                "Theme": None,
-                "ENV": None,
-                "SNR": None,
-            },
-            "IP": None,
-        }
-    ]
-}
-
-_DURATION_SECONDS_RE = re.compile(r"Duration:\s*([0-9]+(?:\.[0-9]+)?)\s*s\b", re.IGNORECASE)
-
-
-def create_instruction(value: Any) -> str | None:
-    if value is None:
-        return None
-    if isinstance(value, str):
-        return value
-    if not isinstance(value, dict):
-        raise ValueError(f"Ming instruction must be str or dict, got {type(value).__name__}")
-
-    caption = copy.deepcopy(BASE_CAPTION_TEMPLATE)
-    target = caption["audio_sequence"][0]
-    for key, item in value.items():
-        if key in target:
-            target[key] = item
-
-    if target["BGM"].get("SNR") is not None:
-        order = ["序号", "说话人", "BGM", "情感", "方言", "风格", "语速", "基频", "音量", "IP"]
-        caption["audio_sequence"][0] = {key: target[key] for key in order if key in target}
-    return json.dumps(caption, ensure_ascii=False)
-
-
-def parse_duration_seconds(text: str | None) -> float | None:
-    if not isinstance(text, str):
-        return None
-    match = _DURATION_SECONDS_RE.search(text)
-    if match is None:
-        return None
-    try:
-        value = float(match.group(1))
-    except ValueError:
-        return None
-    if value <= 0.0:
-        return None
-    return value
-
-
-def estimate_decode_steps_for_duration(
-    duration_seconds: float,
-    *,
-    sample_rate: int = SAMPLE_RATE,
-    frame_hop: int = AUDIO_FRAME_HOP,
-    patch_size: int = PATCH_SIZE,
-    vae_patch_size: int = VAE_PATCH_SIZE,
-) -> int:
-    if duration_seconds <= 0.0:
-        return 0
-    samples_per_decode_step = int(frame_hop) * int(patch_size) * int(vae_patch_size)
-    required_samples = float(duration_seconds) * float(sample_rate)
-    return max(1, int(math.ceil(required_samples / float(samples_per_decode_step))))
-
-
-def estimate_decode_step_window_for_duration(duration_seconds: float) -> tuple[int, int]:
-    target_steps = estimate_decode_steps_for_duration(duration_seconds)
-    # Ming emits about 0.32s per decode step in the current dense path. Keep a narrow
-    # duration window so BGM does not undershoot badly or run all the way to the generic cap.
-    min_steps = max(1, target_steps - 3)
-    max_steps = max(min_steps, target_steps + 3)
-    return min_steps, max_steps
-
-
-def resolve_effective_runtime_controls(
-    *,
-    text: str,
-    runtime_controls: dict[str, Any] | None = None,
-) -> dict[str, Any]:
-    controls = {} if runtime_controls is None else dict(runtime_controls)
-    has_explicit_min = KEY_MIN_DECODE_STEPS in controls and controls[KEY_MIN_DECODE_STEPS] is not None
-    has_explicit_max = KEY_MAX_DECODE_STEPS in controls and controls[KEY_MAX_DECODE_STEPS] is not None
-    if has_explicit_min or has_explicit_max:
-        return controls
-
-    duration_seconds = parse_duration_seconds(text)
-    if duration_seconds is None:
-        return controls
-
-    min_decode_steps, max_decode_steps = estimate_decode_step_window_for_duration(duration_seconds)
-    controls[KEY_MIN_DECODE_STEPS] = min_decode_steps
-    controls[KEY_MAX_DECODE_STEPS] = max_decode_steps
-    return controls
-
-
-def pad_prompt_waveform(
-    waveform: Any,
-    *,
-    patch_size: int = PATCH_SIZE,
-    sample_rate: int = SAMPLE_RATE,
-    frame_hop: int = AUDIO_FRAME_HOP,
-) -> torch.Tensor:
-    tensor = coerce_prompt_waveform(waveform)
-    del frame_hop
-    # Match upstream Ming exactly: tokenizer framerate is 12.5 Hz, so prompt
-    # waveform padding aligns to sample_rate / 12.5 * patch_size samples.
-    pad_align = int((float(sample_rate) / 12.5) * int(patch_size))
-    new_len = ((int(tensor.shape[-1]) + pad_align - 1) // pad_align) * pad_align
-    if new_len == int(tensor.shape[-1]):
-        return tensor
-    padded = torch.zeros((1, new_len), dtype=tensor.dtype, device=tensor.device)
-    padded[:, : tensor.shape[-1]] = tensor
-    return padded
-
-
-def coerce_prompt_waveform(value: Any) -> torch.Tensor:
-    if value is None:
-        raise ValueError("prompt waveform cannot be None")
-    if isinstance(value, torch.Tensor):
-        tensor = value.detach()
-        if tensor.ndim == 1:
-            return tensor.unsqueeze(0).to(torch.float32)
-        if tensor.ndim == 2:
-            if tensor.shape[0] != 1:
-                return tensor.reshape(1, -1).to(torch.float32)
-            return tensor.to(torch.float32)
-        raise ValueError(f"Unsupported Ming prompt waveform rank: {tuple(tensor.shape)}")
-
-    if isinstance(value, (list, tuple)):
-        parts = []
-        for item in value:
-            if item is None:
-                continue
-            parts.append(coerce_prompt_waveform(item))
-        if not parts:
-            raise ValueError("prompt waveform list was empty")
-        return torch.cat(parts, dim=-1)
-
-    return coerce_prompt_waveform(torch.as_tensor(value))
-
-
-def coerce_speaker_embeddings(value: Any, *, use_zero_spk_emb: bool = False) -> list[torch.Tensor] | None:
-    if value is None:
-        if use_zero_spk_emb:
-            return [torch.zeros((192,), dtype=torch.float32)]
-        return None
-
-    if isinstance(value, torch.Tensor):
-        tensor = value.detach()
-        if tensor.ndim == 1:
-            tensor = tensor.unsqueeze(0)
-        if tensor.ndim != 2:
-            raise ValueError(f"Unsupported Ming speaker embedding shape: {tuple(tensor.shape)}")
-        items = [row.reshape(-1).to(torch.float32).cpu() for row in tensor]
-    elif isinstance(value, (list, tuple)):
-        if value and all(not isinstance(item, (list, tuple, torch.Tensor)) for item in value):
-            items = [torch.as_tensor(value).detach().reshape(-1).to(torch.float32).cpu()]
-        else:
-            items = []
-            for item in value:
-                if item is None:
-                    continue
-                if not isinstance(item, torch.Tensor):
-                    item = torch.as_tensor(item)
-                flat = item.detach().reshape(-1).to(torch.float32).cpu()
-                items.append(flat)
-    else:
-        return coerce_speaker_embeddings(torch.as_tensor(value), use_zero_spk_emb=use_zero_spk_emb)
-
-    if not items:
-        return [torch.zeros((192,), dtype=torch.float32)] if use_zero_spk_emb else None
-    for item in items:
-        if int(item.numel()) != 192:
-            raise ValueError(f"Ming speaker embedding must have 192 dims, got {int(item.numel())}")
-    return items
-
-
-def count_prompt_latent_patches(
-    value: Any,
-    *,
-    patch_size: int = PATCH_SIZE,
-    latent_dim: int = LATENT_DIM,
-) -> int:
-    if value is None:
-        return 0
-    if not isinstance(value, torch.Tensor):
-        value = torch.as_tensor(value)
-
-    latents = value.detach()
-    if latents.ndim == 3 and latents.shape[0] == 1:
-        latents = latents.squeeze(0)
-
-    if latents.ndim == 3 and latents.shape[-2:] == (patch_size, latent_dim):
-        return int(latents.shape[0])
-
-    if latents.ndim != 2 or latents.shape[-1] != latent_dim:
-        raise ValueError(f"Unsupported Ming prompt_latents shape: {tuple(latents.shape)}")
-    if latents.shape[0] % patch_size != 0:
-        raise ValueError(
-            f"Ming prompt_latents frame count must be divisible by patch_size={patch_size}, "
-            f"got frames={int(latents.shape[0])}"
-        )
-    return int(latents.shape[0] // patch_size)
-
-
-def count_prompt_waveform_patches(
-    value: Any,
-    *,
-    patch_size: int = PATCH_SIZE,
-    frame_hop: int = AUDIO_FRAME_HOP,
-    vae_patch_size: int = VAE_PATCH_SIZE,
-) -> int:
-    if value is None:
-        return 0
-    waveform = pad_prompt_waveform(value, patch_size=patch_size, frame_hop=frame_hop)
-    frame_count = int(math.ceil(float(waveform.shape[-1]) / float(frame_hop)))
-    latent_frames = int(math.ceil(float(frame_count) / float(vae_patch_size)))
-    if latent_frames % int(patch_size) != 0:
-        raise ValueError(
-            f"Ming prompt waveform produced latent frame count not divisible by patch_size={patch_size}: "
-            f"frames={latent_frames}"
-        )
-    return int(latent_frames // int(patch_size))
-
-
-def build_dense_prompt_token_ids(
-    tokenizer: Any,
-    *,
-    prompt: str,
-    text: str,
-    instruction: str | None = None,
-    prompt_text: str | None = None,
-    speaker_count: int = 0,
-    prompt_patch_count: int = 0,
-) -> list[int]:
-    speaker_prompt = []
-    for idx in range(int(speaker_count)):
-        speaker_prompt.extend(
-            tokenizer.encode(f"  speaker_{idx + 1}:")
-            + tokenizer.encode("<|vision_start|>")
-            + tokenizer.encode("<|vision_pad|>")
-            + tokenizer.encode("<|vision_end|>\n")
-        )
-
-    instruction_prompt = []
-    if instruction is not None:
-        instruction_prompt = tokenizer.encode(instruction) + tokenizer.encode("<|endoftext|>")
-
-    prompt_text_tokens = []
-    prompt_latent_tokens = []
-    if int(prompt_patch_count) > 0:
-        if prompt_text is not None:
-            prompt_text_tokens = tokenizer.encode(prompt_text)
-        prompt_latent_tokens = [tokenizer.convert_tokens_to_ids("<audioPatch>")] * int(prompt_patch_count)
-
-    text_input_prefix = tokenizer.encode(" Text input:\n")
-    if "Genre: " in text and "Mood: " in text and "Instrument: " in text and "Theme: " in text and "Duration: " in text:
-        text_input_prefix = []
-
-    return (
-        tokenizer.encode("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n")
-        + tokenizer.encode("<|im_start|>user\n")
-        + tokenizer.encode(prompt)
-        + speaker_prompt
-        + text_input_prefix
-        + prompt_text_tokens
-        + tokenizer.encode(text)
-        + tokenizer.encode("<|im_end|>\n")
-        + tokenizer.encode("<|im_start|>assistant\n")
-        + instruction_prompt
-        + tokenizer.encode("<audio>")
-        + prompt_latent_tokens
-    )
-
-
-def build_ming_dense_prompt(
-    tokenizer: Any,
-    *,
-    prompt: str,
-    text: str,
-    runtime_controls: dict[str, Any] | None = None,
-    instruction: Any = None,
-    prompt_text: str | None = None,
-    prompt_waveform: Any = None,
-    prompt_latents: Any = None,
-    speaker_embedding: Any = None,
-    use_zero_spk_emb: bool = False,
-    request_id: str | None = None,
-) -> dict[str, Any]:
-    instruction_text = create_instruction(instruction)
-    speaker_embeddings = coerce_speaker_embeddings(speaker_embedding, use_zero_spk_emb=use_zero_spk_emb)
-    effective_runtime_controls = resolve_effective_runtime_controls(
-        text=text,
-        runtime_controls=runtime_controls,
-    )
-
-    prompt_waveform_tensor = None
-    prompt_patch_count = 0
-    if prompt_waveform is not None:
-        prompt_waveform_tensor = pad_prompt_waveform(prompt_waveform)
-        prompt_patch_count = count_prompt_waveform_patches(prompt_waveform_tensor)
-    if prompt_waveform_tensor is not None and prompt_latents is not None:
-        raise ValueError(
-            "Ming waveform cloning request provided both raw prompt_waveform and explicit prompt_latents. "
-            "Choose exactly one source of truth."
-        )
-
-    prompt_latent_value = None
-    if prompt_waveform_tensor is not None and prompt_text is None:
-        raise ValueError(
-            "Ming prompt_waveform requires prompt_text for prompt-latent conditioning. "
-            "Use speaker_embedding for reference-audio-only speaker conditioning."
-        )
-    if prompt_latents is not None:
-        prompt_latent_value = torch.as_tensor(prompt_latents)
-        prompt_patch_count = count_prompt_latent_patches(
-            prompt_latent_value,
-            patch_size=PATCH_SIZE,
-            latent_dim=LATENT_DIM,
-        )
-
-    prompt_token_ids = build_dense_prompt_token_ids(
-        tokenizer,
-        prompt=prompt,
-        text=text,
-        instruction=instruction_text,
-        prompt_text=prompt_text if prompt_patch_count > 0 else None,
-        speaker_count=0 if speaker_embeddings is None else len(speaker_embeddings),
-        prompt_patch_count=prompt_patch_count,
-    )
-
-    additional_information = {}
-    if effective_runtime_controls:
-        for key, value in effective_runtime_controls.items():
-            if isinstance(value, torch.Tensor):
-                additional_information[key] = value
-            elif key in (KEY_MIN_DECODE_STEPS, KEY_MAX_DECODE_STEPS):
-                additional_information[key] = torch.tensor(int(value), dtype=torch.int32)
-            else:
-                additional_information[key] = torch.tensor(float(value), dtype=torch.float32)
-    if request_id is not None:
-        additional_information[KEY_REQUEST_ID] = request_id
-    if instruction_text is not None:
-        additional_information["instruction"] = instruction_text
-    if prompt_text is not None:
-        additional_information["prompt_text"] = prompt_text
-    if prompt_waveform_tensor is not None:
-        additional_information["prompt_waveform"] = prompt_waveform_tensor
-        additional_information["prompt_waveform_length"] = torch.tensor(
-            [int(prompt_waveform_tensor.shape[-1])],
-            dtype=torch.int32,
-        )
-    if prompt_latent_value is not None:
-        additional_information[KEY_PROMPT_LATENTS] = prompt_latent_value
-    if speaker_embeddings is not None:
-        additional_information[KEY_SPEAKER_EMBEDDING] = (
-            speaker_embeddings[0] if len(speaker_embeddings) == 1 else torch.stack(speaker_embeddings, dim=0)
-        )
-    if use_zero_spk_emb:
-        additional_information["use_zero_spk_emb"] = True
-
-    return {
-        "prompt": prompt,
-        "text": text,
-        "prompt_token_ids": prompt_token_ids,
-        "additional_information": additional_information,
-    }
-
-
-def build_runtime_controls(
-    *,
-    cfg: float | None = None,
-    sigma: float | None = None,
-    temperature: float | None = None,
-    min_decode_steps: int | None = None,
-    max_decode_steps: int | None = None,
-) -> dict[str, torch.Tensor]:
-    controls = {}
-    if cfg is not None:
-        controls[KEY_CFG] = torch.tensor(float(cfg), dtype=torch.float32)
-    if sigma is not None:
-        controls[KEY_SIGMA] = torch.tensor(float(sigma), dtype=torch.float32)
-    if temperature is not None:
-        controls[KEY_TEMPERATURE] = torch.tensor(float(temperature), dtype=torch.float32)
-    if min_decode_steps is not None:
-        controls[KEY_MIN_DECODE_STEPS] = torch.tensor(int(min_decode_steps), dtype=torch.int32)
-    if max_decode_steps is not None:
-        controls[KEY_MAX_DECODE_STEPS] = torch.tensor(int(max_decode_steps), dtype=torch.int32)
-    return controls
diff --git a/vllm_omni/model_executor/models/ming_tts/prompt_builder/__init__.py b/vllm_omni/model_executor/models/ming_tts/prompt_builder/__init__.py
new file mode 100644
index 00000000000..dcabecc838b
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_tts/prompt_builder/__init__.py
@@ -0,0 +1,37 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
+from ._base import (
+    coerce_prompt_waveform,
+    coerce_speaker_embeddings,
+    count_prompt_latent_patches,
+    count_prompt_waveform_patches,
+    create_instruction,
+    estimate_decode_step_window_for_duration,
+    estimate_decode_steps_for_duration,
+    pad_prompt_waveform,
+    parse_duration_seconds,
+)
+from .builders import (
+    build_dense_prompt_token_ids,
+    build_ming_dense_prompt,
+    build_runtime_controls,
+    resolve_effective_runtime_controls,
+)
+
+__all__ = [
+    "build_dense_prompt_token_ids",
+    "build_ming_dense_prompt",
+    "build_runtime_controls",
+    "coerce_prompt_waveform",
+    "coerce_speaker_embeddings",
+    "count_prompt_latent_patches",
+    "count_prompt_waveform_patches",
+    "create_instruction",
+    "estimate_decode_step_window_for_duration",
+    "estimate_decode_steps_for_duration",
+    "pad_prompt_waveform",
+    "parse_duration_seconds",
+    "resolve_effective_runtime_controls",
+]
diff --git a/vllm_omni/model_executor/models/ming_tts/prompt_builder/_base.py b/vllm_omni/model_executor/models/ming_tts/prompt_builder/_base.py
new file mode 100644
index 00000000000..8627cbb4c33
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_tts/prompt_builder/_base.py
@@ -0,0 +1,210 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
+import copy
+import json
+import math
+import re
+from typing import Any
+
+import torch
+
+from ..config_ming_tts import AUDIO_FRAME_HOP, LATENT_DIM, PATCH_SIZE, SAMPLE_RATE, VAE_PATCH_SIZE
+
+BASE_CAPTION_TEMPLATE = {  # caption skeleton; create_instruction() overlays caller values on a deep copy
+    "audio_sequence": [
+        {
+            "序号": 1,  # sequence number
+            "说话人": "speaker_1",  # speaker
+            "方言": None,  # dialect
+            "风格": None,  # style
+            "语速": None,  # speaking rate
+            "基频": None,  # fundamental frequency (pitch)
+            "音量": None,  # volume
+            "情感": None,  # emotion
+            "BGM": {
+                "Genre": None,
+                "Mood": None,
+                "Instrument": None,
+                "Theme": None,
+                "ENV": None,
+                "SNR": None,
+            },
+            "IP": None,
+        }
+    ]
+}
+
+_DURATION_SECONDS_RE = re.compile(r"Duration:\s*([0-9]+(?:\.[0-9]+)?)\s*s\b", re.IGNORECASE)
+
+
+def create_instruction(value: Any) -> str | None:  # str passes through; a dict becomes a JSON caption
+    if value is None:
+        return None
+    if isinstance(value, str):
+        return value
+    if not isinstance(value, dict):
+        raise ValueError(f"Ming instruction must be str or dict, got {type(value).__name__}")
+    # Dict form: overlay recognised keys on a private copy of the caption template.
+    caption = copy.deepcopy(BASE_CAPTION_TEMPLATE)
+    target = caption["audio_sequence"][0]
+    for key, item in value.items():
+        if key in target:  # keys absent from the template are ignored
+            target[key] = item
+    bgm = target["BGM"]  # the caller may have replaced this with None or another non-dict value
+    if isinstance(bgm, dict) and bgm.get("SNR") is not None:  # with an SNR set, BGM moves up front
+        order = ["序号", "说话人", "BGM", "情感", "方言", "风格", "语速", "基频", "音量", "IP"]
+        caption["audio_sequence"][0] = {key: target[key] for key in order if key in target}
+    return json.dumps(caption, ensure_ascii=False)
+
+
+def parse_duration_seconds(text: str | None) -> float | None:
+    """Extract the positive ``Duration: <n>s`` value from ``text``, or ``None`` when absent."""
+    found = _DURATION_SECONDS_RE.search(text) if isinstance(text, str) else None
+    if found is None:
+        return None
+    try:
+        seconds = float(found.group(1))
+    except ValueError:
+        return None
+    # A zero duration is treated the same as no duration.
+    if seconds > 0.0:
+        return seconds
+    return None
+
+
+def estimate_decode_steps_for_duration(
+    duration_seconds: float,
+    *,
+    sample_rate: int = SAMPLE_RATE,
+    frame_hop: int = AUDIO_FRAME_HOP,
+    patch_size: int = PATCH_SIZE,
+    vae_patch_size: int = VAE_PATCH_SIZE,
+) -> int:
+    if duration_seconds <= 0.0:  # a non-positive duration needs no decode steps
+        return 0
+    per_step = float(int(frame_hop) * int(patch_size) * int(vae_patch_size))  # samples per decode step
+    steps = math.ceil(float(duration_seconds) * float(sample_rate) / per_step)
+    return max(1, int(steps))
+
+
+def estimate_decode_step_window_for_duration(duration_seconds: float) -> tuple[int, int]:
+    center = estimate_decode_steps_for_duration(duration_seconds)  # step estimate for the duration
+    lower = max(1, center - 3)  # up to three steps under the estimate, never below one
+    upper = max(lower, center + 3)  # up to three steps over the estimate
+    return lower, upper
+
+
+def pad_prompt_waveform(
+    waveform: Any,
+    *,
+    patch_size: int = PATCH_SIZE,
+    sample_rate: int = SAMPLE_RATE,
+    frame_hop: int = AUDIO_FRAME_HOP,
+) -> torch.Tensor:
+    wave = coerce_prompt_waveform(waveform)
+    del frame_hop  # kept in the signature only; the alignment below does not use it
+    # Align to whole patches: sample_rate / 12.5 samples per frame (12.5 Hz frame rate) times patch_size.
+    align = int((float(sample_rate) / 12.5) * int(patch_size))
+    length = int(wave.shape[-1])
+    padded_len = ((length + align - 1) // align) * align
+    if padded_len == length:
+        return wave
+    return torch.cat([wave, wave.new_zeros((1, padded_len - length))], dim=-1)
+
+
+def coerce_prompt_waveform(value: Any) -> torch.Tensor:
+    """Coerce a tensor, array-like, or list/tuple of segments into a float32 ``[1, N]`` waveform."""
+    if value is None:
+        raise ValueError("prompt waveform cannot be None")
+    if isinstance(value, (list, tuple)):
+        segments = [coerce_prompt_waveform(entry) for entry in value if entry is not None]
+        if segments:
+            return torch.cat(segments, dim=-1)
+        raise ValueError("prompt waveform list was empty")
+    if not isinstance(value, torch.Tensor):
+        return coerce_prompt_waveform(torch.as_tensor(value))
+    wave = value.detach()
+    if wave.ndim == 1:
+        return wave.unsqueeze(0).to(torch.float32)
+    if wave.ndim != 2:
+        raise ValueError(f"Unsupported Ming prompt waveform rank: {tuple(wave.shape)}")
+    flat = wave if wave.shape[0] == 1 else wave.reshape(1, -1)  # extra rows are laid end to end
+    return flat.to(torch.float32)
+
+
+def coerce_speaker_embeddings(value: Any, *, use_zero_spk_emb: bool = False) -> list[torch.Tensor] | None:
+    """Return speaker embeddings as flat float32 CPU tensors of 192 values each.
+
+    Without usable input: ``None``, or one all-zero embedding if ``use_zero_spk_emb``.
+    """
+    fallback = [torch.zeros((192,), dtype=torch.float32)] if use_zero_spk_emb else None
+    if value is None:
+        return fallback
+    if not isinstance(value, (torch.Tensor, list, tuple)):
+        return coerce_speaker_embeddings(torch.as_tensor(value), use_zero_spk_emb=use_zero_spk_emb)
+
+    def _flat(entry: Any) -> torch.Tensor:
+        tensor = entry if isinstance(entry, torch.Tensor) else torch.as_tensor(entry)
+        return tensor.detach().reshape(-1).to(torch.float32).cpu()
+
+    if isinstance(value, torch.Tensor):
+        if value.ndim not in (1, 2):
+            raise ValueError(f"Unsupported Ming speaker embedding shape: {tuple(value.shape)}")
+        rows = [_flat(row) for row in (value.unsqueeze(0) if value.ndim == 1 else value)]
+    elif not value or any(isinstance(entry, (list, tuple, torch.Tensor)) for entry in value):
+        rows = [_flat(entry) for entry in value if entry is not None]
+    else:
+        rows = [_flat(value)]  # a flat sequence of scalars is a single embedding
+    if not rows:
+        return fallback
+    for row in rows:
+        if int(row.numel()) != 192:
+            raise ValueError(f"Ming speaker embedding must have 192 dims, got {int(row.numel())}")
+    return rows
+
+
+def count_prompt_latent_patches(
+    value: Any,
+    *,
+    patch_size: int = PATCH_SIZE,
+    latent_dim: int = LATENT_DIM,
+) -> int:
+    """Count latent patches in ``[N, patch, dim]`` or ``[frames, dim]`` input; a size-1 batch axis is dropped."""
+    if value is None:
+        return 0
+    tensor = (value if isinstance(value, torch.Tensor) else torch.as_tensor(value)).detach()
+    if tensor.ndim == 3 and tensor.shape[0] == 1:
+        tensor = tensor.squeeze(0)
+    if tensor.ndim == 3 and tuple(tensor.shape[1:]) == (patch_size, latent_dim):
+        return int(tensor.shape[0])  # already grouped: one leading row per patch
+    if tensor.ndim != 2 or tensor.shape[-1] != latent_dim:
+        raise ValueError(f"Unsupported Ming prompt_latents shape: {tuple(tensor.shape)}")
+    patches, leftover = divmod(int(tensor.shape[0]), patch_size)
+    if leftover:
+        raise ValueError(
+            f"Ming prompt_latents frame count must be divisible by patch_size={patch_size}, "
+            f"got frames={int(tensor.shape[0])}"
+        )
+    return int(patches)
+
+
+def count_prompt_waveform_patches(
+    value: Any,
+    *,
+    patch_size: int = PATCH_SIZE,
+    frame_hop: int = AUDIO_FRAME_HOP,
+    vae_patch_size: int = VAE_PATCH_SIZE,
+) -> int:
+    if value is None:
+        return 0
+    padded = pad_prompt_waveform(value, patch_size=patch_size, frame_hop=frame_hop)
+    hop_frames = math.ceil(float(padded.shape[-1]) / float(frame_hop))  # samples -> hop-sized frames
+    vae_frames = math.ceil(float(hop_frames) / float(vae_patch_size))  # frames -> latent frames
+    if vae_frames % int(patch_size):
+        raise ValueError(
+            f"Ming prompt waveform produced latent frame count not divisible by patch_size={patch_size}: "
+            f"frames={vae_frames}"
+        )
+    return vae_frames // int(patch_size)
diff --git a/vllm_omni/model_executor/models/ming_tts/prompt_builder/builders.py b/vllm_omni/model_executor/models/ming_tts/prompt_builder/builders.py
new file mode 100644
index 00000000000..6e4f9d4220f
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_tts/prompt_builder/builders.py
@@ -0,0 +1,210 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
+from typing import Any
+
+import torch
+
+from ..config_ming_tts import (
+    KEY_CFG,
+    KEY_MAX_DECODE_STEPS,
+    KEY_MIN_DECODE_STEPS,
+    KEY_PROMPT_LATENTS,
+    KEY_REQUEST_ID,
+    KEY_SIGMA,
+    KEY_SPEAKER_EMBEDDING,
+    KEY_TEMPERATURE,
+    LATENT_DIM,
+    PATCH_SIZE,
+)
+from ._base import (
+    coerce_speaker_embeddings,
+    count_prompt_latent_patches,
+    count_prompt_waveform_patches,
+    create_instruction,
+    estimate_decode_step_window_for_duration,
+    pad_prompt_waveform,
+    parse_duration_seconds,
+)
+
+
+def resolve_effective_runtime_controls(*, text: str, runtime_controls: dict[str, Any] | None = None) -> dict[str, Any]:
+    """Return a copy of ``runtime_controls`` with a decode-step window derived from ``text`` when possible.
+
+    Explicit (non-``None``) min/max decode-step entries take precedence: if either is present the
+    copy is returned unchanged. Otherwise, when ``parse_duration_seconds(text)`` yields a duration,
+    both bounds are filled in from ``estimate_decode_step_window_for_duration``.
+    """
+    resolved = dict(runtime_controls) if runtime_controls is not None else {}
+    step_keys = (KEY_MIN_DECODE_STEPS, KEY_MAX_DECODE_STEPS)
+    if any(resolved.get(key) is not None for key in step_keys):
+        return resolved
+    duration = parse_duration_seconds(text)
+    if duration is not None:
+        lower, upper = estimate_decode_step_window_for_duration(duration)
+        resolved[KEY_MIN_DECODE_STEPS] = lower
+        resolved[KEY_MAX_DECODE_STEPS] = upper
+    return resolved
+
+
+def build_dense_prompt_token_ids(
+    tokenizer: Any,
+    *,
+    prompt: str,
+    text: str,
+    instruction: str | None = None,
+    prompt_text: str | None = None,
+    speaker_count: int = 0,
+    prompt_patch_count: int = 0,
+) -> list[int]:
+    """Assemble the token ids of a Ming dense prompt: system turn, user turn, then the assistant audio prefix.
+
+    The user turn carries ``prompt``, one ``speaker_N`` vision-pad placeholder per speaker, an optional
+    " Text input:" prefix, the reference ``prompt_text`` (only when ``prompt_patch_count`` > 0) and ``text``.
+    The assistant turn carries the optional ``instruction`` followed by ``<audio>`` and one
+    ``<audioPatch>`` id per prompt patch.
+    """
+    encode = tokenizer.encode
+    patch_total = int(prompt_patch_count)
+    speaker_ids: list[int] = []
+    for speaker_no in range(1, int(speaker_count) + 1):
+        speaker_ids += (
+            encode(f"  speaker_{speaker_no}:")
+            + encode("<|vision_start|>")
+            + encode("<|vision_pad|>")
+            + encode("<|vision_end|>\n")
+        )
+    instruction_ids = [] if instruction is None else encode(instruction) + encode("<|endoftext|>")
+    reference_text_ids = encode(prompt_text) if patch_total > 0 and prompt_text is not None else []
+    audio_patch_ids = [tokenizer.convert_tokens_to_ids("<audioPatch>")] * patch_total
+    # The plain-text prefix is skipped only when every one of these tags appears in ``text``.
+    required_tags = ("Genre: ", "Mood: ", "Instrument: ", "Theme: ", "Duration: ")
+    text_prefix_ids = [] if all(tag in text for tag in required_tags) else encode(" Text input:\n")
+    user_turn_ids = (
+        encode("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n")
+        + encode("<|im_start|>user\n")
+        + encode(prompt)
+        + speaker_ids
+        + text_prefix_ids
+        + reference_text_ids
+        + encode(text)
+        + encode("<|im_end|>\n")
+    )
+    return user_turn_ids + encode("<|im_start|>assistant\n") + instruction_ids + encode("<audio>") + audio_patch_ids
+
+
+def build_ming_dense_prompt(
+    tokenizer: Any,
+    *,
+    prompt: str,
+    text: str,
+    runtime_controls: dict[str, Any] | None = None,
+    instruction: Any = None,
+    prompt_text: str | None = None,
+    prompt_waveform: Any = None,
+    prompt_latents: Any = None,
+    speaker_embedding: Any = None,
+    use_zero_spk_emb: bool = False,
+    request_id: str | None = None,
+) -> dict[str, Any]:
+    """Build the request dict (token ids plus ``additional_information``) for one Ming dense TTS prompt.
+
+    ``prompt_waveform`` and ``prompt_latents`` are mutually exclusive, and ``prompt_waveform`` requires
+    ``prompt_text``; violating either raises ValueError. Runtime controls are resolved through
+    ``resolve_effective_runtime_controls``; non-tensor values become tensors (int32 for the decode-step
+    bounds, float32 otherwise) and ``None`` values are treated as unset and skipped.
+
+    Returns a dict with ``prompt``, ``text``, ``prompt_token_ids`` and ``additional_information``.
+    """
+    instruction_text = create_instruction(instruction)
+    speakers = coerce_speaker_embeddings(speaker_embedding, use_zero_spk_emb=use_zero_spk_emb)
+    controls = resolve_effective_runtime_controls(text=text, runtime_controls=runtime_controls)
+
+    waveform = None
+    prompt_patch_count = 0
+    if prompt_waveform is not None:
+        waveform = pad_prompt_waveform(prompt_waveform)
+        prompt_patch_count = count_prompt_waveform_patches(waveform)
+    if waveform is not None and prompt_latents is not None:
+        raise ValueError(
+            "Ming waveform cloning request provided both raw prompt_waveform and explicit prompt_latents. "
+            "Choose exactly one source of truth."
+        )
+    if waveform is not None and prompt_text is None:
+        raise ValueError(
+            "Ming prompt_waveform requires prompt_text for prompt-latent conditioning. "
+            "Use speaker_embedding for reference-audio-only speaker conditioning."
+        )
+
+    latents = None
+    if prompt_latents is not None:
+        latents = torch.as_tensor(prompt_latents)
+        prompt_patch_count = count_prompt_latent_patches(latents, patch_size=PATCH_SIZE, latent_dim=LATENT_DIM)
+
+    prompt_token_ids = build_dense_prompt_token_ids(
+        tokenizer,
+        prompt=prompt,
+        text=text,
+        instruction=instruction_text,
+        prompt_text=prompt_text if prompt_patch_count > 0 else None,
+        speaker_count=0 if speakers is None else len(speakers),
+        prompt_patch_count=prompt_patch_count,
+    )
+
+    extras: dict[str, Any] = {}
+    for key, value in controls.items():
+        if value is None:
+            continue  # unset control: int(None)/float(None) would raise TypeError
+        if isinstance(value, torch.Tensor):
+            extras[key] = value
+        elif key in (KEY_MIN_DECODE_STEPS, KEY_MAX_DECODE_STEPS):
+            extras[key] = torch.tensor(int(value), dtype=torch.int32)
+        else:
+            extras[key] = torch.tensor(float(value), dtype=torch.float32)
+    if request_id is not None:
+        extras[KEY_REQUEST_ID] = request_id
+    if instruction_text is not None:
+        extras["instruction"] = instruction_text
+    if prompt_text is not None:
+        extras["prompt_text"] = prompt_text
+    if waveform is not None:
+        extras["prompt_waveform"] = waveform
+        extras["prompt_waveform_length"] = torch.tensor([int(waveform.shape[-1])], dtype=torch.int32)
+    if latents is not None:
+        extras[KEY_PROMPT_LATENTS] = latents
+    if speakers is not None:
+        extras[KEY_SPEAKER_EMBEDDING] = speakers[0] if len(speakers) == 1 else torch.stack(speakers, dim=0)
+    if use_zero_spk_emb:
+        extras["use_zero_spk_emb"] = True
+    return {"prompt": prompt, "text": text, "prompt_token_ids": prompt_token_ids, "additional_information": extras}
+
+
+def build_runtime_controls(
+    *,
+    cfg: float | None = None,
+    sigma: float | None = None,
+    temperature: float | None = None,
+    min_decode_steps: int | None = None,
+    max_decode_steps: int | None = None,
+) -> dict[str, torch.Tensor]:
+    """Pack the controls that are not ``None`` into scalar tensors keyed by the Ming runtime-control keys."""
+    # cfg/sigma/temperature are stored as float32 scalars, the decode-step bounds as int32 scalars.
+    controls: dict[str, torch.Tensor] = {}
+    float_valued = ((KEY_CFG, cfg), (KEY_SIGMA, sigma), (KEY_TEMPERATURE, temperature))
+    int_valued = ((KEY_MIN_DECODE_STEPS, min_decode_steps), (KEY_MAX_DECODE_STEPS, max_decode_steps))
+    for key, raw in float_valued:
+        if raw is not None:
+            controls[key] = torch.tensor(float(raw), dtype=torch.float32)
+    for key, raw in int_valued:
+        if raw is not None:
+            controls[key] = torch.tensor(int(raw), dtype=torch.int32)
+    return controls
+
+
+__all__ = [  # names exported by a star-import of this module
+    "build_dense_prompt_token_ids",
+    "build_ming_dense_prompt",
+    "build_runtime_controls",
+    "resolve_effective_runtime_controls",
+]
diff --git a/vllm_omni/model_executor/models/ming_tts/validation.py b/vllm_omni/model_executor/models/ming_tts/validation.py
new file mode 100644
index 00000000000..4d7511c77ef
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_tts/validation.py
@@ -0,0 +1,175 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
+from typing import Any
+
+from transformers import PretrainedConfig
+
+from .audio_tokenizer.configuration_audio_vae import AudioVAEconfig
+from .constants import (
+    AGGREGATOR_HIDDEN_SIZE,
+    HISTORY_PATCH_SIZE,
+    LATENT_DIM,
+    LLM_HIDDEN_SIZE,
+    LLM_VOCAB_SIZE,
+    PATCH_SIZE,
+    SAMPLE_RATE,
+)
+
+
+def _to_plain_dict(obj: Any) -> dict[str, Any]:
+    """Best-effort conversion of a config-like object to a plain dict; returns ``{}`` when nothing works."""
+    if obj is None:
+        return {}
+    if isinstance(obj, dict):
+        return dict(obj)
+    if isinstance(obj, PretrainedConfig):
+        return obj.to_dict()
+    # Duck-typed configs: prefer their own ``to_dict()``, fall back to the instance ``__dict__``.
+    to_dict = getattr(obj, "to_dict", None)
+    candidates = ([to_dict] if callable(to_dict) else []) + [lambda: vars(obj)]
+    for produce in candidates:
+        try:
+            return dict(produce())
+        except Exception:
+            continue
+    return {}
+
+
+def _coerce_audio_vae_config(atc_raw: Any) -> AudioVAEconfig | None:
+    """Try to rebuild ``audio_tokenizer_config`` as an ``AudioVAEconfig``.
+
+    The input is flattened with ``_to_plain_dict`` and fed to ``AudioVAEconfig.from_dict`` (when that
+    is available) and then to the plain constructor. ``None`` passes through as ``None``; when the
+    input flattens to an empty dict, or every construction attempt raises, ``atc_raw`` is returned
+    unchanged (so the result is not guaranteed to be an ``AudioVAEconfig``).
+    """
+    if atc_raw is None:
+        return None
+    fields = _to_plain_dict(atc_raw)
+    if not fields:
+        return atc_raw
+    # Construction is best-effort: try each builder in turn and fall back to the raw value.
+    from_dict = getattr(AudioVAEconfig, "from_dict", None)
+    builders = [lambda: from_dict(fields)] if callable(from_dict) else []
+    builders.append(lambda: AudioVAEconfig(**fields))
+    for build in builders:
+        try:
+            return build()
+        except Exception:
+            continue
+    return atc_raw
+
+
+def _nested_get(obj: Any, *keys: str, default: Any = None) -> Any:
+    """Follow ``keys`` through nested dicts (item lookup) and objects (attribute lookup).
+
+    Returns ``default`` as soon as a step is missing or ``None``, otherwise the value reached.
+    """
+    node = obj
+    for name in keys:
+        if node is None:
+            break
+        node = node.get(name) if isinstance(node, dict) else getattr(node, name, None)
+    return default if node is None else node
+
+
+def validate_ming_tts_config(cfg: Any) -> None:
+    """Validate a Ming TTS config before GPU allocation/weight loading; raise ValueError on the first mismatch."""
+    if cfg.audio_dummy_token_id != 151705:
+        raise ValueError(
+            f"audio_dummy_token_id={cfg.audio_dummy_token_id}, expected 151705 (<audioPatch>). "
+            "Wrong tokenizer/checkpoint?"
+        )
+    if cfg.audio_eos_token_id != 151704:
+        raise ValueError(
+            f"audio_eos_token_id={cfg.audio_eos_token_id}, expected 151704 (<end_of_audio>). "
+            "Wrong tokenizer/checkpoint?"
+        )
+    if cfg.text_eos_token_id != 151669:
+        raise ValueError(
+            f"text_eos_token_id={cfg.text_eos_token_id}, expected 151669 (<text_eos>). Wrong tokenizer/checkpoint?"
+        )
+    # The nested AudioVAE config must be present for the audio-tokenizer checks further down.
+    if cfg.audio_tokenizer_config is None:
+        raise ValueError("audio_tokenizer_config is None. Nested AudioVAE config was not deserialized correctly.")
+    # Top-level dimensions must equal the constants imported from ``.constants``.
+    if cfg.latent_dim != LATENT_DIM:
+        raise ValueError(
+            f"latent_dim mismatch: got {cfg.latent_dim}, expected {LATENT_DIM}. "
+            "Check audio_tokenizer_config.enc_kwargs.latent_dim."
+        )
+    if cfg.patch_size != PATCH_SIZE:
+        raise ValueError(
+            f"patch_size mismatch: got {cfg.patch_size}, expected {PATCH_SIZE}. Check ditar_config.patch_size."
+        )
+    if cfg.history_patch_size != HISTORY_PATCH_SIZE:
+        raise ValueError(
+            f"history_patch_size mismatch: got {cfg.history_patch_size}, expected {HISTORY_PATCH_SIZE}. "
+            "Check ditar_config.history_patch_size."
+        )
+    if cfg.llm_hidden_size != LLM_HIDDEN_SIZE:
+        raise ValueError(
+            f"llm_hidden_size mismatch: got {cfg.llm_hidden_size}, expected {LLM_HIDDEN_SIZE}. "
+            "Check llm_config.hidden_size."
+        )
+    if cfg.llm_vocab_size != LLM_VOCAB_SIZE:
+        raise ValueError(f"llm_vocab_size mismatch: got {cfg.llm_vocab_size}, expected {LLM_VOCAB_SIZE}.")
+    if cfg.sample_rate != SAMPLE_RATE:
+        raise ValueError(f"sample_rate mismatch: got {cfg.sample_rate}, expected {SAMPLE_RATE}.")
+    # Cross-checks between related fields; sub-configs are read with ``.get`` and absent keys skip the check.
+    if cfg.vae_patch_size != cfg.patch_size:
+        raise ValueError(f"VAE patch size ({cfg.vae_patch_size}) != flow/DiT patch size ({cfg.patch_size}).")
+
+    llm_hidden_from_cfg = cfg.llm_config.get("hidden_size")
+    if llm_hidden_from_cfg is not None and llm_hidden_from_cfg != cfg.llm_hidden_size:
+        raise ValueError(f"llm_hidden_size ({cfg.llm_hidden_size}) != llm_config.hidden_size ({llm_hidden_from_cfg}).")
+
+    agg_h = cfg.aggregator_config.get("hidden_size")
+    dit_h = cfg.ditar_config.get("hidden_size")
+    if agg_h is not None and dit_h is not None and agg_h != dit_h:
+        raise ValueError(f"aggregator_config.hidden_size ({agg_h}) != ditar_config.hidden_size ({dit_h}).")
+    if agg_h is not None and agg_h != AGGREGATOR_HIDDEN_SIZE:
+        raise ValueError(f"aggregator hidden_size mismatch: got {agg_h}, expected {AGGREGATOR_HIDDEN_SIZE}.")
+    if dit_h is not None and dit_h != AGGREGATOR_HIDDEN_SIZE:
+        raise ValueError(f"ditar hidden_size mismatch: got {dit_h}, expected {AGGREGATOR_HIDDEN_SIZE}.")
+    # Audio tokenizer sub-config: _nested_get yields None for absent entries, which skips the check.
+    atc = cfg.audio_tokenizer_config
+    enc_latent = _nested_get(atc, "enc_kwargs", "latent_dim", default=None)
+    dec_latent = _nested_get(atc, "dec_kwargs", "latent_dim", default=None)
+    if enc_latent is not None and enc_latent != cfg.latent_dim:
+        raise ValueError(f"audio enc latent_dim ({enc_latent}) != Ming latent_dim ({cfg.latent_dim}).")
+    if dec_latent is not None and dec_latent != cfg.latent_dim:
+        raise ValueError(f"audio dec latent_dim ({dec_latent}) != Ming latent_dim ({cfg.latent_dim}).")
+
+    atc_patch = _nested_get(atc, "patch_size", default=None)
+    if atc_patch is not None and atc_patch != cfg.vae_patch_size:
+        raise ValueError(f"audio_tokenizer_config.patch_size ({atc_patch}) != vae_patch_size ({cfg.vae_patch_size}).")
+
+    atc_sr = _nested_get(atc, "sample_rate", default=None)
+    if atc_sr is not None and atc_sr != cfg.sample_rate:
+        raise ValueError(f"audio_tokenizer_config.sample_rate ({atc_sr}) != sample_rate ({cfg.sample_rate}).")
+
+    enc_input_dim = _nested_get(atc, "enc_kwargs", "input_dim", default=None)
+    enc_hop_size = _nested_get(atc, "enc_kwargs", "hop_size", default=None)
+    dec_output_dim = _nested_get(atc, "dec_kwargs", "output_dim", default=None)
+
+    if enc_input_dim is not None and enc_hop_size is not None and enc_input_dim != enc_hop_size:
+        raise ValueError(f"AudioVAE encoder input_dim ({enc_input_dim}) != hop_size ({enc_hop_size}).")
+    if enc_hop_size is not None and dec_output_dim is not None and enc_hop_size != dec_output_dim:
+        raise ValueError(
+            f"AudioVAE encoder hop_size ({enc_hop_size}) != decoder output_dim ({dec_output_dim}). "
+            "Expected 882 in this checkpoint family."
+        )
+    # Range checks on the chunking, decode-step and stop-head settings.
+    if cfg.latent_chunk_size <= 0:
+        raise ValueError(f"latent_chunk_size must be > 0, got {cfg.latent_chunk_size}.")
+    if cfg.latent_left_context < 0:
+        raise ValueError(f"latent_left_context must be >= 0, got {cfg.latent_left_context}.")
+    if cfg.max_decode_steps <= 0:
+        raise ValueError(f"max_decode_steps must be > 0, got {cfg.max_decode_steps}.")
+    if not (0.0 <= cfg.stop_head_threshold <= 1.0):
+        raise ValueError(f"stop_head_threshold must be in [0,1], got {cfg.stop_head_threshold}.")
+    if cfg.stop_head_min_steps < 0:
+        raise ValueError(f"stop_head_min_steps must be >= 0, got {cfg.stop_head_min_steps}.")

From 276b954d89a3e843828fe0906af20913693b3d00 Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Fri, 24 Apr 2026 00:01:41 +0530
Subject: [PATCH 05/54] Extract shared async chunk transfer helpers

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 .../stage_input_processors/_chunk_transfer.py | 51 +++++++++++++++++++
 .../stage_input_processors/ming_tts.py        | 36 +++++++------
 .../stage_input_processors/qwen3_tts.py       | 29 ++++-------
 3 files changed, 83 insertions(+), 33 deletions(-)
 create mode 100644 vllm_omni/model_executor/stage_input_processors/_chunk_transfer.py

diff --git a/vllm_omni/model_executor/stage_input_processors/_chunk_transfer.py b/vllm_omni/model_executor/stage_input_processors/_chunk_transfer.py
new file mode 100644
index 00000000000..cfa5369b65e
--- /dev/null
+++ b/vllm_omni/model_executor/stage_input_processors/_chunk_transfer.py
@@ -0,0 +1,51 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from typing import Any
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+def get_transfer_extra_config(transfer_manager: Any) -> dict[str, Any]:
+    """Return the connector's ``extra`` config dict (or the whole config if it has no ``extra``), else ``{}``."""
+    config = getattr(getattr(transfer_manager, "connector", None), "config", {}) or {}
+    if not isinstance(config, dict):
+        return {}
+    return config.get("extra", config) or {}
+
+
+def get_chunk_config_int(cfg: dict[str, Any], key: str, fallback: int, *, warning_prefix: str | None = None) -> int:
+    """Read ``cfg[key]`` as an int, returning ``fallback`` when the key is absent.
+
+    When the key is absent and ``warning_prefix`` is given, a warning naming the key and the
+    fallback is logged first. A present value is passed to ``int()`` as-is, so a value that
+    ``int()`` rejects raises.
+    """
+    if key in cfg:
+        return int(cfg[key])
+    if warning_prefix is not None:
+        logger.warning("%s missing %s, using fallback value %s", warning_prefix, key, fallback)
+    return fallback
+
+
+def get_request_payload_store(transfer_manager: Any) -> dict[str, Any]:
+    """Return ``transfer_manager.request_payload``, first installing an empty dict if it is missing or ``None``."""
+    store = getattr(transfer_manager, "request_payload", None)
+    if store is None:
+        store = transfer_manager.request_payload = {}
+    return store
+
+
+def get_initial_codec_chunk_frames(request: Any) -> int | None:
+    """Return the per-request ``initial_codec_chunk_frames`` override as an int, or ``None``.
+
+    Requires ``request.additional_information.entries[...]`` to carry a one-element ``list_data``.
+    """
+    info = getattr(request, "additional_information", None)
+    if info is not None and hasattr(info, "entries") and "initial_codec_chunk_frames" in info.entries:
+        values = info.entries["initial_codec_chunk_frames"].list_data
+        if values is not None and len(values) == 1:
+            return int(values[0])
+    return None
diff --git a/vllm_omni/model_executor/stage_input_processors/ming_tts.py b/vllm_omni/model_executor/stage_input_processors/ming_tts.py
index b50f22e6a02..f66794cfb3e 100644
--- a/vllm_omni/model_executor/stage_input_processors/ming_tts.py
+++ b/vllm_omni/model_executor/stage_input_processors/ming_tts.py
@@ -17,6 +17,11 @@
     LATENT_LEFT_CONTEXT,
     PATCH_SIZE,
 )
+from vllm_omni.model_executor.stage_input_processors._chunk_transfer import (
+    get_chunk_config_int,
+    get_request_payload_store,
+    get_transfer_extra_config,
+)
 
 logger = init_logger(__name__)
 
@@ -115,20 +120,20 @@ def _decode_stop_reason(value: Any) -> str | None:
     return MING_STOP_REASON_BY_CODE.get(int(value))
 
 
-def _get_async_chunk_config_value(cfg: dict[str, Any], key: str, fallback: int) -> int:
-    if key not in cfg:
-        logger.warning("Ming async chunk config missing %s, using fallback value %s", key, fallback)
-        return fallback
-    return int(cfg[key])
-
-
 def _get_async_chunk_config(transfer_manager: Any) -> tuple[int, int]:
-    connector = getattr(transfer_manager, "connector", None)
-    raw_cfg = getattr(connector, "config", {}) or {}
-    cfg = raw_cfg.get("extra", raw_cfg) if isinstance(raw_cfg, dict) else {}
-
-    chunk_size = _get_async_chunk_config_value(cfg, "latent_chunk_size", LATENT_CHUNK_SIZE)
-    left_context = _get_async_chunk_config_value(cfg, "latent_left_context", LATENT_LEFT_CONTEXT)
+    cfg = get_transfer_extra_config(transfer_manager)
+    chunk_size = get_chunk_config_int(
+        cfg,
+        "latent_chunk_size",
+        LATENT_CHUNK_SIZE,
+        warning_prefix="Ming async chunk config",
+    )
+    left_context = get_chunk_config_int(
+        cfg,
+        "latent_left_context",
+        LATENT_LEFT_CONTEXT,
+        warning_prefix="Ming async chunk config",
+    )
     if chunk_size <= 0:
         raise ValueError(f"Invalid Ming latent_chunk_size={chunk_size}")
     # Stage-2 VAE caches past_key_values and stream_state by request_id.
@@ -174,7 +179,8 @@ def llm2audio_vae_async_chunk(
     finished = bool(is_finished or request.is_finished())
     final_decode_step = _extract_last_value(pooling_output, "ming_decode_step")
     stop_reason = _decode_stop_reason(_extract_last_value(pooling_output, MING_STOP_REASON_KEY))
-    request_state = transfer_manager.request_payload.get(request_id)
+    request_payload = get_request_payload_store(transfer_manager)
+    request_state = request_payload.get(request_id)
     if not isinstance(request_state, dict) or "_ming_async_state" not in request_state:
         request_state = {
             "_ming_async_state": {
@@ -182,7 +188,7 @@ def llm2audio_vae_async_chunk(
                 "terminal_sent": False,
             }
         }
-        transfer_manager.request_payload[request_id] = request_state
+        request_payload[request_id] = request_state
     state = request_state["_ming_async_state"]
     if bool(state.get("terminal_sent", False)):
         return None
diff --git a/vllm_omni/model_executor/stage_input_processors/qwen3_tts.py b/vllm_omni/model_executor/stage_input_processors/qwen3_tts.py
index ade01693216..8dbc34a7a86 100644
--- a/vllm_omni/model_executor/stage_input_processors/qwen3_tts.py
+++ b/vllm_omni/model_executor/stage_input_processors/qwen3_tts.py
@@ -5,6 +5,11 @@
 import torch
 from vllm.logger import init_logger
 
+from vllm_omni.model_executor.stage_input_processors._chunk_transfer import (
+    get_initial_codec_chunk_frames,
+    get_request_payload_store,
+    get_transfer_extra_config,
+)
 from vllm_omni.model_executor.stage_input_processors.chunk_size_utils import (
     compute_dynamic_initial_chunk_size,
     max_ic_for_chunk_size,
@@ -138,10 +143,7 @@ def talker2code2wav_async_chunk(
 ) -> dict[str, Any] | None:
     request_id = request.external_req_id
     finished = bool(is_finished or request.is_finished())
-    request_payload = getattr(transfer_manager, "request_payload", None)
-    if request_payload is None:
-        request_payload = {}
-        transfer_manager.request_payload = request_payload
+    request_payload = get_request_payload_store(transfer_manager)
 
     if isinstance(pooling_output, dict):
         frame = _extract_last_frame(pooling_output)
@@ -154,26 +156,17 @@ def talker2code2wav_async_chunk(
     elif not finished:
         return None
 
-    connector = getattr(transfer_manager, "connector", None)
-    raw_cfg = getattr(connector, "config", {}) or {}
-    cfg = raw_cfg.get("extra", raw_cfg) if isinstance(raw_cfg, dict) else {}
+    cfg = get_transfer_extra_config(transfer_manager)
     chunk_size = int(cfg.get("codec_chunk_frames", 25))
     left_context_size_config = int(cfg.get("codec_left_context_frames", 25))
 
     # Per-request override takes priority over dynamic IC.
     per_request_override = False
     initial_chunk_size = 0
-    additional_information = getattr(request, "additional_information", None)
-
-    if (
-        additional_information is not None
-        and hasattr(additional_information, "entries")
-        and "initial_codec_chunk_frames" in additional_information.entries
-    ):
-        entry = additional_information.entries["initial_codec_chunk_frames"]
-        if entry.list_data is not None and len(entry.list_data) == 1:
-            initial_chunk_size = int(entry.list_data[0])
-            per_request_override = True
+    initial_codec_chunk_frames = get_initial_codec_chunk_frames(request)
+    if initial_codec_chunk_frames is not None:
+        initial_chunk_size = initial_codec_chunk_frames
+        per_request_override = True
 
     # Dynamic IC: cache per request so boundaries stay stable for its lifetime.
     if not per_request_override:

From 8bd43d119fea660c814cb2dafe37b42b0a291951 Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Fri, 24 Apr 2026 00:03:39 +0530
Subject: [PATCH 06/54] Migrate Ming TTS to deploy config

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 examples/offline_inference/ming_tts/README.md |  36 +-
 .../offline_inference/ming_tts/_runner.py     | 284 +++++++
 .../offline_inference/ming_tts/cases.yaml     | 117 +++
 .../offline_inference/ming_tts/end2end.py     | 707 ++++--------------
 examples/online_serving/ming_tts/README.md    | 130 +++-
 examples/online_serving/ming_tts/run_curl.sh  | 225 ++----
 .../online_serving/ming_tts/run_server.sh     |   6 +-
 tests/e2e/offline_inference/test_ming_tts.py  |  17 +-
 tests/e2e/online_serving/test_ming_tts.py     |  10 +-
 vllm_omni/config/pipeline_registry.py         |   4 +
 vllm_omni/deploy/ming_tts.yaml                |  48 ++
 .../models/ming_tts/pipeline.py               |  54 ++
 12 files changed, 826 insertions(+), 812 deletions(-)
 create mode 100644 examples/offline_inference/ming_tts/_runner.py
 create mode 100644 examples/offline_inference/ming_tts/cases.yaml
 create mode 100644 vllm_omni/deploy/ming_tts.yaml
 create mode 100644 vllm_omni/model_executor/models/ming_tts/pipeline.py

diff --git a/examples/offline_inference/ming_tts/README.md b/examples/offline_inference/ming_tts/README.md
index 4077210f25f..2ae5906de79 100644
--- a/examples/offline_inference/ming_tts/README.md
+++ b/examples/offline_inference/ming_tts/README.md
@@ -2,6 +2,14 @@
 
 `end2end.py` runs Ming dense 0.5B end to end with vLLM-Omni. It uses the in-repo Ming prompt builder directly, so the example request shape matches the real integration instead of a simplified wrapper.
 
+## Files
+
+| File | Purpose |
+|---|---|
+| `end2end.py` | Driver: CLI, case loading, prompt construction, orchestration (~150 lines) |
+| `cases.yaml` | All 11 built-in case definitions (prompt, text, instruction, ref-audio flags, flow controls) |
+| `_runner.py` | Engine management and audio output (streaming + blocking paths; internal helper) |
+
 ## Model Overview
 
 Ming dense 0.5B is exposed here as a two-stage offline pipeline:
@@ -11,8 +19,8 @@ Ming dense 0.5B is exposed here as a two-stage offline pipeline:
 
 The example supports both:
 
-- **Sequential eager** via `ming_tts.yaml`
-- **Async chunk eager** via `ming_tts_async_chunk.yaml`
+- **Sequential eager** via `vllm_omni/deploy/ming_tts.yaml` with `--no-async-chunk`
+- **Async chunk eager** via `vllm_omni/deploy/ming_tts.yaml` (default `async_chunk: true`)
 
 ## Setup
 
@@ -50,7 +58,8 @@ Run the zero-speaker style example:
 ```bash
 python examples/offline_inference/ming_tts/end2end.py \
     --case style \
-    --stage-configs-path vllm_omni/model_executor/stage_configs/ming_tts.yaml \
+    --deploy-config vllm_omni/deploy/ming_tts.yaml \
+    --no-async-chunk \
     --enforce-eager
 ```
 
@@ -61,7 +70,8 @@ python examples/offline_inference/ming_tts/end2end.py \
     --case zero_shot \
     --ref-audio /path/to/10002287-00000094.wav \
     --ref-text "在此奉劝大家别乱打美白针。" \
-    --stage-configs-path vllm_omni/model_executor/stage_configs/ming_tts.yaml \
+    --deploy-config vllm_omni/deploy/ming_tts.yaml \
+    --no-async-chunk \
     --enforce-eager
 ```
 
@@ -71,7 +81,8 @@ Run emotion-controlled speech:
 python examples/offline_inference/ming_tts/end2end.py \
     --case emotion \
     --ref-audio /path/to/emotion_prompt.wav \
-    --stage-configs-path vllm_omni/model_executor/stage_configs/ming_tts.yaml \
+    --deploy-config vllm_omni/deploy/ming_tts.yaml \
+    --no-async-chunk \
     --enforce-eager
 ```
 
@@ -81,7 +92,8 @@ Run podcast generation with two reference clips:
 python examples/offline_inference/ming_tts/end2end.py \
     --case podcast \
     --ref-audio-paths /path/to/CTS-CN-F2F-2019-11-11-423-012-A.wav /path/to/CTS-CN-F2F-2019-11-11-423-012-B.wav \
-    --stage-configs-path vllm_omni/model_executor/stage_configs/ming_tts.yaml \
+    --deploy-config vllm_omni/deploy/ming_tts.yaml \
+    --no-async-chunk \
     --enforce-eager
 ```
 
@@ -100,7 +112,8 @@ Run text-to-audio event generation:
 ```bash
 python examples/offline_inference/ming_tts/end2end.py \
     --case tta \
-    --stage-configs-path vllm_omni/model_executor/stage_configs/ming_tts.yaml \
+    --deploy-config vllm_omni/deploy/ming_tts.yaml \
+    --no-async-chunk \
     --enforce-eager
 ```
 
@@ -111,7 +124,7 @@ python examples/offline_inference/ming_tts/end2end.py \
     --case basic \
     --ref-audio /path/to/10002287-00000095.wav \
     --streaming \
-    --stage-configs-path vllm_omni/model_executor/stage_configs/ming_tts_async_chunk.yaml \
+    --deploy-config vllm_omni/deploy/ming_tts.yaml \
     --enforce-eager
 ```
 
@@ -124,7 +137,8 @@ Collect runtime stats and a manifest:
 ```bash
 python examples/offline_inference/ming_tts/end2end.py \
     --case style \
-    --stage-configs-path vllm_omni/model_executor/stage_configs/ming_tts.yaml \
+    --deploy-config vllm_omni/deploy/ming_tts.yaml \
+    --no-async-chunk \
     --enforce-eager \
     --enable-stats \
     --stats-log-file output_audio/ming_style_pipeline.log \
@@ -147,7 +161,7 @@ The upstream Ming cookbook uses these public audio fixtures from `inclusionAI/Mi
 The repo-facing example is intended to cover the same dense TTS workflows used
 by the local Ming validation script:
 
+| Case | Blocking `deploy/ming_tts.yaml` + `--no-async-chunk` | Async chunk `deploy/ming_tts.yaml` (default) | Extra inputs |
+| Case | Blocking `ming_tts.yaml` | Async chunk `deploy/ming_tts.yaml` | Extra inputs |
 |---|---:|---:|---|
 | `style` | Yes | Optional smoke test | none |
 | `ip` | Yes | Optional smoke test | none |
@@ -186,7 +200,7 @@ and Stage-1 patch counts for every case:
 | Argument | Description |
 |---|---|
 | `--model` | Hugging Face repo or local Ming checkpoint path |
-| `--stage-configs-path` | Stage config YAML. Use `ming_tts.yaml` for blocking generation or `ming_tts_async_chunk.yaml` for streaming |
+| `--deploy-config` | Deploy config YAML. Use `vllm_omni/deploy/ming_tts.yaml` |
 | `--case` | Built-in demo case |
 | `--ref-audio` | Single reference wav path for cloning-style cases |
 | `--ref-audio-paths` | Multiple reference wav paths, used by `podcast` |
diff --git a/examples/offline_inference/ming_tts/_runner.py b/examples/offline_inference/ming_tts/_runner.py
new file mode 100644
index 00000000000..c5682bffaff
--- /dev/null
+++ b/examples/offline_inference/ming_tts/_runner.py
@@ -0,0 +1,284 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import asyncio
+import json
+import time
+import uuid
+import wave
+from pathlib import Path
+
+import torch
+from vllm import SamplingParams
+
+from vllm_omni import AsyncOmni, Omni
+from vllm_omni.model_executor.models.ming_tts.config_ming_tts import (
+    KEY_SPEAKER_EMBEDDING,
+    SAMPLE_RATE,
+    TEXT_EOS_TOKEN_ID,
+)
+
+
+def coerce_audio_tensor(audio, *, async_chunk: bool) -> torch.Tensor:
+    """Flatten an audio payload into a 1-D float32 tensor.
+
+    A non-list payload is converted and flattened directly. For a list,
+    ``async_chunk=True`` concatenates every non-empty entry in order, while
+    ``async_chunk=False`` returns only the last non-empty entry. A list with
+    no non-empty entry yields an empty tensor.
+    """
+    if not isinstance(audio, list):
+        return torch.as_tensor(audio, dtype=torch.float32).reshape(-1)
+
+    # Lazy on purpose: the blocking branch stops at the first usable entry.
+    ordered = audio if async_chunk else reversed(audio)
+    flat = (torch.as_tensor(entry, dtype=torch.float32).reshape(-1) for entry in ordered)
+    non_empty = (piece for piece in flat if piece.numel() > 0)
+    if async_chunk:
+        chunks = list(non_empty)
+        return torch.cat(chunks, dim=0) if chunks else torch.zeros((0,), dtype=torch.float32)
+    return next(non_empty, torch.zeros((0,), dtype=torch.float32))
+
+
+def resolve_sr(sr) -> int:
+    """Convert a sample-rate value to ``int``; a list contributes its last entry."""
+    value = sr[-1] if isinstance(sr, list) else sr
+    # Objects exposing ``.item()`` are unwrapped through it first.
+    scalar = value.item() if hasattr(value, "item") else value
+    return int(scalar)
+
+
+def extract_sample_rate(multimodal_output: dict) -> int:
+    """Return the ``sr`` entry of ``multimodal_output`` as an int; raise if absent."""
+    if (rate := multimodal_output.get("sr")) is None:
+        raise RuntimeError("Expected multimodal_output['sr']")
+    return resolve_sr(rate)
+
+
+def write_wav(path: str, audio: torch.Tensor, sample_rate: int) -> None:
+    """Write ``audio`` to ``path`` as a mono 16-bit PCM wav at ``sample_rate``."""
+    # Clamp to [-1, 1] first so the scaled values stay inside the int16 range.
+    scaled = (audio.clamp(-1.0, 1.0) * 32767.0).round()
+    frames = scaled.to(torch.int16).cpu().numpy().tobytes()
+    with wave.open(path, "wb") as sink:
+        sink.setparams((1, 2, int(sample_rate), 0, "NONE", "not compressed"))
+        sink.writeframes(frames)
+
+
+def request_index(request_id: str | None, fallback: int) -> int:
+    """Return the numeric index encoded in ``request_id``, else ``fallback``."""
+    try:
+        return int(request_id)
+    except (TypeError, ValueError):
+        head = request_id.split("_", 1)[0] if isinstance(request_id, str) else ""
+    # isdecimal() (unlike isdigit()) only accepts characters int() can parse,
+    # so ids such as "²_x" fall back instead of raising ValueError.
+    return int(head) if head.isdecimal() else fallback
+
+
+def audio_summary(audio: torch.Tensor, sample_rate: int) -> dict:
+    """Summarise a waveform: sample rate, sample count, duration and peak level."""
+    flat = audio.detach().cpu().reshape(-1).to(torch.float32)
+    count = int(flat.numel())
+    peak = float(flat.abs().max().item()) if count > 0 else 0.0
+    duration = float(count) / float(sample_rate)
+    head = {"sample_rate": int(sample_rate), "num_samples": count}
+    return {**head, "duration_seconds": duration, "max_abs_amplitude": peak}
+
+
+def resolve_output_name(output_name: str | None, case: str, index: int, total: int) -> str:
+    """Pick the wav file name; multi-prompt runs get a zero-padded index suffix."""
+    chosen = output_name or f"ming_{case}.wav"
+    stem, suffix = Path(chosen).stem, Path(chosen).suffix or ".wav"
+    return chosen if total == 1 else f"{stem}_{index:05d}{suffix}"
+
+
+def resolve_stats_log_file(args) -> str | None:
+    """Return the stats log path, or ``None`` when stats logging is disabled."""
+    if not args.log_stats:
+        return None
+    if explicit := args.stats_log_file:
+        return explicit
+    return str(Path(args.output_dir) / (Path(args.output_name or f"ming_{args.case}.wav").stem + "_pipeline.log"))
+
+
+def resolve_metadata_json(args) -> str | None:
+    """Return the explicit manifest path, else a derived one when stats are on, else ``None``."""
+    if explicit := args.metadata_json:
+        return explicit
+    if not args.log_stats:
+        return None
+    return str(Path(args.output_dir) / (Path(args.output_name or f"ming_{args.case}.wav").stem + "_manifest.json"))
+
+
+def build_manifest(args, prompt_payload, stats_log_file: str | None, outputs: list[dict]) -> dict:
+    """Assemble the run manifest describing the arguments, prompt and outputs.
+
+    ``speaker_embedding_shape`` is filled only when the prompt carries a
+    speaker embedding that exposes ``.shape``.
+    """
+    extra = dict(prompt_payload.get("additional_information", {})) if isinstance(prompt_payload, dict) else {}
+    embedding = extra.get(KEY_SPEAKER_EMBEDDING)
+    return {
+        "model": args.model,
+        "case": args.case,
+        "streaming": bool(args.streaming),
+        "deploy_config": args.deploy_config,
+        "enforce_eager": bool(args.enforce_eager),
+        "num_prompts": int(args.num_prompts),
+        "log_stats": bool(args.log_stats),
+        "stats_log_file": stats_log_file,
+        "prompt_text": extra.get("prompt_text"),
+        "instruction": extra.get("instruction"),
+        "speaker_embedding_shape": list(embedding.shape) if hasattr(embedding, "shape") else None,
+        "outputs": outputs,
+        # Wall-clock timestamp taken when the manifest is built.
+        "generated_at_unix": time.time(),
+    }
+
+
+def build_engine_kwargs(args, stats_log_file: str | None) -> dict:
+    """Collect the keyword arguments passed to the ``Omni``/``AsyncOmni`` constructors.
+
+    ``log_file`` and ``ray_address`` are included only when they are not ``None``.
+    """
+    # Attributes forwarded unchanged from the parsed arguments, in this order.
+    forwarded = (
+        "model", "deploy_config",
+        "enforce_eager", "trust_remote_code",
+        "log_stats", "stage_init_timeout",
+        "init_timeout", "batch_timeout",
+        "shm_threshold_bytes", "worker_backend",
+    )
+    engine_kwargs = {name: getattr(args, name) for name in forwarded}
+    # Optional entries are left out entirely rather than passed as None.
+    optional = {"log_file": stats_log_file, "ray_address": args.ray_address}
+    engine_kwargs.update({key: value for key, value in optional.items() if value is not None})
+    return engine_kwargs
+
+
+def build_sampling_params(max_decode_steps: int) -> list[SamplingParams]:
+    """Return two temperature-0 ``SamplingParams`` objects.
+
+    The first allows ``max_decode_steps + 1`` tokens and stops on the text EOS
+    token id; the second allows a single token.
+    """
+    first = SamplingParams(temperature=0.0, max_tokens=max_decode_steps + 1, stop_token_ids=[int(TEXT_EOS_TOKEN_ID)])
+    second = SamplingParams(temperature=0.0, max_tokens=1)
+    return [first, second]
+
+
+async def run_streaming(args, prompt_payload, sampling_params_list, output_dir: Path, stats_log_file: str | None):
+    """Stream one prompt via ``AsyncOmni``, write the wav, return ``[summary]``; raise if no audio."""
+    engine = AsyncOmni(**build_engine_kwargs(args, stats_log_file))
+    try:
+        all_audio_chunks = []
+        accumulated_samples = 0
+        chunk_idx = 0
+        # perf_counter() is monotonic, so latency figures ignore wall-clock changes.
+        start_time = time.perf_counter()
+        chunk_times = []
+        final_stage_output = None
+        async for stage_output in engine.generate(
+            prompt=prompt_payload,
+            request_id=str(uuid.uuid4()),
+            sampling_params_list=sampling_params_list,
+        ):
+            final_stage_output = stage_output
+            multimodal_output = stage_output.multimodal_output or {}
+            audio = multimodal_output.get("audio")
+            if audio is None:
+                continue
+
+            if isinstance(audio, torch.Tensor):
+                # NOTE(review): a finished tensor is sliced as if it held the whole
+                # waveform, keeping only samples not collected yet -- confirm.
+                if stage_output.finished:
+                    audio_chunk = audio[accumulated_samples:].float().detach().cpu()
+                else:
+                    audio_chunk = audio.float().detach().cpu()
+            elif isinstance(audio, list):
+                # NOTE(review): assumes the list has one entry per yielded output -- confirm.
+                audio_chunk = torch.as_tensor(audio[chunk_idx], dtype=torch.float32).reshape(-1).cpu()
+            else:
+                audio_chunk = torch.as_tensor(audio, dtype=torch.float32).reshape(-1).cpu()
+            accumulated_samples += int(audio_chunk.numel())
+            chunk_idx += 1
+            if audio_chunk.numel() > 0:
+                chunk_times.append(time.perf_counter())
+                all_audio_chunks.append(audio_chunk)
+
+        if not all_audio_chunks:
+            raise RuntimeError("Streaming Ming example produced no audio chunks")
+
+        waveform = torch.cat(all_audio_chunks, dim=0)
+        output_name = resolve_output_name(args.output_name, args.case, 0, 1)
+        output_path = str(output_dir / output_name)
+        write_wav(output_path, waveform, SAMPLE_RATE)
+        summary = {
+            "request_id": getattr(final_stage_output, "request_id", None),
+            "stage_id": getattr(final_stage_output, "stage_id", None),
+            "output_path": output_path,
+            "stage_durations": getattr(final_stage_output, "stage_durations", {}),
+            "peak_memory_mb": getattr(final_stage_output, "peak_memory_mb", 0.0),
+            # Delay from start until the first non-empty chunk arrived.
+            "ttfp_seconds": chunk_times[0] - start_time,
+            "mean_inter_chunk_seconds": (
+                sum(t1 - t0 for t0, t1 in zip(chunk_times, chunk_times[1:])) / (len(chunk_times) - 1)
+                if len(chunk_times) > 1
+                else None
+            ),
+        }
+        summary.update(audio_summary(waveform, SAMPLE_RATE))
+        print(f"Saved streaming output to {output_path}")
+        print(json.dumps(summary, ensure_ascii=False, indent=2))
+        return [summary]
+    finally:
+        engine.shutdown()
+
+
+def run_non_streaming(args, prompt_payload, sampling_params_list, output_dir: Path, stats_log_file: str | None):
+    """Run ``args.num_prompts`` copies of the prompt through ``Omni`` and save each audio result.
+
+    Returns one summary dict per saved wav; raises ``RuntimeError`` if none was produced.
+    """
+    engine = Omni(**build_engine_kwargs(args, stats_log_file))
+    try:
+        results = engine.generate(
+            prompts=[prompt_payload] * args.num_prompts,
+            sampling_params_list=sampling_params_list,
+            py_generator=False,
+        )
+        saved = []
+        for position, result in enumerate(results):
+            # Only outputs flagged as audio are written out.
+            if result.final_output_type != "audio":
+                continue
+            payload = result.multimodal_output or {}
+            waveform = coerce_audio_tensor(payload.get("audio"), async_chunk=False)
+            sample_rate = extract_sample_rate(payload)
+            index = request_index(result.request_id, position)
+            wav_path = str(output_dir / resolve_output_name(args.output_name, args.case, index, args.num_prompts))
+            write_wav(wav_path, waveform, sample_rate)
+            record = {
+                "request_id": result.request_id,
+                "stage_id": result.stage_id,
+                "output_path": wav_path,
+                "stage_durations": result.stage_durations,
+                "peak_memory_mb": result.peak_memory_mb,
+                **audio_summary(waveform, sample_rate),
+            }
+            saved.append(record)
+            print(f"Saved output to {wav_path}")
+            print(json.dumps(record, ensure_ascii=False, indent=2))
+        if not saved:
+            raise RuntimeError("Non-streaming Ming example produced no audio outputs")
+        return saved
+    finally:
+        engine.close()
+
+
+def run_generation(args, prompt_payload, sampling_params_list, output_dir: Path, stats_log_file: str | None):
+    """Dispatch to the streaming (asyncio) or blocking runner based on ``args.streaming``."""
+    call_args = (args, prompt_payload, sampling_params_list, output_dir, stats_log_file)
+    return asyncio.run(run_streaming(*call_args)) if args.streaming else run_non_streaming(*call_args)
diff --git a/examples/offline_inference/ming_tts/cases.yaml b/examples/offline_inference/ming_tts/cases.yaml
new file mode 100644
index 00000000000..2568a9bad2e
--- /dev/null
+++ b/examples/offline_inference/ming_tts/cases.yaml
@@ -0,0 +1,117 @@
+style:
+  prompt: "Please generate speech based on the following description.\n"
+  text: "我会一直在这里陪着你，直到你慢慢、慢慢地沉入那个最温柔的梦里……好吗？"
+  instruction:
+    风格: >-
+      这是一种ASMR耳语，属于一种旨在引发特殊感官体验的创意风格。这个女性使用轻柔的普通话进行耳语，声音气音成分重。音量极低，紧贴麦克风，语速极慢，旨在制造触发听者颅内快感的声学刺激。
+  use_zero_spk_emb: true
+  max_decode_steps: 200
+
+ip:
+  prompt: "Please generate speech based on the following description.\n"
+  text: "这款产品的名字，叫变态坑爹牛肉丸。"
+  instruction:
+    IP: "灵小甄"
+  use_zero_spk_emb: true
+  max_decode_steps: 200
+
+bgm:
+  prompt: "Please generate music based on the following description.\n"
+  text: "Genre: 电子舞曲. Mood: 自信 / 坚定. Instrument: 架子鼓. Theme: 节日. Duration: 30s."
+  instruction: null
+  use_zero_spk_emb: false
+  max_decode_steps: 400
+
+tta:
+  prompt: "Please generate audio events based on given text.\n"
+  text: "Thunder and a gentle rain"
+  instruction: null
+  use_zero_spk_emb: false
+  max_decode_steps: 200
+  cfg: 4.5
+  sigma: 0.3
+  temperature: 2.5
+
+emotion:
+  prompt: "Please generate speech based on the following description.\n"
+  text: "我竟然抢到了陈奕迅的演唱会门票！太棒了！终于可以现场听一听他的歌声了！"
+  instruction:
+    情感: "高兴"
+  requires_ref_audio: true
+  auto_extract_speaker_embeddings: true
+  max_decode_steps: 200
+
+basic:
+  prompt: "Please generate speech based on the following description.\n"
+  text: "简单地说，这相当于惠普把消费领域市场拱手相让了。"
+  instruction:
+    语速: "快速"
+    基频: "中"
+    音量: "高"
+  requires_ref_audio: true
+  auto_extract_speaker_embeddings: true
+  max_decode_steps: 200
+
+dialect:
+  prompt: "Please generate speech based on the following description.\n"
+  text: "我觉得社会企业同个人都有责任"
+  instruction:
+    方言: "广粤话"
+  requires_ref_audio: true
+  auto_extract_speaker_embeddings: true
+  max_decode_steps: 200
+
+zero_shot:
+  prompt: "Please generate speech based on the following description.\n"
+  text: "我们的愿景是构建未来服务业的数字化基础设施，为世界带来更多微小而美好的改变。"
+  instruction: null
+  requires_ref_audio: true
+  requires_ref_text: true
+  auto_extract_speaker_embeddings: true
+  max_decode_steps: 200
+
+podcast:
+  prompt: "Please generate speech based on the following description.\n"
+  text: |
+    speaker_1:你可以说一下，就大概说一下，可能虽然我也不知道，我看过那部电影没有。
+    speaker_2:就是那个叫什么，变相一节课的嘛。
+    speaker_1:嗯。
+    speaker_2:一部搞笑的电影。
+    speaker_1:一部搞笑的。
+  instruction: null
+  prompt_text: |
+    speaker_1:并且我们还要进行每个月还要考核 笔试的话还要进行笔试，做个，当服务员还要去笔试了
+    speaker_2:对啊，这真的很奇怪，就是 单纯的因，单纯自己工资不高，只是因为可能人家那个店比较出名一点，就对你苛刻要求
+  requires_ref_audio_count: 2
+  auto_extract_speaker_embeddings: true
+  max_decode_steps: 200
+
+speech_bgm:
+  prompt: "Please generate speech based on the following description.\n"
+  text: "此次业绩下滑原因，可归结为企业停止服务某些品牌，而带来的负面影响。"
+  instruction:
+    BGM:
+      Genre: "当代古典音乐."
+      Mood: "温暖 / 友善."
+      Instrument: "电吉他"
+      Theme: "节日."
+      SNR: 10.0
+      ENV: null
+  requires_ref_audio: true
+  auto_extract_speaker_embeddings: true
+  max_decode_steps: 200
+
+speech_sound:
+  prompt: "Please generate speech based on the following description.\n"
+  text: "此次业绩下滑原因，可归结为企业停止服务某些品牌，而带来的负面影响。"
+  instruction:
+    BGM:
+      ENV: "Birds chirping"
+      SNR: 10.0
+      Genre: null
+      Mood: null
+      Instrument: null
+      Theme: null
+  requires_ref_audio: true
+  auto_extract_speaker_embeddings: true
+  max_decode_steps: 200
diff --git a/examples/offline_inference/ming_tts/end2end.py b/examples/offline_inference/ming_tts/end2end.py
index 9e9742f4e7e..e86c3b969f1 100644
--- a/examples/offline_inference/ming_tts/end2end.py
+++ b/examples/offline_inference/ming_tts/end2end.py
@@ -2,440 +2,171 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Offline inference demo for Ming-omni-tts via vLLM Omni."""
 
-import asyncio
 import json
 import os
-import time
-import uuid
-import wave
 from pathlib import Path
 
 import soundfile as sf
 import torch
 import torchaudio
+import yaml
 from transformers import AutoTokenizer
-from vllm import SamplingParams
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 
-from vllm_omni import AsyncOmni, Omni
 from vllm_omni.model_executor.models.ming_tts.config_ming_tts import (
     KEY_CFG,
     KEY_MAX_DECODE_STEPS,
     KEY_SIGMA,
-    KEY_SPEAKER_EMBEDDING,
     KEY_TEMPERATURE,
     SAMPLE_RATE,
-    TEXT_EOS_TOKEN_ID,
 )
 from vllm_omni.model_executor.models.ming_tts.prompt_builder import build_ming_dense_prompt
 from vllm_omni.model_executor.models.ming_tts.speaker_extractor import MingSpeakerEmbeddingExtractor
 
-DEFAULT_MODEL = "inclusionAI/Ming-omni-tts-0.5B"
-DEFAULT_STAGE_CONFIG = "vllm_omni/model_executor/stage_configs/ming_tts.yaml"
-DEFAULT_STREAM_STAGE_CONFIG = "vllm_omni/model_executor/stage_configs/ming_tts_async_chunk.yaml"
-DEFAULT_OUTPUT_DIR = "output_audio"
-DEFAULT_SPEECH_PROMPT = "Please generate speech based on the following description.\n"
-DEFAULT_MUSIC_PROMPT = "Please generate music based on the following description.\n"
-DEFAULT_PODCAST_TEXT = (
-    " speaker_1:你可以说一下，就大概说一下，可能虽然我也不知道，我看过那部电影没有。\n"
-    " speaker_2:就是那个叫什么，变相一节课的嘛。\n"
-    " speaker_1:嗯。\n"
-    " speaker_2:一部搞笑的电影。\n"
-    " speaker_1:一部搞笑的。\n"
-)
-DEFAULT_PODCAST_PROMPT_TEXT = (
-    " speaker_1:并且我们还要进行每个月还要考核 笔试的话还要进行笔试，做个，当服务员还要去笔试了\n"
-    " speaker_2:对啊，这真的很奇怪，就是 单纯的因，单纯自己工资不高，只是因为可能人家那个店比较出名一点，就对你苛刻要求\n"
-)
+try:
+    from ._runner import (
+        build_manifest,
+        build_sampling_params,
+        resolve_metadata_json,
+        resolve_stats_log_file,
+        run_generation,
+    )
+except ImportError:
+    from _runner import (
+        build_manifest,
+        build_sampling_params,
+        resolve_metadata_json,
+        resolve_stats_log_file,
+        run_generation,
+    )
 
-CASE_DEFAULTS = {
-    "style": {
-        "prompt": DEFAULT_SPEECH_PROMPT,
-        "text": "我会一直在这里陪着你，直到你慢慢、慢慢地沉入那个最温柔的梦里……好吗？",
-        "instruction": {
-            "风格": (
-                "这是一种ASMR耳语，属于一种旨在引发特殊感官体验的创意风格。"
-                "这个女性使用轻柔的普通话进行耳语，声音气音成分重。"
-                "音量极低，紧贴麦克风，语速极慢，旨在制造触发听者颅内快感的声学刺激。"
-            )
-        },
-        "use_zero_spk_emb": True,
-        "max_decode_steps": 200,
-    },
-    "ip": {
-        "prompt": DEFAULT_SPEECH_PROMPT,
-        "text": "这款产品的名字，叫变态坑爹牛肉丸。",
-        "instruction": {"IP": "灵小甄"},
-        "use_zero_spk_emb": True,
-        "max_decode_steps": 200,
-    },
-    "bgm": {
-        "prompt": DEFAULT_MUSIC_PROMPT,
-        "text": "Genre: 电子舞曲. Mood: 自信 / 坚定. Instrument: 架子鼓. Theme: 节日. Duration: 30s.",
-        "instruction": None,
-        "use_zero_spk_emb": False,
-        "max_decode_steps": 400,
-    },
-    "tta": {
-        "prompt": "Please generate audio events based on given text.\n",
-        "text": "Thunder and a gentle rain",
-        "instruction": None,
-        "use_zero_spk_emb": False,
-        "max_decode_steps": 200,
-        "cfg": 4.5,
-        "sigma": 0.3,
-        "temperature": 2.5,
-    },
-    "emotion": {
-        "prompt": DEFAULT_SPEECH_PROMPT,
-        "text": "我竟然抢到了陈奕迅的演唱会门票！太棒了！终于可以现场听一听他的歌声了！",
-        "instruction": {"情感": "高兴"},
-        "requires_ref_audio": True,
-        "auto_extract_speaker_embeddings": True,
-        "max_decode_steps": 200,
-    },
-    "basic": {
-        "prompt": DEFAULT_SPEECH_PROMPT,
-        "text": "简单地说，这相当于惠普把消费领域市场拱手相让了。",
-        "instruction": {"语速": "快速", "基频": "中", "音量": "高"},
-        "requires_ref_audio": True,
-        "auto_extract_speaker_embeddings": True,
-        "max_decode_steps": 200,
-    },
-    "dialect": {
-        "prompt": DEFAULT_SPEECH_PROMPT,
-        "text": "我觉得社会企业同个人都有责任",
-        "instruction": {"方言": "广粤话"},
-        "requires_ref_audio": True,
-        "auto_extract_speaker_embeddings": True,
-        "max_decode_steps": 200,
-    },
-    "zero_shot": {
-        "prompt": DEFAULT_SPEECH_PROMPT,
-        "text": "我们的愿景是构建未来服务业的数字化基础设施，为世界带来更多微小而美好的改变。",
-        "instruction": None,
-        "requires_ref_audio": True,
-        "requires_ref_text": True,
-        "auto_extract_speaker_embeddings": True,
-        "max_decode_steps": 200,
-    },
-    "podcast": {
-        "prompt": DEFAULT_SPEECH_PROMPT,
-        "text": DEFAULT_PODCAST_TEXT,
-        "instruction": None,
-        "prompt_text": DEFAULT_PODCAST_PROMPT_TEXT,
-        "requires_ref_audio_count": 2,
-        "auto_extract_speaker_embeddings": True,
-        "max_decode_steps": 200,
-    },
-    "speech_bgm": {
-        "prompt": DEFAULT_SPEECH_PROMPT,
-        "text": "此次业绩下滑原因，可归结为企业停止服务某些品牌，而带来的负面影响。",
-        "instruction": {
-            "BGM": {
-                "Genre": "当代古典音乐.",
-                "Mood": "温暖 / 友善.",
-                "Instrument": "电吉他",
-                "Theme": "节日.",
-                "SNR": 10.0,
-                "ENV": None,
-            }
-        },
-        "requires_ref_audio": True,
-        "auto_extract_speaker_embeddings": True,
-        "max_decode_steps": 200,
-    },
-    "speech_sound": {
-        "prompt": DEFAULT_SPEECH_PROMPT,
-        "text": "此次业绩下滑原因，可归结为企业停止服务某些品牌，而带来的负面影响。",
-        "instruction": {
-            "BGM": {
-                "ENV": "Birds chirping",
-                "SNR": 10.0,
-                "Genre": None,
-                "Mood": None,
-                "Instrument": None,
-                "Theme": None,
-            }
-        },
-        "requires_ref_audio": True,
-        "auto_extract_speaker_embeddings": True,
-        "max_decode_steps": 200,
-    },
-}
+_DEFAULT_MODEL = "inclusionAI/Ming-omni-tts-0.5B"
+_DEFAULT_DEPLOY_CONFIG = "vllm_omni/deploy/ming_tts.yaml"
+_CASES_FILE = Path(__file__).with_name("cases.yaml")
+
+CASE_DEFAULTS = yaml.safe_load(_CASES_FILE.read_text(encoding="utf-8")) or {}
+if not CASE_DEFAULTS:
+    raise RuntimeError(f"Empty or missing case definitions in {_CASES_FILE}")
+
+
+def _build_parser() -> FlexibleArgumentParser:
+    """Build the CLI parser for the example; cross-option checks live in ``_finalize_args``."""
+    p = FlexibleArgumentParser(description="Offline Ming-omni-tts example")
+    p.add_argument("--model", default=_DEFAULT_MODEL, help="Model name or local path")
+    p.add_argument("--deploy-config", default=None, help=f"Deploy config YAML (default: {_DEFAULT_DEPLOY_CONFIG})")
+    p.add_argument("--case", choices=sorted(CASE_DEFAULTS), default="style", help="Built-in demo case")
+    p.add_argument("--text", default=None, help="Override case text")
+    p.add_argument("--prompt", default=None, help="Override the system prompt prefix")
+    p.add_argument("--instructions", default=None, help="Free-form Ming instruction string")
+    p.add_argument(
+        "--instruction-json", default=None, help='Structured Ming instruction JSON, e.g. \'{"方言":"广粤话"}\''
+    )
+    p.add_argument("--ref-audio", default=None, help="Single reference audio path for cloning-style cases")
+    p.add_argument("--ref-audio-paths", nargs="+", default=None, help="Multiple reference audio paths (podcast)")
+    p.add_argument("--ref-text", default=None, help="Reference transcript; required for zero_shot")
+    p.add_argument("--speaker-embedding", default=None, help="Path to a JSON speaker embedding file")
+    p.add_argument(
+        "--extract-speaker-embeddings", action="store_true",
+        help="Extract speaker embeddings from ref audio via campplus.onnx",
+    )
+    p.add_argument("--max-decode-steps", type=int, default=None, help="Override ming_max_decode_steps")
+    p.add_argument("--output-dir", default="output_audio", help="Directory for output wav files")
+    p.add_argument("--output-name", default=None, help="Output wav filename")
+    p.add_argument("--num-prompts", type=int, default=1, help="Repeat the same prompt N times")
+    p.add_argument("--streaming", action="store_true", help="Use AsyncOmni with async_chunk streaming")
+    p.add_argument("--trust-remote-code", action="store_true")
+    p.add_argument("--enforce-eager", action="store_true")
+    p.add_argument(
+        "--log-stats", "--enable-stats", dest="log_stats", action="store_true", help="Enable Omni stats logging"
+    )
+    p.add_argument("--stats-log-file", default=None, help="Path for the Omni stats log file")
+    p.add_argument("--metadata-json", default=None, help="Path for the run manifest JSON")
+    p.add_argument("--stage-init-timeout", type=int, default=300, help="Per-stage init timeout (s)")
+    p.add_argument("--init-timeout", type=int, default=600, help="Total init timeout (s)")
+    p.add_argument("--batch-timeout", type=int, default=5, help="Batch timeout (s)")
+    p.add_argument("--shm-threshold-bytes", type=int, default=65536)
+    p.add_argument("--worker-backend", default="multi_process", choices=["multi_process", "ray"])
+    p.add_argument("--ray-address", default=None, help="Ray cluster address (--worker-backend ray)")
+    return p
+
+
+def _finalize_args(args) -> None:
+    """Reject inconsistent CLI options, then default ``deploy_config`` when unset."""
+    if None not in (args.instructions, args.instruction_json):
+        raise RuntimeError("Use either --instructions or --instruction-json, not both")
+    if args.num_prompts < 1:
+        raise RuntimeError("--num-prompts must be at least 1")
+    if args.streaming and args.num_prompts > 1:
+        raise RuntimeError("--streaming currently supports exactly one prompt")
+    args.deploy_config = _DEFAULT_DEPLOY_CONFIG if args.deploy_config is None else args.deploy_config
 
 
-def _load_reference_waveform(path: str) -> torch.Tensor:
-    samples, sample_rate = sf.read(path, dtype="float32")
-    waveform = torch.as_tensor(samples, dtype=torch.float32)
-    if waveform.ndim == 2:
-        waveform = waveform.mean(dim=1)
-    waveform = waveform.reshape(1, -1)
-    if int(sample_rate) != SAMPLE_RATE:
-        waveform = torchaudio.functional.resample(waveform, int(sample_rate), SAMPLE_RATE)
-    return waveform
+def _load_waveform(path: str) -> torch.Tensor:
+    """Read ``path`` as a ``(1, num_samples)`` float32 tensor at ``SAMPLE_RATE``."""
+    samples, source_rate = sf.read(path, dtype="float32")
+    mono = torch.as_tensor(samples, dtype=torch.float32)
+    # 2-D input is averaged over its second axis (assumed to be channels) to get mono.
+    mono = (mono.mean(dim=1) if mono.ndim == 2 else mono).reshape(1, -1)
+    if int(source_rate) == SAMPLE_RATE:
+        return mono
+    return torchaudio.functional.resample(mono, int(source_rate), SAMPLE_RATE)
 
 
 def _load_speaker_embedding(path: str) -> torch.Tensor:
-    data = json.loads(Path(path).read_text(encoding="utf-8"))
-    return torch.as_tensor(data, dtype=torch.float32)
+    return torch.as_tensor(json.loads(Path(path).read_text(encoding="utf-8")), dtype=torch.float32)
 
 
-def _resolve_reference_inputs(args, case):
+def _ref_audio_paths(args) -> list[str]:
     if args.ref_audio is not None and args.ref_audio_paths is not None:
         raise RuntimeError("Use either --ref-audio or --ref-audio-paths, not both")
-
     if args.ref_audio_paths is not None:
-        ref_audio_paths = list(args.ref_audio_paths)
-    elif args.ref_audio is not None:
-        ref_audio_paths = [args.ref_audio]
-    else:
-        ref_audio_paths = []
+        return list(args.ref_audio_paths)
+    return [args.ref_audio] if args.ref_audio else []
 
-    required_count = int(case.get("requires_ref_audio_count", 0))
-    if required_count > 0:
-        if len(ref_audio_paths) < required_count:
-            raise RuntimeError(
-                f"Case '{args.case}' requires at least {required_count} reference audio paths via --ref-audio-paths"
-            )
-    elif case.get("requires_ref_audio") and not ref_audio_paths:
-        raise RuntimeError(f"--ref-audio is required for case '{args.case}'")
 
-    if not ref_audio_paths:
+def _resolve_reference_inputs(args, case: dict, paths: list[str]):
+    required = int(case.get("requires_ref_audio_count", 0))
+    if required > 0 and len(paths) < required:
+        raise RuntimeError(f"Case '{args.case}' needs {required} ref audio paths via --ref-audio-paths")
+    if required <= 0 and case.get("requires_ref_audio") and not paths:
+        raise RuntimeError(f"--ref-audio required for case '{args.case}'")
+    if not paths:
         return None
-    if len(ref_audio_paths) == 1:
-        return _load_reference_waveform(ref_audio_paths[0])
-    return [_load_reference_waveform(path) for path in ref_audio_paths]
-
-
-def _resolve_reference_audio_paths(args):
-    if args.ref_audio is not None and args.ref_audio_paths is not None:
-        raise RuntimeError("Use either --ref-audio or --ref-audio-paths, not both")
-    if args.ref_audio_paths is not None:
-        return list(args.ref_audio_paths)
-    if args.ref_audio is not None:
-        return [args.ref_audio]
-    return []
+    wavs = [_load_waveform(p) for p in paths]
+    return wavs[0] if len(wavs) == 1 else wavs
 
 
-def _resolve_speaker_embedding(args, case, ref_audio_paths):
+def _resolve_speaker_embedding(args, case: dict, paths: list[str]):
     if args.speaker_embedding:
         return _load_speaker_embedding(args.speaker_embedding)
-
-    should_extract = bool(case.get("auto_extract_speaker_embeddings", False) or args.extract_speaker_embeddings)
-    if not should_extract or not ref_audio_paths:
+    if not (case.get("auto_extract_speaker_embeddings") or args.extract_speaker_embeddings) or not paths:
         return None
-
-    extractor = MingSpeakerEmbeddingExtractor(args.model)
-    embeddings = extractor.extract_many(ref_audio_paths)
-    if not embeddings:
+    embs = MingSpeakerEmbeddingExtractor(args.model).extract_many(paths)
+    if not embs:
         raise RuntimeError("Speaker extraction produced no embeddings")
-    if len(embeddings) == 1:
-        return embeddings[0]
-    return torch.stack(embeddings, dim=0)
-
-
-def _coerce_audio_tensor(audio, *, async_chunk: bool) -> torch.Tensor:
-    if isinstance(audio, list):
-        if async_chunk:
-            parts = []
-            for item in audio:
-                tensor = torch.as_tensor(item, dtype=torch.float32).reshape(-1)
-                if tensor.numel() > 0:
-                    parts.append(tensor)
-            if not parts:
-                return torch.zeros((0,), dtype=torch.float32)
-            return torch.cat(parts, dim=0)
-
-        for item in reversed(audio):
-            tensor = torch.as_tensor(item, dtype=torch.float32).reshape(-1)
-            if tensor.numel() > 0:
-                return tensor
-        return torch.zeros((0,), dtype=torch.float32)
-
-    return torch.as_tensor(audio, dtype=torch.float32).reshape(-1)
-
-
-def _resolve_sr(sr) -> int:
-    if isinstance(sr, list):
-        sr = sr[-1]
-    if hasattr(sr, "item"):
-        return int(sr.item())
-    return int(sr)
-
-
-def _extract_sample_rate(multimodal_output: dict) -> int:
-    sr = multimodal_output.get("sr")
-    if sr is None:
-        raise RuntimeError("Expected multimodal_output['sr']")
-    return _resolve_sr(sr)
-
-
-def _write_wav(path: str, audio: torch.Tensor, sample_rate: int) -> None:
-    audio = audio.clamp(-1.0, 1.0)
-    pcm16 = (audio * 32767.0).round().to(torch.int16).cpu().numpy()
-    with wave.open(path, "wb") as wav_file:
-        wav_file.setnchannels(1)
-        wav_file.setsampwidth(2)
-        wav_file.setframerate(int(sample_rate))
-        wav_file.writeframes(pcm16.tobytes())
-
-
-def _request_index(request_id: str | None, fallback: int) -> int:
-    try:
-        return int(request_id)
-    except (TypeError, ValueError):
-        if isinstance(request_id, str):
-            head = request_id.split("_", 1)[0]
-            if head.isdigit():
-                return int(head)
-    return fallback
-
-
-def _audio_summary(audio: torch.Tensor, sample_rate: int) -> dict:
-    waveform = audio.detach().cpu().reshape(-1).to(torch.float32)
-    return {
-        "sample_rate": int(sample_rate),
-        "num_samples": int(waveform.numel()),
-        "duration_seconds": float(waveform.numel()) / float(sample_rate),
-        "max_abs_amplitude": float(waveform.abs().max().item()) if waveform.numel() > 0 else 0.0,
-    }
-
-
-def _resolve_output_name(output_name: str | None, case: str, index: int, total: int) -> str:
-    if total == 1:
-        return output_name or f"ming_{case}.wav"
-    base = Path(output_name or f"ming_{case}.wav")
-    return f"{base.stem}_{index:05d}{base.suffix or '.wav'}"
-
-
-def _resolve_stats_log_file(args) -> str | None:
-    if not args.log_stats:
-        return None
-    if args.stats_log_file:
-        return args.stats_log_file
-    base = Path(args.output_name or f"ming_{args.case}.wav").stem
-    return str(Path(args.output_dir) / f"{base}_pipeline.log")
-
-
-def _resolve_metadata_json(args) -> str | None:
-    if args.metadata_json:
-        return args.metadata_json
-    if args.log_stats:
-        base = Path(args.output_name or f"ming_{args.case}.wav").stem
-        return str(Path(args.output_dir) / f"{base}_manifest.json")
-    return None
+    return embs[0] if len(embs) == 1 else torch.stack(embs, dim=0)
 
 
-def _build_manifest(args, prompt_payload, stats_log_file: str | None, outputs: list[dict]) -> dict:
-    additional_information = {}
-    if isinstance(prompt_payload, dict):
-        additional_information = dict(prompt_payload.get("additional_information", {}))
-    return {
-        "model": args.model,
-        "case": args.case,
-        "streaming": bool(args.streaming),
-        "stage_configs_path": args.stage_configs_path,
-        "enforce_eager": bool(args.enforce_eager),
-        "num_prompts": int(args.num_prompts),
-        "log_stats": bool(args.log_stats),
-        "stats_log_file": stats_log_file,
-        "prompt_text": additional_information.get("prompt_text"),
-        "instruction": additional_information.get("instruction"),
-        "speaker_embedding_shape": (
-            list(additional_information[KEY_SPEAKER_EMBEDDING].shape)
-            if KEY_SPEAKER_EMBEDDING in additional_information
-            and hasattr(additional_information[KEY_SPEAKER_EMBEDDING], "shape")
-            else None
-        ),
-        "outputs": outputs,
-        "generated_at_unix": time.time(),
-    }
-
-
-def _build_engine_kwargs(args, stats_log_file: str | None) -> dict:
-    kwargs = {
-        "model": args.model,
-        "stage_configs_path": args.stage_configs_path,
-        "enforce_eager": args.enforce_eager,
-        "trust_remote_code": args.trust_remote_code,
-        "log_stats": args.log_stats,
-        "stage_init_timeout": args.stage_init_timeout,
-        "init_timeout": args.init_timeout,
-        "batch_timeout": args.batch_timeout,
-        "shm_threshold_bytes": args.shm_threshold_bytes,
-        "worker_backend": args.worker_backend,
-    }
-    if stats_log_file is not None:
-        kwargs["log_file"] = stats_log_file
-    if args.ray_address is not None:
-        kwargs["ray_address"] = args.ray_address
-    return kwargs
-
-
-def _extract_audio_output(outputs, *, async_chunk: bool):
-    output = next((item for item in outputs if item.final_output_type == "audio"), None)
-    if output is None:
-        raise RuntimeError("Expected one final output with final_output_type='audio'")
-
-    multimodal_output = output.multimodal_output or {}
-    audio = multimodal_output.get("audio")
-    sr = multimodal_output.get("sr")
-    if audio is None or sr is None:
-        raise RuntimeError("Expected multimodal_output['audio'] and multimodal_output['sr']")
-
-    waveform = _coerce_audio_tensor(audio, async_chunk=async_chunk)
-    if waveform.numel() == 0:
-        raise RuntimeError("Generated audio waveform is empty")
-    return waveform, _resolve_sr(sr)
-
-
-def _build_instruction(args, case):
-    if args.instruction_json is not None:
-        return json.loads(args.instruction_json)
-    if args.instructions is not None:
-        return args.instructions
-    return case.get("instruction")
-
-
-def _build_prompt(tokenizer, args):
+def _build_prompt_payload(tokenizer, args):
     case = CASE_DEFAULTS[args.case]
-    prompt = args.prompt or case["prompt"]
-    text = args.text or case["text"]
-    instruction = _build_instruction(args, case)
+    paths = _ref_audio_paths(args)
     prompt_text = args.ref_text if args.ref_text is not None else case.get("prompt_text")
-    ref_audio_paths = _resolve_reference_audio_paths(args)
-    prompt_waveform = _resolve_reference_inputs(args, case) if prompt_text is not None else None
-
-    required_count = int(case.get("requires_ref_audio_count", 0))
-    if required_count > 0 and len(ref_audio_paths) < required_count:
-        raise RuntimeError(
-            f"Case '{args.case}' requires at least {required_count} reference audio paths via --ref-audio-paths"
-        )
-    if required_count <= 0 and case.get("requires_ref_audio") and not ref_audio_paths:
-        raise RuntimeError(f"--ref-audio is required for case '{args.case}'")
-
     if case.get("requires_ref_text") and not prompt_text:
-        raise RuntimeError(f"--ref-text is required for case '{args.case}'")
-
-    speaker_embedding = _resolve_speaker_embedding(args, case, ref_audio_paths)
-    use_zero_spk_emb = (
-        bool(case.get("use_zero_spk_emb", False)) and prompt_waveform is None and speaker_embedding is None
+        raise RuntimeError(f"--ref-text required for case '{args.case}'")
+    prompt_waveform = _resolve_reference_inputs(args, case, paths) if prompt_text is not None else None
+    speaker_embedding = _resolve_speaker_embedding(args, case, paths)
+    use_zero_spk_emb = bool(case.get("use_zero_spk_emb")) and prompt_waveform is None and speaker_embedding is None
+    runtime_controls = {KEY_MAX_DECODE_STEPS: args.max_decode_steps or case["max_decode_steps"]}
+    for key, field in [(KEY_CFG, "cfg"), (KEY_SIGMA, "sigma"), (KEY_TEMPERATURE, "temperature")]:
+        if field in case:
+            runtime_controls[key] = case[field]
+    instruction = (
+        json.loads(args.instruction_json) if args.instruction_json else (args.instructions or case.get("instruction"))
     )
-
-    runtime_controls = {
-        KEY_MAX_DECODE_STEPS: args.max_decode_steps or case["max_decode_steps"],
-    }
-    if "cfg" in case:
-        runtime_controls[KEY_CFG] = case["cfg"]
-    if "sigma" in case:
-        runtime_controls[KEY_SIGMA] = case["sigma"]
-    if "temperature" in case:
-        runtime_controls[KEY_TEMPERATURE] = case["temperature"]
     return build_ming_dense_prompt(
         tokenizer,
-        prompt=prompt,
-        text=text,
+        prompt=args.prompt or case["prompt"],
+        text=args.text or case["text"],
         runtime_controls=runtime_controls,
         instruction=instruction,
         prompt_text=prompt_text,
@@ -445,205 +176,19 @@ def _build_prompt(tokenizer, args):
     )
 
 
-async def _run_streaming(args, prompt_payload, sampling_params_list, output_dir, stats_log_file):
-    engine = AsyncOmni(**_build_engine_kwargs(args, stats_log_file))
-    try:
-        all_audio_chunks = []
-        accumulated_samples = 0
-        chunk_idx = 0
-        start_time = time.time()
-        chunk_times = []
-        ttfp_seconds = None
-        final_stage_output = None
-        async for stage_output in engine.generate(
-            prompt=prompt_payload,
-            request_id=str(uuid.uuid4()),
-            sampling_params_list=sampling_params_list,
-        ):
-            final_stage_output = stage_output
-            multimodal_output = stage_output.multimodal_output or {}
-            audio = multimodal_output.get("audio")
-            if audio is None:
-                continue
-
-            finished = stage_output.finished
-            if isinstance(audio, torch.Tensor):
-                if finished:
-                    audio_chunk = audio[accumulated_samples:].float().detach().cpu()
-                else:
-                    audio_chunk = audio.float().detach().cpu()
-            elif isinstance(audio, list):
-                audio_chunk = torch.as_tensor(audio[chunk_idx], dtype=torch.float32).reshape(-1).cpu()
-            else:
-                audio_chunk = torch.as_tensor(audio, dtype=torch.float32).reshape(-1).cpu()
-
-            accumulated_samples += int(audio_chunk.numel())
-            chunk_idx += 1
-            if audio_chunk.numel() > 0:
-                now = time.time()
-                if ttfp_seconds is None:
-                    ttfp_seconds = now - start_time
-                chunk_times.append(now)
-                all_audio_chunks.append(audio_chunk)
-
-        if not all_audio_chunks:
-            raise RuntimeError("Streaming Ming example produced no audio chunks")
-
-        waveform = torch.cat(all_audio_chunks, dim=0)
-        output_name = _resolve_output_name(args.output_name, args.case, 0, 1)
-        output_path = str(Path(output_dir) / output_name)
-        _write_wav(output_path, waveform, SAMPLE_RATE)
-        summary = {
-            "request_id": getattr(final_stage_output, "request_id", None),
-            "stage_id": getattr(final_stage_output, "stage_id", None),
-            "output_path": output_path,
-            "stage_durations": getattr(final_stage_output, "stage_durations", {}),
-            "peak_memory_mb": getattr(final_stage_output, "peak_memory_mb", 0.0),
-            "ttfp_seconds": ttfp_seconds,
-            "mean_inter_chunk_seconds": (
-                sum(t1 - t0 for t0, t1 in zip(chunk_times, chunk_times[1:])) / (len(chunk_times) - 1)
-                if len(chunk_times) > 1
-                else None
-            ),
-        }
-        summary.update(_audio_summary(waveform, SAMPLE_RATE))
-        print(f"Saved streaming output to {output_path}")
-        print(json.dumps(summary, ensure_ascii=False, indent=2))
-        return [summary]
-    finally:
-        engine.shutdown()
-
-
-def _run_non_streaming(args, prompt_payload, sampling_params_list, output_dir, stats_log_file):
-    engine = Omni(**_build_engine_kwargs(args, stats_log_file))
-    try:
-        outputs = engine.generate(
-            prompts=[prompt_payload for _ in range(args.num_prompts)],
-            sampling_params_list=sampling_params_list,
-            py_generator=False,
-        )
-        summaries = []
-        for fallback_index, output in enumerate(outputs):
-            if output.final_output_type != "audio":
-                continue
-            multimodal_output = output.multimodal_output or {}
-            waveform = _coerce_audio_tensor(multimodal_output.get("audio"), async_chunk=False)
-            sample_rate = _extract_sample_rate(multimodal_output)
-            request_index = _request_index(output.request_id, fallback_index)
-            output_name = _resolve_output_name(args.output_name, args.case, request_index, args.num_prompts)
-            output_path = str(Path(output_dir) / output_name)
-            _write_wav(output_path, waveform, sample_rate)
-            summary = {
-                "request_id": output.request_id,
-                "stage_id": output.stage_id,
-                "output_path": output_path,
-                "stage_durations": output.stage_durations,
-                "peak_memory_mb": output.peak_memory_mb,
-            }
-            summary.update(_audio_summary(waveform, sample_rate))
-            summaries.append(summary)
-            print(f"Saved output to {output_path}")
-            print(json.dumps(summary, ensure_ascii=False, indent=2))
-        if not summaries:
-            raise RuntimeError("Non-streaming Ming example produced no audio outputs")
-        return summaries
-    finally:
-        engine.close()
-
-
 def main():
-    parser = FlexibleArgumentParser(description="Offline Ming-omni-tts example")
-    parser.add_argument("--model", default=DEFAULT_MODEL, help="Model name or local path")
-    parser.add_argument(
-        "--stage-configs-path",
-        default=None,
-        help="Stage config path. Defaults to ming_tts.yaml or ming_tts_async_chunk.yaml when --streaming is set.",
-    )
-    parser.add_argument("--case", choices=sorted(CASE_DEFAULTS), default="style", help="Built-in demo case")
-    parser.add_argument("--text", default=None, help="Override case text")
-    parser.add_argument("--prompt", default=None, help="Override the system prompt prefix")
-    parser.add_argument("--instructions", default=None, help="Free-form Ming instruction string")
-    parser.add_argument(
-        "--instruction-json",
-        default=None,
-        help='Structured Ming instruction JSON, for example \'{"方言":"广粤话"}\'',
-    )
-    parser.add_argument("--ref-audio", default=None, help="Reference audio path for cloning")
-    parser.add_argument(
-        "--ref-audio-paths",
-        nargs="+",
-        default=None,
-        help="Multiple reference audio paths, used by multi-speaker cases like podcast",
-    )
-    parser.add_argument("--ref-text", default=None, help="Reference transcript for cloning")
-    parser.add_argument("--speaker-embedding", default=None, help="Path to a JSON speaker embedding file")
-    parser.add_argument(
-        "--extract-speaker-embeddings",
-        action="store_true",
-        help="Extract 192-d Ming speaker embeddings from --ref-audio or --ref-audio-paths using campplus.onnx",
-    )
-    parser.add_argument("--max-decode-steps", type=int, default=None, help="Override ming_max_decode_steps")
-    parser.add_argument("--output-dir", default=DEFAULT_OUTPUT_DIR, help="Directory for output wav files")
-    parser.add_argument("--output-name", default=None, help="Output wav filename")
-    parser.add_argument("--num-prompts", type=int, default=1, help="Repeat the same prompt N times")
-    parser.add_argument("--streaming", action="store_true", help="Use AsyncOmni with async_chunk streaming")
-    parser.add_argument("--trust-remote-code", action="store_true", help="Pass trust_remote_code to Omni")
-    parser.add_argument("--enforce-eager", action="store_true", help="Pass enforce_eager to Omni")
-    parser.add_argument(
-        "--log-stats", "--enable-stats", dest="log_stats", action="store_true", help="Enable Omni stats logging"
-    )
-    parser.add_argument("--stats-log-file", default=None, help="Optional path for the Omni stats log file")
-    parser.add_argument("--metadata-json", default=None, help="Optional path for a run manifest JSON file")
-    parser.add_argument(
-        "--stage-init-timeout", type=int, default=300, help="Per-stage initialization timeout in seconds"
-    )
-    parser.add_argument("--init-timeout", type=int, default=600, help="Total initialization timeout in seconds")
-    parser.add_argument("--batch-timeout", type=int, default=5, help="Batch timeout in seconds")
-    parser.add_argument("--shm-threshold-bytes", type=int, default=65536, help="Shared memory threshold in bytes")
-    parser.add_argument(
-        "--worker-backend",
-        type=str,
-        default="multi_process",
-        choices=["multi_process", "ray"],
-        help="Worker backend",
-    )
-    parser.add_argument("--ray-address", default=None, help="Ray cluster address when --worker-backend ray is used")
-    args = parser.parse_args()
-
-    if args.instructions is not None and args.instruction_json is not None:
-        raise RuntimeError("Use either --instructions or --instruction-json, not both")
-    if args.num_prompts < 1:
-        raise RuntimeError("--num-prompts must be at least 1")
-    if args.streaming and args.num_prompts != 1:
-        raise RuntimeError("--streaming currently supports exactly one prompt")
-
-    if args.stage_configs_path is None:
-        args.stage_configs_path = DEFAULT_STREAM_STAGE_CONFIG if args.streaming else DEFAULT_STAGE_CONFIG
-
+    args = _build_parser().parse_args()
+    _finalize_args(args)
     tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=False)
-    prompt_payload = _build_prompt(tokenizer, args)
-
-    max_decode_steps = args.max_decode_steps or CASE_DEFAULTS[args.case]["max_decode_steps"]
-    sampling_params_list = [
-        SamplingParams(
-            temperature=0.0,
-            max_tokens=max_decode_steps + 1,
-            stop_token_ids=[int(TEXT_EOS_TOKEN_ID)],
-        ),
-        SamplingParams(temperature=0.0, max_tokens=1),
-    ]
-
+    prompt_payload = _build_prompt_payload(tokenizer, args)
+    case = CASE_DEFAULTS[args.case]
+    sampling_params_list = build_sampling_params(args.max_decode_steps or case["max_decode_steps"])
     output_dir = Path(args.output_dir)
     output_dir.mkdir(parents=True, exist_ok=True)
-    stats_log_file = _resolve_stats_log_file(args)
-
-    if args.streaming:
-        summaries = asyncio.run(_run_streaming(args, prompt_payload, sampling_params_list, output_dir, stats_log_file))
-    else:
-        summaries = _run_non_streaming(args, prompt_payload, sampling_params_list, output_dir, stats_log_file)
-
-    metadata_json = _resolve_metadata_json(args)
-    manifest = _build_manifest(args, prompt_payload, stats_log_file, summaries)
+    stats_log_file = resolve_stats_log_file(args)
+    summaries = run_generation(args, prompt_payload, sampling_params_list, output_dir, stats_log_file)
+    metadata_json = resolve_metadata_json(args)
+    manifest = build_manifest(args, prompt_payload, stats_log_file, summaries)
     if metadata_json is not None:
         Path(metadata_json).write_text(json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8")
         print(f"Saved run manifest to {metadata_json}")
diff --git a/examples/online_serving/ming_tts/README.md b/examples/online_serving/ming_tts/README.md
index 76f8521a4fe..f61b068bebf 100644
--- a/examples/online_serving/ming_tts/README.md
+++ b/examples/online_serving/ming_tts/README.md
@@ -14,7 +14,7 @@ Please refer to [README.md](../../../README.md)
 
 ```bash
 vllm-omni serve inclusionAI/Ming-omni-tts-0.5B \
-    --stage-configs-path vllm_omni/model_executor/stage_configs/ming_tts_async_chunk.yaml \
+    --deploy-config vllm_omni/deploy/ming_tts.yaml \
     --omni \
     --port 8091 \
     --enforce-eager
@@ -31,7 +31,7 @@ The recommended online-serving path is eager async-chunk mode through
 `/v1/audio/speech`. `run_server.sh` defaults to:
 
 - model: `inclusionAI/Ming-omni-tts-0.5B`
-- stage config: `vllm_omni/model_executor/stage_configs/ming_tts_async_chunk.yaml`
+- deploy config: `vllm_omni/deploy/ming_tts.yaml`
 - auth: local testing only, no real OpenAI key required
 
 ## Send Requests
@@ -131,24 +131,17 @@ python openai_speech_client.py \
 
 ### Curl examples
 
-Use the helper script for the common request types:
+`run_curl.sh` is intentionally minimal and covers only three sanity checks:
 
 ```bash
 ./run_curl.sh basic
-./run_curl.sh style
-./run_curl.sh ip
-REF_AUDIO=/path/to/emotion_prompt.wav ./run_curl.sh emotion
-REF_AUDIO=/path/to/yue_prompt.wav ./run_curl.sh dialect
 REF_AUDIO=/path/to/reference.wav REF_TEXT="在此奉劝大家别乱打美白针。" ./run_curl.sh zero_shot
-REF_AUDIO=/path/to/speaker_1.wav REF_AUDIO_2=/path/to/speaker_2.wav REF_TEXT="speaker_1:你好。 speaker_2:你好。" ./run_curl.sh podcast
-REF_AUDIO=/path/to/00000309-00000300.wav ./run_curl.sh speech_bgm
-REF_AUDIO=/path/to/00000309-00000300.wav ./run_curl.sh speech_sound
-REF_AUDIO=/path/to/reference.wav REF_TEXT="在此奉劝大家别乱打美白针。" ./run_curl.sh clone_ref_audio
-SPEAKER_EMBEDDING=/path/to/ming_speaker_embedding.json ./run_curl.sh clone_embedding
 ./run_curl.sh stream
 ```
 
-Or send a direct request:
+For all other request types, use the direct `curl` payloads shown below.
+
+Basic speech:
 
 ```bash
 curl -X POST http://localhost:8091/v1/audio/speech \
@@ -162,6 +155,117 @@ curl -X POST http://localhost:8091/v1/audio/speech \
     --output ming_output.wav
 ```
 
+Style-conditioned speech:
+
+```bash
+curl -X POST http://localhost:8091/v1/audio/speech \
+    -H "Content-Type: application/json" \
+    -H "Authorization: Bearer EMPTY" \
+    -d '{
+        "model": "inclusionAI/Ming-omni-tts-0.5B",
+        "input": "我会一直在这里陪着你。",
+        "instructions": "轻柔的ASMR耳语，慢速，贴近麦克风",
+        "response_format": "wav"
+    }' \
+    --output ming_style.wav
+```
+
+IP voice generation:
+
+```bash
+curl -X POST http://localhost:8091/v1/audio/speech \
+    -H "Content-Type: application/json" \
+    -H "Authorization: Bearer EMPTY" \
+    -d '{
+        "model": "inclusionAI/Ming-omni-tts-0.5B",
+        "input": "这款产品的名字，叫变态坑爹牛肉丸。",
+        "voice": "灵小甄",
+        "response_format": "wav"
+    }' \
+    --output ming_ip.wav
+```
+
+Dialect control with structured instructions:
+
+```bash
+curl -X POST http://localhost:8091/v1/audio/speech \
+    -H "Content-Type: application/json" \
+    -H "Authorization: Bearer EMPTY" \
+    -d '{
+        "model": "inclusionAI/Ming-omni-tts-0.5B",
+        "input": "我觉得社会企业同个人都有责任",
+        "instructions": "{\"方言\":\"广粤话\"}",
+        "ref_audio": "data:audio/wav;base64,<BASE64_WAV>",
+        "response_format": "wav"
+    }' \
+    --output ming_dialect.wav
+```
+
+Zero-shot cloning with transcript:
+
+```bash
+curl -X POST http://localhost:8091/v1/audio/speech \
+    -H "Content-Type: application/json" \
+    -H "Authorization: Bearer EMPTY" \
+    -d '{
+        "model": "inclusionAI/Ming-omni-tts-0.5B",
+        "input": "我们的愿景是构建未来服务业的数字化基础设施。",
+        "ref_audio": "data:audio/wav;base64,<BASE64_WAV>",
+        "ref_text": "在此奉劝大家别乱打美白针。",
+        "response_format": "wav"
+    }' \
+    --output ming_zero_shot.wav
+```
+
+Podcast-style multi-speaker prompt:
+
+```bash
+curl -X POST http://localhost:8091/v1/audio/speech \
+    -H "Content-Type: application/json" \
+    -H "Authorization: Bearer EMPTY" \
+    -d '{
+        "model": "inclusionAI/Ming-omni-tts-0.5B",
+        "input": "speaker_1:你可以说一下。 speaker_2:我也不知道。",
+        "ref_audio": [
+            "data:audio/wav;base64,<BASE64_SPK1>",
+            "data:audio/wav;base64,<BASE64_SPK2>"
+        ],
+        "ref_text": "speaker_1:你好。 speaker_2:你好。",
+        "response_format": "wav"
+    }' \
+    --output ming_podcast.wav
+```
+
+Speaker-embedding cloning (the three-value vector below is only a placeholder; send the full 192-d Ming speaker embedding):
+
+```bash
+curl -X POST http://localhost:8091/v1/audio/speech \
+    -H "Content-Type: application/json" \
+    -H "Authorization: Bearer EMPTY" \
+    -d '{
+        "model": "inclusionAI/Ming-omni-tts-0.5B",
+        "input": "你好，这是一段使用说话人向量的合成语音。",
+        "speaker_embedding": [0.0, 0.0, 0.0],
+        "response_format": "wav"
+    }' \
+    --output ming_embedding.wav
+```
+
+Streaming PCM response:
+
+```bash
+curl -N -X POST http://localhost:8091/v1/audio/speech \
+    -H "Content-Type: application/json" \
+    -H "Authorization: Bearer EMPTY" \
+    -d '{
+        "model": "inclusionAI/Ming-omni-tts-0.5B",
+        "input": "你好，这是流式测试。",
+        "stream": true,
+        "response_format": "pcm"
+    }' \
+    --output ming_stream.pcm
+```
+
 ## Request Types
 
 Ming online serving supports these main request families through
diff --git a/examples/online_serving/ming_tts/run_curl.sh b/examples/online_serving/ming_tts/run_curl.sh
index 92762462e25..10fcaf59ae0 100755
--- a/examples/online_serving/ming_tts/run_curl.sh
+++ b/examples/online_serving/ming_tts/run_curl.sh
@@ -1,20 +1,4 @@
 #!/bin/bash
-# Common curl examples for Ming-omni-tts via /v1/audio/speech.
-#
-# Usage:
-#   ./run_curl.sh basic
-#   ./run_curl.sh style
-#   ./run_curl.sh ip
-#   REF_AUDIO=/path/to/ref.wav ./run_curl.sh emotion
-#   REF_AUDIO=/path/to/ref.wav ./run_curl.sh dialect
-#   REF_AUDIO=/path/to/ref.wav REF_TEXT="参考文本" ./run_curl.sh zero_shot
-#   REF_AUDIO=/path/to/speaker1.wav REF_AUDIO_2=/path/to/speaker2.wav REF_TEXT="speaker_1:... speaker_2:..." ./run_curl.sh podcast
-#   REF_AUDIO=/path/to/mix_ref.wav ./run_curl.sh speech_bgm
-#   REF_AUDIO=/path/to/mix_ref.wav ./run_curl.sh speech_sound
-#   REF_AUDIO=/path/to/ref.wav REF_TEXT="参考文本" ./run_curl.sh clone_ref_audio
-#   SPEAKER_EMBEDDING=/path/to/ming_embedding.json ./run_curl.sh clone_embedding
-#   ./run_curl.sh stream
-
 set -euo pipefail
 
 MODE="${1:-basic}"
@@ -26,192 +10,67 @@ TEXT="${TEXT:-你好，这是 Ming 在线语音合成测试。}"
 OUTPUT="${OUTPUT:-ming_output.wav}"
 STREAM_OUTPUT="${STREAM_OUTPUT:-ming_output.pcm}"
 REF_AUDIO="${REF_AUDIO:-}"
-REF_AUDIO_2="${REF_AUDIO_2:-}"
 REF_TEXT="${REF_TEXT:-}"
-SPEAKER_EMBEDDING="${SPEAKER_EMBEDDING:-}"
-
-build_payload() {
-    MODEL="$1" \
-    TEXT="$2" \
-    VOICE="$3" \
-    INSTRUCTIONS="$4" \
-    TASK_TYPE="$5" \
-    REF_AUDIO_PATH="$6" \
-    REF_TEXT="$7" \
-    SPEAKER_EMBEDDING_PATH="$8" \
-    STREAM="$9" \
-    REF_AUDIO_PATH_2="${10:-}" \
-    python - <<'PY'
-import base64
-import json
-import mimetypes
-import os
-import pathlib
-import sys
-
-payload = {
-    "model": os.environ["MODEL"],
-    "input": os.environ["TEXT"],
-}
-
-voice = os.environ["VOICE"]
-instructions = os.environ["INSTRUCTIONS"]
-task_type = os.environ["TASK_TYPE"]
-ref_audio_path = os.environ["REF_AUDIO_PATH"]
-ref_audio_path_2 = os.environ["REF_AUDIO_PATH_2"]
-ref_text = os.environ["REF_TEXT"]
-speaker_embedding_path = os.environ["SPEAKER_EMBEDDING_PATH"]
-
-if voice:
-    payload["voice"] = voice
-if instructions:
-    payload["instructions"] = instructions
-if task_type:
-    payload["task_type"] = task_type
-ref_audio_items = []
-if ref_audio_path:
-    path = pathlib.Path(ref_audio_path)
-    mime_type = mimetypes.guess_type(path.name)[0] or "audio/wav"
-    data = base64.b64encode(path.read_bytes()).decode("utf-8")
-    ref_audio_items.append(f"data:{mime_type};base64,{data}")
-if ref_audio_path_2:
-    path = pathlib.Path(ref_audio_path_2)
-    mime_type = mimetypes.guess_type(path.name)[0] or "audio/wav"
-    data = base64.b64encode(path.read_bytes()).decode("utf-8")
-    ref_audio_items.append(f"data:{mime_type};base64,{data}")
-if ref_audio_items:
-    payload["ref_audio"] = ref_audio_items[0] if len(ref_audio_items) == 1 else ref_audio_items
-if ref_text:
-    payload["ref_text"] = ref_text
-if speaker_embedding_path:
-    path = pathlib.Path(speaker_embedding_path)
-    data = json.loads(path.read_text(encoding="utf-8"))
-    if not isinstance(data, list):
-        raise SystemExit("speaker embedding file must contain a JSON list")
-    payload["speaker_embedding"] = data
 
-stream = os.environ["STREAM"] == "true"
-if stream:
-    payload["stream"] = True
-    payload["response_format"] = "pcm"
-else:
-    payload["response_format"] = "wav"
-
-print(json.dumps(payload, ensure_ascii=False))
-PY
-}
-
-require_file() {
-    local path="$1"
-    local flag_name="$2"
-    if [ -z "$path" ]; then
-        echo "Missing ${flag_name}" >&2
-        exit 1
-    fi
-    if [ ! -f "$path" ]; then
-        echo "File not found for ${flag_name}: $path" >&2
-        exit 1
-    fi
-}
-
-base_headers=(
-    -H "Content-Type: application/json"
-    -H "Authorization: Bearer EMPTY"
-)
-
-post_payload() {
+post_json() {
     local payload="$1"
     local output_path="$2"
-    local payload_file
-    payload_file="$(mktemp)"
-    trap 'rm -f "$payload_file"' RETURN
-    printf '%s' "$payload" > "$payload_file"
-    curl -X POST "$API_URL" "${base_headers[@]}" \
-        --data-binary "@${payload_file}" \
+    curl -X POST "$API_URL" \
+        -H "Content-Type: application/json" \
+        -H "Authorization: Bearer EMPTY" \
+        -d "$payload" \
         --output "$output_path"
 }
 
 case "$MODE" in
     basic)
-        PAYLOAD="$(build_payload "$MODEL" "$TEXT" "" "" "" "" "" "" "false")"
-        post_payload "$PAYLOAD" "$OUTPUT"
-        ;;
-    style)
-        PAYLOAD="$(build_payload "$MODEL" "$TEXT" "" "轻柔的ASMR耳语，慢速，贴近麦克风" "" "" "" "" "false")"
-        post_payload "$PAYLOAD" "$OUTPUT"
-        ;;
-    ip)
-        PAYLOAD="$(build_payload "$MODEL" "$TEXT" "灵小甄" "" "" "" "" "" "false")"
-        post_payload "$PAYLOAD" "$OUTPUT"
+        post_json "{
+            \"model\": \"${MODEL}\",
+            \"input\": \"${TEXT}\",
+            \"response_format\": \"wav\"
+        }" "$OUTPUT"
         ;;
-    emotion)
-        require_file "$REF_AUDIO" "REF_AUDIO"
-        PAYLOAD="$(build_payload "$MODEL" "$TEXT" "" '{"情感":"高兴"}' "" "$REF_AUDIO" "" "" "false")"
-        post_payload "$PAYLOAD" "$OUTPUT"
-        ;;
-    dialect)
-        require_file "$REF_AUDIO" "REF_AUDIO"
-        PAYLOAD="$(build_payload "$MODEL" "$TEXT" "" "" "" "$REF_AUDIO" "" "" "false")"
-        PAYLOAD="$(TEXT="$PAYLOAD" python - <<'PY'
+    zero_shot)
+        if [ -z "$REF_AUDIO" ] || [ -z "$REF_TEXT" ]; then
+            echo "zero_shot requires REF_AUDIO and REF_TEXT" >&2
+            exit 1
+        fi  # Values are passed explicitly below: un-exported shell defaults are invisible to os.environ.
+        MODEL="$MODEL" TEXT="$TEXT" REF_AUDIO="$REF_AUDIO" REF_TEXT="$REF_TEXT" python - <<'PY' > /tmp/ming_zero_shot_payload.json
+import base64
 import json
+import mimetypes
 import os
-payload = json.loads(os.environ["TEXT"])
-payload["language"] = "广粤话"
+from pathlib import Path
+
+path = Path(os.environ["REF_AUDIO"])
+mime_type = mimetypes.guess_type(path.name)[0] or "audio/wav"
+payload = {
+    "model": os.environ["MODEL"],
+    "input": os.environ["TEXT"],
+    "ref_audio": f"data:{mime_type};base64,{base64.b64encode(path.read_bytes()).decode('utf-8')}",
+    "ref_text": os.environ["REF_TEXT"],
+    "response_format": "wav",
+}
 print(json.dumps(payload, ensure_ascii=False))
 PY
-)"
-        post_payload "$PAYLOAD" "$OUTPUT"
-        ;;
-    zero_shot)
-        require_file "$REF_AUDIO" "REF_AUDIO"
-        if [ -z "$REF_TEXT" ]; then
-            echo "Missing REF_TEXT" >&2
-            exit 1
-        fi
-        PAYLOAD="$(build_payload "$MODEL" "$TEXT" "" "" "Base" "$REF_AUDIO" "$REF_TEXT" "" "false")"
-        post_payload "$PAYLOAD" "$OUTPUT"
-        ;;
-    podcast)
-        require_file "$REF_AUDIO" "REF_AUDIO"
-        require_file "$REF_AUDIO_2" "REF_AUDIO_2"
-        if [ -z "$REF_TEXT" ]; then
-            echo "Missing REF_TEXT" >&2
-            exit 1
-        fi
-        PAYLOAD="$(build_payload "$MODEL" "$TEXT" "" "" "Base" "$REF_AUDIO" "$REF_TEXT" "" "false" "$REF_AUDIO_2")"
-        post_payload "$PAYLOAD" "$OUTPUT"
-        ;;
-    speech_bgm)
-        require_file "$REF_AUDIO" "REF_AUDIO"
-        PAYLOAD="$(build_payload "$MODEL" "$TEXT" "" '{"BGM":"舒缓的背景音乐"}' "" "$REF_AUDIO" "" "" "false")"
-        post_payload "$PAYLOAD" "$OUTPUT"
-        ;;
-    speech_sound)
-        require_file "$REF_AUDIO" "REF_AUDIO"
-        PAYLOAD="$(build_payload "$MODEL" "$TEXT" "" '{"BGM":{"ENV":"轻微的环境声"}}' "" "$REF_AUDIO" "" "" "false")"
-        post_payload "$PAYLOAD" "$OUTPUT"
-        ;;
-    clone_ref_audio)
-        require_file "$REF_AUDIO" "REF_AUDIO"
-        if [ -z "$REF_TEXT" ]; then
-            echo "Missing REF_TEXT" >&2
-            exit 1
-        fi
-        PAYLOAD="$(build_payload "$MODEL" "$TEXT" "" "" "Base" "$REF_AUDIO" "$REF_TEXT" "" "false")"
-        post_payload "$PAYLOAD" "$OUTPUT"
-        ;;
-    clone_embedding)
-        require_file "$SPEAKER_EMBEDDING" "SPEAKER_EMBEDDING"
-        PAYLOAD="$(build_payload "$MODEL" "$TEXT" "" "" "Base" "" "" "$SPEAKER_EMBEDDING" "false")"
-        post_payload "$PAYLOAD" "$OUTPUT"
+        curl -X POST "$API_URL" \
+            -H "Content-Type: application/json" \
+            -H "Authorization: Bearer EMPTY" \
+            --data-binary @/tmp/ming_zero_shot_payload.json \
+            --output "$OUTPUT"
+        rm -f /tmp/ming_zero_shot_payload.json
         ;;
     stream)
-        PAYLOAD="$(build_payload "$MODEL" "$TEXT" "" "平静，普通话" "" "" "" "" "true")"
-        post_payload "$PAYLOAD" "$STREAM_OUTPUT"
+        post_json "{
+            \"model\": \"${MODEL}\",
+            \"input\": \"${TEXT}\",
+            \"stream\": true,
+            \"response_format\": \"pcm\"
+        }" "$STREAM_OUTPUT"
         ;;
     *)
         echo "Unknown mode: $MODE" >&2
-        echo "Supported: basic, style, ip, emotion, dialect, zero_shot, podcast, speech_bgm, speech_sound, clone_ref_audio, clone_embedding, stream" >&2
+        echo "Supported sanity checks: basic, zero_shot, stream" >&2
         exit 1
         ;;
 esac
diff --git a/examples/online_serving/ming_tts/run_server.sh b/examples/online_serving/ming_tts/run_server.sh
index a35d4abe512..ba35e13fd95 100755
--- a/examples/online_serving/ming_tts/run_server.sh
+++ b/examples/online_serving/ming_tts/run_server.sh
@@ -9,13 +9,13 @@ set -e
 
 MODEL="${MODEL:-inclusionAI/Ming-omni-tts-0.5B}"
 PORT="${PORT:-8091}"
-STAGE_CONFIG="${STAGE_CONFIG:-vllm_omni/model_executor/stage_configs/ming_tts_async_chunk.yaml}"
+DEPLOY_CONFIG="${DEPLOY_CONFIG:-vllm_omni/deploy/ming_tts.yaml}"
 
 echo "Starting Ming-omni-tts server with model: $MODEL"
-echo "Stage config: $STAGE_CONFIG"
+echo "Deploy config: $DEPLOY_CONFIG"
 
 vllm-omni serve "$MODEL" \
-    --stage-configs-path "$STAGE_CONFIG" \
+    --deploy-config "$DEPLOY_CONFIG" \
     --host 0.0.0.0 \
     --port "$PORT" \
     --enforce-eager \
diff --git a/tests/e2e/offline_inference/test_ming_tts.py b/tests/e2e/offline_inference/test_ming_tts.py
index 128b84e2896..d9e88050164 100644
--- a/tests/e2e/offline_inference/test_ming_tts.py
+++ b/tests/e2e/offline_inference/test_ming_tts.py
@@ -27,16 +27,7 @@
 from vllm_omni.model_executor.models.ming_tts.prompt_builder import build_ming_dense_prompt
 
 MODEL = "inclusionAI/Ming-omni-tts-0.5B"
-STAGE_CONFIG = str(
-    Path(__file__).parent.parent.parent.parent / "vllm_omni" / "model_executor" / "stage_configs" / "ming_tts.yaml"
-)
-STREAM_STAGE_CONFIG = str(
-    Path(__file__).parent.parent.parent.parent
-    / "vllm_omni"
-    / "model_executor"
-    / "stage_configs"
-    / "ming_tts_async_chunk.yaml"
-)
+DEPLOY_CONFIG = str(Path(__file__).parent.parent.parent.parent / "vllm_omni" / "deploy" / "ming_tts.yaml")
 TEST_TEXT = "我会一直在这里陪着你，直到你慢慢地沉入那个最温柔的梦里。"
 TEST_INSTRUCTION = "轻柔的ASMR耳语，慢速，贴近麦克风"
 MIN_AUDIO_SAMPLES = 1000
@@ -108,7 +99,7 @@ def test_ming_tts_offline_basic() -> None:
     """Test blocking Ming generation through Omni."""
     omni = Omni(
         model=MODEL,
-        stage_configs_path=STAGE_CONFIG,
+        deploy_config=DEPLOY_CONFIG,
         stage_init_timeout=300,
         enforce_eager=True,
     )
@@ -139,7 +130,7 @@ def test_ming_tts_speaker_conditioning_differs() -> None:
     """Test that different Ming speaker controls produce different waveform outputs."""
     omni = Omni(
         model=MODEL,
-        stage_configs_path=STAGE_CONFIG,
+        deploy_config=DEPLOY_CONFIG,
         stage_init_timeout=300,
         enforce_eager=True,
     )
@@ -185,7 +176,7 @@ def test_ming_tts_offline_streaming() -> None:
     async def _run() -> None:
         async_omni = AsyncOmni(
             model=MODEL,
-            stage_configs_path=STREAM_STAGE_CONFIG,
+            deploy_config=DEPLOY_CONFIG,
             stage_init_timeout=300,
             enforce_eager=True,
         )
diff --git a/tests/e2e/online_serving/test_ming_tts.py b/tests/e2e/online_serving/test_ming_tts.py
index 6b3e21c09bd..11d6bd56970 100644
--- a/tests/e2e/online_serving/test_ming_tts.py
+++ b/tests/e2e/online_serving/test_ming_tts.py
@@ -19,19 +19,13 @@
 from vllm_omni.model_executor.models.ming_tts.config_ming_tts import SAMPLE_RATE
 
 MODEL = "inclusionAI/Ming-omni-tts-0.5B"
-STAGE_CONFIG = str(
-    Path(__file__).parent.parent.parent.parent
-    / "vllm_omni"
-    / "model_executor"
-    / "stage_configs"
-    / "ming_tts_async_chunk.yaml"
-)
+DEPLOY_CONFIG = str(Path(__file__).parent.parent.parent.parent / "vllm_omni" / "deploy" / "ming_tts.yaml")
 
 SERVER_PARAMS = [
     pytest.param(
         OmniServerParams(
             model=MODEL,
-            stage_config_path=STAGE_CONFIG,
+            stage_config_path=DEPLOY_CONFIG,
             server_args=["--enforce-eager", "--disable-log-stats"],
         ),
         id="async_chunk",
diff --git a/vllm_omni/config/pipeline_registry.py b/vllm_omni/config/pipeline_registry.py
index 98aa132863c..ce20b9332d2 100644
--- a/vllm_omni/config/pipeline_registry.py
+++ b/vllm_omni/config/pipeline_registry.py
@@ -59,6 +59,10 @@
         "vllm_omni.model_executor.models.mimo_audio.pipeline",
         "MIMO_AUDIO_PIPELINE",
     ),
+    "ming_tts": (
+        "vllm_omni.model_executor.models.ming_tts.pipeline",
+        "MING_TTS_PIPELINE",
+    ),
     "voxtral_tts": (
         "vllm_omni.model_executor.models.voxtral_tts.pipeline",
         "VOXTRAL_TTS_PIPELINE",
diff --git a/vllm_omni/deploy/ming_tts.yaml b/vllm_omni/deploy/ming_tts.yaml
new file mode 100644
index 00000000000..0d498c5441a
--- /dev/null
+++ b/vllm_omni/deploy/ming_tts.yaml
@@ -0,0 +1,48 @@
+# Ming-omni-tts deploy: stage-0 LLM+flow -> stage-1 audio VAE.
+# Verified legacy settings migrated from ming_tts.yaml and ming_tts_async_chunk.yaml.
+# Default mode is async-chunk streaming on a single GPU.
+async_chunk: true
+trust_remote_code: false
+dtype: bfloat16
+
+connectors:
+  connector_of_shared_memory:
+    name: SharedMemoryConnector
+    extra:
+      latent_chunk_size: 25
+      latent_left_context: 0
+
+stages:
+  - stage_id: 0
+    max_num_seqs: 1
+    gpu_memory_utilization: 0.45
+    enforce_eager: true
+    async_scheduling: false
+    max_num_batched_tokens: 8192
+    max_model_len: 8192
+    devices: "0"
+    output_connectors:
+      to_stage_1: connector_of_shared_memory
+    default_sampling_params:
+      temperature: 0.0
+      top_p: 1.0
+      top_k: -1
+      max_tokens: 512
+      detokenize: true
+
+  - stage_id: 1
+    max_num_seqs: 1
+    gpu_memory_utilization: 0.25
+    enforce_eager: true
+    async_scheduling: false
+    max_num_batched_tokens: 8192
+    max_model_len: 8192
+    devices: "0"
+    input_connectors:
+      from_stage_0: connector_of_shared_memory
+    default_sampling_params:
+      temperature: 0.0
+      top_p: 1.0
+      top_k: -1
+      max_tokens: 1
+      detokenize: false
diff --git a/vllm_omni/model_executor/models/ming_tts/pipeline.py b/vllm_omni/model_executor/models/ming_tts/pipeline.py
new file mode 100644
index 00000000000..31ec98e5a05
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_tts/pipeline.py
@@ -0,0 +1,54 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Ming TTS pipeline: Stage-0 LLM+flow -> Stage-1 audio VAE."""
+
+from vllm_omni.config.stage_config import (
+    PipelineConfig,
+    StageExecutionType,
+    StagePipelineConfig,
+)
+
+_PROC = "vllm_omni.model_executor.stage_input_processors.ming_tts"
+
+MING_TTS_PIPELINE = PipelineConfig(
+    model_type="ming_tts",
+    model_arch="MingTTSForConditionalGeneration",
+    hf_architectures=("MingTTSForConditionalGeneration",),
+    stages=(
+        StagePipelineConfig(
+            stage_id=0,
+            model_stage="llm",
+            execution_type=StageExecutionType.LLM_AR,
+            input_sources=(),
+            owns_tokenizer=True,
+            hf_config_name="llm_config",
+            engine_output_type="latent",
+            async_chunk_process_next_stage_input_func=(f"{_PROC}.llm2audio_vae_async_chunk"),
+            sampling_constraints={
+                "temperature": 0.0,
+                "top_p": 1.0,
+                "top_k": -1,
+                "max_tokens": 512,
+                "detokenize": True,
+            },
+        ),
+        StagePipelineConfig(
+            stage_id=1,
+            model_stage="audio_vae",
+            execution_type=StageExecutionType.LLM_GENERATION,
+            input_sources=(0,),
+            final_output=True,
+            final_output_type="audio",
+            hf_config_name="llm_config",
+            engine_output_type="audio",
+            sync_process_input_func=f"{_PROC}.llm2audio_vae",
+            sampling_constraints={
+                "temperature": 0.0,
+                "top_p": 1.0,
+                "top_k": -1,
+                "max_tokens": 1,
+                "detokenize": False,
+            },
+        ),
+    ),
+)

From ac4fe0ae993847699b9184d9c2eab12a1c9b9ed5 Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Fri, 24 Apr 2026 00:06:04 +0530
Subject: [PATCH 07/54] Reuse shared speaker embedding loader

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 .../openai_api/test_serving_speech.py         | 19 ++++++-------
 .../models/ming_tts/speaker_extractor.py      | 27 ++++---------------
 2 files changed, 13 insertions(+), 33 deletions(-)

diff --git a/tests/entrypoints/openai_api/test_serving_speech.py b/tests/entrypoints/openai_api/test_serving_speech.py
index 816d87e592a..2f6ab5213d3 100644
--- a/tests/entrypoints/openai_api/test_serving_speech.py
+++ b/tests/entrypoints/openai_api/test_serving_speech.py
@@ -2270,7 +2270,7 @@ def test_build_ming_prompt_handles_multi_speaker_podcast_inputs(self, ming_speec
             speaker_embedding=[[0.1] * 192, [0.2] * 192],
         )
 
-        prompt = OmniOpenAIServingSpeech._build_ming_prompt(
+        prompt = OmniOpenAIServingSpeech._build_ming_dense_prompt(
             ming_speech_server,
             request,
             ref_audio_data=[
@@ -2311,7 +2311,7 @@ def _fake_build_ming_dense_prompt(*args, **kwargs):
             speaker_embedding=[[0.1] * 192, [0.2] * 192],
         )
 
-        OmniOpenAIServingSpeech._build_ming_prompt(
+        OmniOpenAIServingSpeech._build_ming_dense_prompt(
             ming_speech_server,
             request,
             ref_audio_data=[
@@ -2324,9 +2324,7 @@ def _fake_build_ming_dense_prompt(*args, **kwargs):
         assert captured["speaker_embedding"] == [[0.1] * 192, [0.2] * 192]
         assert captured["prompt_text"] == " speaker_1:参考一。\n speaker_2:参考二。\n"
 
-    def test_build_ming_prompt_uses_single_ref_audio_as_speaker_only_without_ref_text(
-        self, ming_speech_server, mocker: MockerFixture
-    ):
+    def test_build_ming_prompt_omits_prompt_waveform_without_ref_text(self, ming_speech_server, mocker: MockerFixture):
         captured = {}
 
         def _fake_build_ming_dense_prompt(*args, **kwargs):
@@ -2341,11 +2339,10 @@ def _fake_build_ming_dense_prompt(*args, **kwargs):
         request = OpenAICreateSpeechRequest(
             input="我竟然抢到了陈奕迅的演唱会门票！",
             ref_audio="data:audio/wav;base64,aaa",
-            speaker_embedding=[0.1] * 192,
             instructions='{"情感":"高兴"}',
         )
 
-        OmniOpenAIServingSpeech._build_ming_prompt(
+        OmniOpenAIServingSpeech._build_ming_dense_prompt(
             ming_speech_server,
             request,
             ref_audio_data=([0.1] * 10, 44100),
@@ -2353,7 +2350,7 @@ def _fake_build_ming_dense_prompt(*args, **kwargs):
 
         assert captured["prompt_waveform"] is None
         assert captured["prompt_text"] is None
-        assert captured["speaker_embedding"] == [0.1] * 192
+        assert captured["speaker_embedding"] is None
 
     def test_build_ming_prompt_keeps_single_ref_audio_waveform_with_ref_text(
         self, ming_speech_server, mocker: MockerFixture
@@ -2373,10 +2370,9 @@ def _fake_build_ming_dense_prompt(*args, **kwargs):
             input="我们的愿景是构建未来服务业的数字化基础设施。",
             ref_audio="data:audio/wav;base64,aaa",
             ref_text="在此奉劝大家别乱打美白针。",
-            speaker_embedding=[0.1] * 192,
         )
 
-        OmniOpenAIServingSpeech._build_ming_prompt(
+        OmniOpenAIServingSpeech._build_ming_dense_prompt(
             ming_speech_server,
             request,
             ref_audio_data=([0.1] * 10, 44100),
@@ -2384,7 +2380,7 @@ def _fake_build_ming_dense_prompt(*args, **kwargs):
 
         assert tuple(captured["prompt_waveform"].shape) == (1, 10)
         assert captured["prompt_text"] == "在此奉劝大家别乱打美白针。"
-        assert captured["speaker_embedding"] == [0.1] * 192
+        assert captured["speaker_embedding"] is None
 
     def test_prepare_speech_generation_sets_ming_stop_token(self, ming_speech_server):
         from vllm_omni.model_executor.models.ming_tts.config_ming_tts import TEXT_EOS_TOKEN_ID
@@ -2430,6 +2426,7 @@ def test_prepare_speech_generation_extracts_ming_single_ref_audio_speaker_embedd
             ref_audio="data:audio/wav;base64,aaa",
             instructions='{"情感":"高兴"}',
         )
+        ming_speech_server._max_instructions_length = 500
         ming_speech_server._resolve_ref_audio = AsyncMock(return_value=([0.1, 0.2], 44100))
         ming_speech_server._extract_ming_speaker_embeddings_from_ref_audio = mocker.MagicMock(
             return_value=[[0.3] * 192]
diff --git a/vllm_omni/model_executor/models/ming_tts/speaker_extractor.py b/vllm_omni/model_executor/models/ming_tts/speaker_extractor.py
index b32d302658c..97d5510da0e 100644
--- a/vllm_omni/model_executor/models/ming_tts/speaker_extractor.py
+++ b/vllm_omni/model_executor/models/ming_tts/speaker_extractor.py
@@ -3,10 +3,10 @@
 # Adopted from https://github.com/inclusionAI/Ming-omni-tts/blob/main/spkemb_extractor.py
 import os
 
-import onnxruntime
 import torch
 import torchaudio
-import torchaudio.compliance.kaldi as kaldi
+
+from vllm_omni.model_executor.models.ming_flash_omni.spk_embedding import SpkembExtractor
 
 
 def resolve_model_to_local_path(model):
@@ -25,15 +25,8 @@ def __init__(self, model, target_sr=16000):
         if not os.path.exists(campplus_path):
             raise RuntimeError(f"Missing Ming speaker extractor model: {campplus_path}")
 
-        options = onnxruntime.SessionOptions()
-        options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
-        options.intra_op_num_threads = 2
-        self.session = onnxruntime.InferenceSession(
-            campplus_path,
-            sess_options=options,
-            providers=["CPUExecutionProvider"],
-        )
         self.target_sr = int(target_sr)
+        self._core = SpkembExtractor(campplus_path, target_sr=self.target_sr)
 
     def extract_from_waveform(self, waveform, sample_rate):
         if not isinstance(waveform, torch.Tensor):
@@ -45,18 +38,8 @@ def extract_from_waveform(self, waveform, sample_rate):
         if int(sample_rate) != self.target_sr:
             tensor = torchaudio.transforms.Resample(orig_freq=int(sample_rate), new_freq=self.target_sr)(tensor)
 
-        feat = kaldi.fbank(
-            tensor,
-            num_mel_bins=80,
-            dither=0,
-            sample_frequency=self.target_sr,
-        )
-        feat = feat - feat.mean(dim=0, keepdim=True)
-        embedding = self.session.run(
-            None,
-            {self.session.get_inputs()[0].name: feat.unsqueeze(dim=0).cpu().numpy()},
-        )[0].flatten()
-        return torch.tensor(embedding, dtype=torch.float32)
+        embedding = self._core._extract_spk_embedding(tensor)
+        return embedding.squeeze(0).to(dtype=torch.float32)
 
     def extract_from_file(self, audio_path):
         waveform, sample_rate = torchaudio.load(audio_path)

From d1920a58efdf495226dddffdc2c0aa4246fccd56 Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Fri, 24 Apr 2026 00:24:19 +0530
Subject: [PATCH 08/54] fix: resolve F821 undefined name by adding raw_request
 to audio chunks signature

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 vllm_omni/entrypoints/openai/serving_speech.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py
index 93322ce4120..386779ed309 100644
--- a/vllm_omni/entrypoints/openai/serving_speech.py
+++ b/vllm_omni/entrypoints/openai/serving_speech.py
@@ -1438,7 +1438,9 @@ def _build_ming_dense_prompt(
             use_zero_spk_emb=use_zero_spk_emb,
         )
 
-    async def _generate_audio_chunks(self, generator, request_id: str, response_format: str = "pcm"):
+    async def _generate_audio_chunks(
+        self, generator, request_id: str, response_format: str = "pcm", raw_request: Request | None = None
+    ):
         """Generate audio chunks for streaming response.
 
         Handles two audio output modes from the engine:

From e8b97bd9ec30f2514bc2853472900c9566e38615 Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Fri, 24 Apr 2026 01:25:48 +0530
Subject: [PATCH 09/54] fix(ming_tts): update hf_architectures to match
 BailingMMNative architecture

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 vllm_omni/model_executor/models/ming_tts/pipeline.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_omni/model_executor/models/ming_tts/pipeline.py b/vllm_omni/model_executor/models/ming_tts/pipeline.py
index 31ec98e5a05..c431390645b 100644
--- a/vllm_omni/model_executor/models/ming_tts/pipeline.py
+++ b/vllm_omni/model_executor/models/ming_tts/pipeline.py
@@ -13,7 +13,7 @@
 MING_TTS_PIPELINE = PipelineConfig(
     model_type="ming_tts",
     model_arch="MingTTSForConditionalGeneration",
-    hf_architectures=("MingTTSForConditionalGeneration",),
+    hf_architectures=("BailingMMNativeForConditionalGeneration",),
     stages=(
         StagePipelineConfig(
             stage_id=0,

From 6b8f2c3a8de15cf30d53a5e5d90c0ead8f9ae68d Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Fri, 24 Apr 2026 01:36:39 +0530
Subject: [PATCH 10/54] fix(config): ensure DeployConfig.pipeline override is
 honored when auto-detection fails

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 vllm_omni/config/stage_config.py                 | 16 ++++++++++++++++
 vllm_omni/deploy/ming_tts.yaml                   |  4 ++++
 .../model_executor/models/ming_tts/pipeline.py   |  2 +-
 3 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/vllm_omni/config/stage_config.py b/vllm_omni/config/stage_config.py
index 6bd2faf7e6b..cc3b094857f 100644
--- a/vllm_omni/config/stage_config.py
+++ b/vllm_omni/config/stage_config.py
@@ -1029,6 +1029,22 @@ def create_from_model(
                             registered.model_type, cli_overrides, deploy_config_path, cli_explicit_keys
                         )
 
+        # --- Deploy config pipeline override ---
+        # Models that report a generic model_type (e.g. "dense") with no HF
+        # architectures cannot be matched by the paths above. If an explicit
+        # deploy config is provided and it carries a ``pipeline:`` key, use
+        # that to resolve the registry entry. This completes the intent of
+        # DeployConfig.pipeline ("overrides auto-detected pipeline registry
+        # key") for the case where auto-detection itself fails.
+        if deploy_config_path is not None:
+            _deploy_path = Path(deploy_config_path)
+            if _deploy_path.exists():
+                _deploy_cfg = load_deploy_config(_deploy_path)
+                if _deploy_cfg.pipeline and _deploy_cfg.pipeline in _PIPELINE_REGISTRY:
+                    return cls._create_from_registry(
+                        _deploy_cfg.pipeline, cli_overrides, deploy_config_path, cli_explicit_keys
+                    )
+
         # --- Legacy path: load from pipeline YAML ---
         pipeline = cls._load_pipeline(model, trust_remote_code=trust_remote_code)
 
diff --git a/vllm_omni/deploy/ming_tts.yaml b/vllm_omni/deploy/ming_tts.yaml
index 0d498c5441a..327f61a1785 100644
--- a/vllm_omni/deploy/ming_tts.yaml
+++ b/vllm_omni/deploy/ming_tts.yaml
@@ -1,6 +1,10 @@
 # Ming-omni-tts deploy: stage-0 LLM+flow -> stage-1 audio VAE.
 # Verified legacy settings migrated from ming_tts.yaml and ming_tts_async_chunk.yaml.
 # Default mode is async-chunk streaming on a single GPU.
+# The `pipeline` key below is required: the HF config reports model_type="dense"
+# with no architectures, so auto-detection falls through. The key routes the
+# deploy-config probe in StageConfigFactory.create_from_model to the ming_tts registry entry.
+pipeline: ming_tts
 async_chunk: true
 trust_remote_code: false
 dtype: bfloat16
diff --git a/vllm_omni/model_executor/models/ming_tts/pipeline.py b/vllm_omni/model_executor/models/ming_tts/pipeline.py
index c431390645b..31ec98e5a05 100644
--- a/vllm_omni/model_executor/models/ming_tts/pipeline.py
+++ b/vllm_omni/model_executor/models/ming_tts/pipeline.py
@@ -13,7 +13,7 @@
 MING_TTS_PIPELINE = PipelineConfig(
     model_type="ming_tts",
     model_arch="MingTTSForConditionalGeneration",
-    hf_architectures=("BailingMMNativeForConditionalGeneration",),
+    hf_architectures=("MingTTSForConditionalGeneration",),
     stages=(
         StagePipelineConfig(
             stage_id=0,

From d0a51e8e4811daa00df94915e51afccaf96b67e6 Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Fri, 24 Apr 2026 02:44:18 +0530
Subject: [PATCH 11/54] fix ming_tts offline runner truncating multi-chunk
 audio

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 .../offline_inference/ming_tts/_runner.py     | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/examples/offline_inference/ming_tts/_runner.py b/examples/offline_inference/ming_tts/_runner.py
index c5682bffaff..35852ce04ec 100644
--- a/examples/offline_inference/ming_tts/_runner.py
+++ b/examples/offline_inference/ming_tts/_runner.py
@@ -21,21 +21,14 @@
 
 def coerce_audio_tensor(audio, *, async_chunk: bool) -> torch.Tensor:
     if isinstance(audio, list):
-        if async_chunk:
-            parts = []
-            for item in audio:
-                tensor = torch.as_tensor(item, dtype=torch.float32).reshape(-1)
-                if tensor.numel() > 0:
-                    parts.append(tensor)
-            if not parts:
-                return torch.zeros((0,), dtype=torch.float32)
-            return torch.cat(parts, dim=0)
-
-        for item in reversed(audio):
+        parts = []
+        for item in audio:
             tensor = torch.as_tensor(item, dtype=torch.float32).reshape(-1)
             if tensor.numel() > 0:
-                return tensor
-        return torch.zeros((0,), dtype=torch.float32)
+                parts.append(tensor)
+        if not parts:
+            return torch.zeros((0,), dtype=torch.float32)
+        return torch.cat(parts, dim=0)
 
     return torch.as_tensor(audio, dtype=torch.float32).reshape(-1)
 

From 01055e308a92221a8bb6a0b2faa7496d394adeec Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Tue, 28 Apr 2026 19:03:39 +0530
Subject: [PATCH 12/54] docs: migrate Ming TTS docs to deploy config

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 .../examples/offline_inference/ming_tts.md    | 14 ++++-----
 .../examples/online_serving/ming_tts.md       |  2 +-
 examples/offline_inference/ming_tts/README.md | 10 ++----
 examples/online_serving/ming_tts/README.md    |  2 +-
 .../models/ming_tts/audio_tokenizer/istft.py  | 31 ++++++++++++++++---
 5 files changed, 37 insertions(+), 22 deletions(-)

diff --git a/docs/user_guide/examples/offline_inference/ming_tts.md b/docs/user_guide/examples/offline_inference/ming_tts.md
index 4a572873e42..6a51c7965f6 100644
--- a/docs/user_guide/examples/offline_inference/ming_tts.md
+++ b/docs/user_guide/examples/offline_inference/ming_tts.md
@@ -11,7 +11,7 @@ Run a zero-speaker style case:
 ```bash
 python examples/offline_inference/ming_tts/end2end.py \
     --case style \
-    --stage-configs-path vllm_omni/model_executor/stage_configs/ming_tts.yaml \
+    --deploy-config vllm_omni/deploy/ming_tts.yaml \
     --enforce-eager
 ```
 
@@ -21,7 +21,7 @@ Run emotion-controlled speech:
 python examples/offline_inference/ming_tts/end2end.py \
     --case emotion \
     --ref-audio /path/to/emotion_prompt.wav \
-    --stage-configs-path vllm_omni/model_executor/stage_configs/ming_tts.yaml \
+    --deploy-config vllm_omni/deploy/ming_tts.yaml \
     --enforce-eager
 ```
 
@@ -32,7 +32,7 @@ python examples/offline_inference/ming_tts/end2end.py \
     --case zero_shot \
     --ref-audio /path/to/reference.wav \
     --ref-text "在此奉劝大家别乱打美白针。" \
-    --stage-configs-path vllm_omni/model_executor/stage_configs/ming_tts.yaml \
+    --deploy-config vllm_omni/deploy/ming_tts.yaml \
     --enforce-eager
 ```
 
@@ -42,7 +42,7 @@ Run podcast generation:
 python examples/offline_inference/ming_tts/end2end.py \
     --case podcast \
     --ref-audio-paths /path/to/CTS-CN-F2F-2019-11-11-423-012-A.wav /path/to/CTS-CN-F2F-2019-11-11-423-012-B.wav \
-    --stage-configs-path vllm_omni/model_executor/stage_configs/ming_tts.yaml \
+    --deploy-config vllm_omni/deploy/ming_tts.yaml \
     --enforce-eager
 ```
 
@@ -51,7 +51,7 @@ Run text-to-audio event generation:
 ```bash
 python examples/offline_inference/ming_tts/end2end.py \
     --case tta \
-    --stage-configs-path vllm_omni/model_executor/stage_configs/ming_tts.yaml \
+    --deploy-config vllm_omni/deploy/ming_tts.yaml \
     --enforce-eager
 ```
 
@@ -60,7 +60,7 @@ Run with stats and a manifest:
 ```bash
 python examples/offline_inference/ming_tts/end2end.py \
     --case style \
-    --stage-configs-path vllm_omni/model_executor/stage_configs/ming_tts.yaml \
+    --deploy-config vllm_omni/deploy/ming_tts.yaml \
     --enforce-eager \
     --enable-stats \
     --stats-log-file output_audio/ming_style_pipeline.log \
@@ -90,7 +90,7 @@ python examples/offline_inference/ming_tts/end2end.py \
     --case basic \
     --ref-audio /path/to/10002287-00000095.wav \
     --streaming \
-    --stage-configs-path vllm_omni/model_executor/stage_configs/ming_tts_async_chunk.yaml \
+    --deploy-config vllm_omni/deploy/ming_tts.yaml \
     --enforce-eager
 ```
 
diff --git a/docs/user_guide/examples/online_serving/ming_tts.md b/docs/user_guide/examples/online_serving/ming_tts.md
index dd16a732106..a011bddbd58 100644
--- a/docs/user_guide/examples/online_serving/ming_tts.md
+++ b/docs/user_guide/examples/online_serving/ming_tts.md
@@ -12,7 +12,7 @@ Please refer to [README.md](https://github.com/vllm-project/vllm-omni/tree/main/
 
 ```bash
 vllm-omni serve inclusionAI/Ming-omni-tts-0.5B \
-    --stage-configs-path vllm_omni/model_executor/stage_configs/ming_tts_async_chunk.yaml \
+    --deploy-config vllm_omni/deploy/ming_tts.yaml \
     --omni \
     --port 8091 \
     --enforce-eager
diff --git a/examples/offline_inference/ming_tts/README.md b/examples/offline_inference/ming_tts/README.md
index 2ae5906de79..0c296620acd 100644
--- a/examples/offline_inference/ming_tts/README.md
+++ b/examples/offline_inference/ming_tts/README.md
@@ -19,7 +19,7 @@ Ming dense 0.5B is exposed here as a two-stage offline pipeline:
 
 The example supports both:
 
-- **Sequential eager** via `vllm_omni/deploy/ming_tts.yaml` with `--no-async-chunk`
+- **Blocking eager** via `vllm_omni/deploy/ming_tts.yaml`
 - **Async chunk eager** via `vllm_omni/deploy/ming_tts.yaml` (default `async_chunk: true`)
 
 ## Setup
@@ -59,7 +59,6 @@ Run the zero-speaker style example:
 python examples/offline_inference/ming_tts/end2end.py \
     --case style \
     --deploy-config vllm_omni/deploy/ming_tts.yaml \
-    --no-async-chunk \
     --enforce-eager
 ```
 
@@ -71,7 +70,6 @@ python examples/offline_inference/ming_tts/end2end.py \
     --ref-audio /path/to/10002287-00000094.wav \
     --ref-text "在此奉劝大家别乱打美白针。" \
     --deploy-config vllm_omni/deploy/ming_tts.yaml \
-    --no-async-chunk \
     --enforce-eager
 ```
 
@@ -82,7 +80,6 @@ python examples/offline_inference/ming_tts/end2end.py \
     --case emotion \
     --ref-audio /path/to/emotion_prompt.wav \
     --deploy-config vllm_omni/deploy/ming_tts.yaml \
-    --no-async-chunk \
     --enforce-eager
 ```
 
@@ -93,7 +90,6 @@ python examples/offline_inference/ming_tts/end2end.py \
     --case podcast \
     --ref-audio-paths /path/to/CTS-CN-F2F-2019-11-11-423-012-A.wav /path/to/CTS-CN-F2F-2019-11-11-423-012-B.wav \
     --deploy-config vllm_omni/deploy/ming_tts.yaml \
-    --no-async-chunk \
     --enforce-eager
 ```
 
@@ -113,7 +109,6 @@ Run text-to-audio event generation:
 python examples/offline_inference/ming_tts/end2end.py \
     --case tta \
     --deploy-config vllm_omni/deploy/ming_tts.yaml \
-    --no-async-chunk \
     --enforce-eager
 ```
 
@@ -138,7 +133,6 @@ Collect runtime stats and a manifest:
 python examples/offline_inference/ming_tts/end2end.py \
     --case style \
     --deploy-config vllm_omni/deploy/ming_tts.yaml \
-    --no-async-chunk \
     --enforce-eager \
     --enable-stats \
     --stats-log-file output_audio/ming_style_pipeline.log \
@@ -161,7 +155,7 @@ The upstream Ming cookbook uses these public audio fixtures from `inclusionAI/Mi
 The repo-facing example is intended to cover the same dense TTS workflows used
 by the local Ming validation script:
 
-| Case | Blocking `ming_tts.yaml` | Async chunk `deploy/ming_tts.yaml` | Extra inputs |
+| Case | Blocking `deploy/ming_tts.yaml` | Async chunk `deploy/ming_tts.yaml` | Extra inputs |
 |---|---:|---:|---|
 | `style` | Yes | Optional smoke test | none |
 | `ip` | Yes | Optional smoke test | none |
diff --git a/examples/online_serving/ming_tts/README.md b/examples/online_serving/ming_tts/README.md
index f61b068bebf..2717d62ca87 100644
--- a/examples/online_serving/ming_tts/README.md
+++ b/examples/online_serving/ming_tts/README.md
@@ -4,7 +4,7 @@
 
 Please refer to [README.md](../../../README.md)
 
-## Model
+## Ming Model
 
 | Model | Description |
 |-------|-------------|
diff --git a/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/istft.py b/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/istft.py
index c365381c87f..6824de74140 100644
--- a/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/istft.py
+++ b/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/istft.py
@@ -36,7 +36,14 @@ def __init__(self, n_fft: int, hop_length: int, win_length: int, padding: str =
         self.window_buffer = None
         self.buffer_len = self.win_length - self.hop_length
 
-    def __buffer_process(self, x, buffer, pad, last_chunk=False, streaming=False):
+    def __buffer_process(
+        self,
+        x: torch.Tensor,
+        buffer: torch.Tensor | None,
+        pad: int,
+        last_chunk: bool = False,
+        streaming: bool = False,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         if streaming:
             if buffer is None:
                 # first chunk
@@ -54,7 +61,14 @@ def __buffer_process(self, x, buffer, pad, last_chunk=False, streaming=False):
 
         return x, buffer
 
-    def forward(self, spec: torch.Tensor, audio_buffer=None, window_buffer=None, streaming=False, last_chunk=False):
+    def forward(
+        self,
+        spec: torch.Tensor,
+        audio_buffer: torch.Tensor | None = None,
+        window_buffer: torch.Tensor | None = None,
+        streaming: bool = False,
+        last_chunk: bool = False,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor | None, torch.Tensor | None]:
         """
         Compute the Inverse Short Time Fourier Transform (ISTFT) of a complex spectrogram.
 
@@ -68,7 +82,7 @@ def forward(self, spec: torch.Tensor, audio_buffer=None, window_buffer=None, str
             last_chunk: When `streaming=True` and `last_chunk=True`, the function can perform final "flush" operations
 
         Returns:
-            Tensor: Reconstructed time-domain signal of shape (B, L), where L is the length of the output signal.
+            Reconstructed signal, plus streaming buffers when `padding="same"`.
         """
         if self.padding == "center":
             # Fallback to pytorch native implementation
@@ -156,7 +170,14 @@ def __init__(self, dim: int, n_fft: int, hop_length: int, padding: str = "same")
         self.out = torch.nn.Linear(dim, out_dim)
         self.istft = ISTFT(n_fft=n_fft, hop_length=hop_length, win_length=n_fft, padding=padding)
 
-    def forward(self, x: torch.Tensor, audio_buffer=None, window_buffer=None, streaming=False, last_chunk=False):
+    def forward(
+        self,
+        x: torch.Tensor,
+        audio_buffer: torch.Tensor | None = None,
+        window_buffer: torch.Tensor | None = None,
+        streaming: bool = False,
+        last_chunk: bool = False,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None, torch.Tensor | None]:
         """
         Forward pass of the ISTFTHead module.
 
@@ -165,7 +186,7 @@ def forward(self, x: torch.Tensor, audio_buffer=None, window_buffer=None, stream
                         L is the sequence length, and H denotes the model dimension.
 
         Returns:
-            Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal.
+            Audio, predicted spectrogram coefficients, and streaming buffers.
         """
         x_pred = self.out(x)
         # x_pred = x

From 169800b4b55c39bebc1b2b706965888650c6bfe6 Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Tue, 28 Apr 2026 19:46:26 +0530
Subject: [PATCH 13/54] fix incorrect import path for OmniServerParams

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 tests/e2e/online_serving/test_ming_tts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/e2e/online_serving/test_ming_tts.py b/tests/e2e/online_serving/test_ming_tts.py
index 11d6bd56970..cc8bcd1db7a 100644
--- a/tests/e2e/online_serving/test_ming_tts.py
+++ b/tests/e2e/online_serving/test_ming_tts.py
@@ -14,7 +14,7 @@
 
 import pytest
 
-from tests.conftest import OmniServerParams
+from tests.helpers.runtime import OmniServerParams
 from tests.utils import hardware_test
 from vllm_omni.model_executor.models.ming_tts.config_ming_tts import SAMPLE_RATE
 

From b4187e338e3c8619960534682a0636a4cc663a26 Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Tue, 28 Apr 2026 20:02:44 +0530
Subject: [PATCH 14/54] tests: align Ming TTS offline coverage

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 tests/e2e/offline_inference/test_ming_tts.py | 212 ++++++++++++-------
 1 file changed, 138 insertions(+), 74 deletions(-)

diff --git a/tests/e2e/offline_inference/test_ming_tts.py b/tests/e2e/offline_inference/test_ming_tts.py
index d9e88050164..83a33748704 100644
--- a/tests/e2e/offline_inference/test_ming_tts.py
+++ b/tests/e2e/offline_inference/test_ming_tts.py
@@ -4,12 +4,7 @@
 """End-to-end offline inference tests for Ming-omni-tts."""
 
 import asyncio
-import os
 import uuid
-from pathlib import Path
-
-os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
-os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1"
 
 import numpy as np
 import pytest
@@ -17,8 +12,10 @@
 from transformers import AutoTokenizer
 from vllm import SamplingParams
 
-from tests.utils import hardware_test
-from vllm_omni import AsyncOmni, Omni
+from tests.helpers.mark import hardware_test
+from tests.helpers.runtime import OmniRunner
+from tests.helpers.stage_config import get_deploy_config_path
+from vllm_omni import AsyncOmni
 from vllm_omni.model_executor.models.ming_tts.config_ming_tts import (
     KEY_MAX_DECODE_STEPS,
     SAMPLE_RATE,
@@ -27,19 +24,35 @@
 from vllm_omni.model_executor.models.ming_tts.prompt_builder import build_ming_dense_prompt
 
 MODEL = "inclusionAI/Ming-omni-tts-0.5B"
-DEPLOY_CONFIG = str(Path(__file__).parent.parent.parent.parent / "vllm_omni" / "deploy" / "ming_tts.yaml")
+DEPLOY_CONFIG = get_deploy_config_path("ming_tts.yaml")
 TEST_TEXT = "我会一直在这里陪着你，直到你慢慢地沉入那个最温柔的梦里。"
 TEST_INSTRUCTION = "轻柔的ASMR耳语，慢速，贴近麦克风"
 MIN_AUDIO_SAMPLES = 1000
 
 
+@pytest.fixture(scope="module")
+def ming_tokenizer():
+    return AutoTokenizer.from_pretrained(MODEL, trust_remote_code=False)
+
+
+@pytest.fixture(scope="module")
+def ming_engine():
+    with OmniRunner(
+        MODEL,
+        deploy_config=DEPLOY_CONFIG,
+        stage_init_timeout=300,
+        enforce_eager=True,
+    ) as runner:
+        yield runner.omni
+
+
 def _build_prompt(
+    tokenizer,
     *,
     text: str = TEST_TEXT,
     instruction=TEST_INSTRUCTION,
     use_zero_spk_emb: bool = True,
 ) -> dict:
-    tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=False)
     return build_ming_dense_prompt(
         tokenizer,
         prompt="Please generate speech based on the following description.\n",
@@ -72,9 +85,9 @@ def _flatten_audio(audio) -> torch.Tensor:
 
 
 def _extract_audio(multimodal_output: dict) -> torch.Tensor:
-    audio = multimodal_output.get("audio")
-    if audio is None:
-        raise RuntimeError("Expected multimodal_output['audio']")
+    assert isinstance(multimodal_output, dict), f"Expected dict, got {type(multimodal_output)}"
+    audio = multimodal_output.get("audio", multimodal_output.get("model_outputs"))
+    assert audio is not None, f"No audio output found, keys={list(multimodal_output.keys())}"
     waveform = _flatten_audio(audio)
     if waveform.numel() == 0:
         raise RuntimeError("Generated audio waveform is empty")
@@ -92,85 +105,136 @@ def _extract_sample_rate(multimodal_output: dict) -> int:
     return int(sample_rate)
 
 
+def _extract_final_audio_outputs(outputs):
+    final_outputs = []
+    for item in outputs:
+        if getattr(item, "final_output_type", None) == "audio":
+            final_outputs.append(item)
+            continue
+        request_output = getattr(item, "request_output", None)
+        if request_output is None:
+            continue
+        multimodal_output = getattr(request_output, "multimodal_output", None)
+        if isinstance(multimodal_output, dict):
+            final_outputs.append(item)
+            continue
+        completions = getattr(request_output, "outputs", None) or []
+        if any(isinstance(getattr(completion, "multimodal_output", None), dict) for completion in completions):
+            final_outputs.append(item)
+    return final_outputs
+
+
+def _extract_multimodal_output(output) -> dict:
+    multimodal_output = getattr(output, "multimodal_output", None)
+    if isinstance(multimodal_output, dict):
+        return multimodal_output
+
+    request_output = getattr(output, "request_output", None)
+    if request_output is not None:
+        multimodal_output = getattr(request_output, "multimodal_output", None)
+        if isinstance(multimodal_output, dict):
+            return multimodal_output
+        completions = getattr(request_output, "outputs", None) or []
+        for completion in completions:
+            multimodal_output = getattr(completion, "multimodal_output", None)
+            if isinstance(multimodal_output, dict):
+                return multimodal_output
+
+    raise AssertionError("No multimodal audio output found in Ming generate results")
+
+
 @pytest.mark.advanced_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "L4"}, num_cards=1)
-def test_ming_tts_offline_basic() -> None:
+def test_ming_tts_offline_basic(ming_engine, ming_tokenizer) -> None:
     """Test blocking Ming generation through Omni."""
-    omni = Omni(
-        model=MODEL,
-        deploy_config=DEPLOY_CONFIG,
-        stage_init_timeout=300,
-        enforce_eager=True,
+    outputs = ming_engine.generate(
+        prompts=[_build_prompt(ming_tokenizer)],
+        sampling_params_list=_sampling_params_list(),
+        py_generator=False,
     )
-    try:
-        outputs = omni.generate(
-            prompts=[_build_prompt()],
-            sampling_params_list=_sampling_params_list(),
-            py_generator=False,
-        )
-        final_output = next((item for item in outputs if item.final_output_type == "audio"), None)
-        assert final_output is not None, "No final audio output produced"
-        multimodal_output = final_output.multimodal_output or {}
-        waveform = _extract_audio(multimodal_output)
-        sample_rate = _extract_sample_rate(multimodal_output)
-        assert waveform.ndim == 1
-        assert waveform.shape[0] == waveform.numel()
-        assert waveform.numel() > MIN_AUDIO_SAMPLES
-        assert np.max(np.abs(waveform.numpy())) > 0.01, "Audio appears silent"
-        assert sample_rate == SAMPLE_RATE, f"Expected Ming output sample rate {SAMPLE_RATE}, got {sample_rate}"
-    finally:
-        omni.close()
+    final_outputs = _extract_final_audio_outputs(outputs)
+    assert len(final_outputs) == 1, f"Expected one final audio output, got {len(final_outputs)}"
+    multimodal_output = _extract_multimodal_output(final_outputs[0])
+    waveform = _extract_audio(multimodal_output)
+    sample_rate = _extract_sample_rate(multimodal_output)
+    assert waveform.ndim == 1
+    assert waveform.shape[0] == waveform.numel()
+    assert waveform.numel() > MIN_AUDIO_SAMPLES
+    assert np.max(np.abs(waveform.numpy())) > 0.01, "Audio appears silent"
+    assert sample_rate == SAMPLE_RATE, f"Expected Ming output sample rate {SAMPLE_RATE}, got {sample_rate}"
 
 
 @pytest.mark.advanced_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "L4"}, num_cards=1)
-def test_ming_tts_speaker_conditioning_differs() -> None:
+def test_ming_tts_speaker_conditioning_differs(ming_engine, ming_tokenizer) -> None:
     """Test that different Ming speaker controls produce different waveform outputs."""
-    omni = Omni(
-        model=MODEL,
-        deploy_config=DEPLOY_CONFIG,
-        stage_init_timeout=300,
-        enforce_eager=True,
+    style_outputs = ming_engine.generate(
+        prompts=[_build_prompt(ming_tokenizer)],
+        sampling_params_list=_sampling_params_list(),
+        py_generator=False,
+    )
+    ip_outputs = ming_engine.generate(
+        prompts=[_build_prompt(ming_tokenizer, text=TEST_TEXT, instruction={"IP": "灵小甄"}, use_zero_spk_emb=True)],
+        sampling_params_list=_sampling_params_list(),
+        py_generator=False,
     )
-    try:
-        style_outputs = omni.generate(
-            prompts=[_build_prompt()],
-            sampling_params_list=_sampling_params_list(),
-            py_generator=False,
-        )
-        ip_outputs = omni.generate(
-            prompts=[_build_prompt(text=TEST_TEXT, instruction={"IP": "灵小甄"}, use_zero_spk_emb=True)],
-            sampling_params_list=_sampling_params_list(),
-            py_generator=False,
-        )
 
-        style_final_output = next((item for item in style_outputs if item.final_output_type == "audio"), None)
-        ip_final_output = next((item for item in ip_outputs if item.final_output_type == "audio"), None)
-        assert style_final_output is not None, "No style audio output produced"
-        assert ip_final_output is not None, "No IP audio output produced"
-
-        style_waveform = _extract_audio(style_final_output.multimodal_output or {})
-        ip_waveform = _extract_audio(ip_final_output.multimodal_output or {})
-        assert style_waveform.numel() > MIN_AUDIO_SAMPLES
-        assert ip_waveform.numel() > MIN_AUDIO_SAMPLES
-        assert np.max(np.abs(style_waveform.numpy())) > 0.01, "Style audio appears silent"
-        assert np.max(np.abs(ip_waveform.numpy())) > 0.01, "IP audio appears silent"
-
-        overlap = min(int(style_waveform.numel()), int(ip_waveform.numel()))
-        mean_abs_diff = torch.mean(torch.abs(style_waveform[:overlap] - ip_waveform[:overlap])).item()
-        assert style_waveform.shape != ip_waveform.shape or mean_abs_diff > 1e-4, (
-            "Speaker-conditioned outputs should differ, but style and IP waveforms were effectively identical"
-        )
-    finally:
-        omni.close()
+    style_final_outputs = _extract_final_audio_outputs(style_outputs)
+    ip_final_outputs = _extract_final_audio_outputs(ip_outputs)
+    assert len(style_final_outputs) == 1, "No style audio output produced"
+    assert len(ip_final_outputs) == 1, "No IP audio output produced"
+
+    style_waveform = _extract_audio(_extract_multimodal_output(style_final_outputs[0]))
+    ip_waveform = _extract_audio(_extract_multimodal_output(ip_final_outputs[0]))
+    assert style_waveform.numel() > MIN_AUDIO_SAMPLES
+    assert ip_waveform.numel() > MIN_AUDIO_SAMPLES
+    assert np.max(np.abs(style_waveform.numpy())) > 0.01, "Style audio appears silent"
+    assert np.max(np.abs(ip_waveform.numpy())) > 0.01, "IP audio appears silent"
+
+    overlap = min(int(style_waveform.numel()), int(ip_waveform.numel()))
+    mean_abs_diff = torch.mean(torch.abs(style_waveform[:overlap] - ip_waveform[:overlap])).item()
+    assert style_waveform.shape != ip_waveform.shape or mean_abs_diff > 1e-4, (
+        "Speaker-conditioned outputs should differ, but style and IP waveforms were effectively identical"
+    )
+
+
+@pytest.mark.advanced_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "L4"}, num_cards=1)
+def test_ming_tts_multiple_prompts_queued(ming_engine, ming_tokenizer) -> None:
+    """Regression: supported max_num_seqs=1 config must still drain queued prompts."""
+    prompts = [
+        _build_prompt(
+            ming_tokenizer,
+            text="第一条语音用于验证 Ming dense 队列中的长请求可以完成。",
+            instruction="平静自然的旁白，语速中等",
+        ),
+        _build_prompt(
+            ming_tokenizer,
+            text="第二条短语音也必须生成完成。",
+            instruction="清晰明亮的女声",
+        ),
+    ]
+    outputs = ming_engine.generate(
+        prompts=prompts,
+        sampling_params_list=_sampling_params_list(),
+        py_generator=False,
+    )
+    final_outputs = _extract_final_audio_outputs(outputs)
+    assert len(final_outputs) == len(prompts), f"Expected {len(prompts)} audio outputs, got {len(final_outputs)}"
+    for i, output in enumerate(final_outputs):
+        waveform = _extract_audio(_extract_multimodal_output(output))
+        duration_s = waveform.numel() / SAMPLE_RATE
+        assert 0.1 < duration_s < 30.0, f"Request {i} audio duration out of range: {duration_s:.2f}s"
+        assert np.max(np.abs(waveform.numpy())) > 0.01, f"Request {i} audio appears silent"
 
 
 @pytest.mark.advanced_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "L4"}, num_cards=1)
-def test_ming_tts_offline_streaming() -> None:
+def test_ming_tts_offline_streaming(ming_tokenizer) -> None:
     """Test async_chunk streaming Ming generation through AsyncOmni."""
 
     async def _run() -> None:
@@ -186,7 +250,7 @@ async def _run() -> None:
             chunk_idx = 0
             sample_rate = None
             async for stage_output in async_omni.generate(
-                prompt=_build_prompt(),
+                prompt=_build_prompt(ming_tokenizer),
                 request_id=str(uuid.uuid4()),
                 sampling_params_list=_sampling_params_list(),
             ):

From 5aeb88cf2ccf1e0cab0bd516eb65c84c0b585ffd Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Tue, 28 Apr 2026 20:12:05 +0530
Subject: [PATCH 15/54] tests: fix Ming TTS online imports

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 tests/e2e/online_serving/test_ming_tts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/e2e/online_serving/test_ming_tts.py b/tests/e2e/online_serving/test_ming_tts.py
index cc8bcd1db7a..28c04cc74b1 100644
--- a/tests/e2e/online_serving/test_ming_tts.py
+++ b/tests/e2e/online_serving/test_ming_tts.py
@@ -14,8 +14,8 @@
 
 import pytest
 
+from tests.helpers.mark import hardware_test
 from tests.helpers.runtime import OmniServerParams
-from tests.utils import hardware_test
 from vllm_omni.model_executor.models.ming_tts.config_ming_tts import SAMPLE_RATE
 
 MODEL = "inclusionAI/Ming-omni-tts-0.5B"

From dc2ea22be6f43cd2ff4f24f2ee06fc279aac130c Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Tue, 28 Apr 2026 21:33:03 +0530
Subject: [PATCH 16/54] test(ming_tts): fix L3 runtime and deploy config path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

   Offline: promote AsyncOmni to a module-scoped fixture so the streaming
   test shares the engine init with the other three tests instead of
   paying for a fresh two-stage load each run (~30 min → ~15 min on L4).
   Also cleans up the inline try/finally that the fixture teardown now
   handles.

   Online: replace four-level Path(__file__).parent chain with
   get_deploy_config_path("ming_tts.yaml"), matching the convention used
   by cosyvoice3 and moss_tts_nano. Drops the now-unused pathlib import.

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 tests/e2e/offline_inference/test_ming_tts.py | 91 ++++++++++----------
 tests/e2e/online_serving/test_ming_tts.py    |  4 +-
 2 files changed, 49 insertions(+), 46 deletions(-)

diff --git a/tests/e2e/offline_inference/test_ming_tts.py b/tests/e2e/offline_inference/test_ming_tts.py
index 83a33748704..9672dcdae1d 100644
--- a/tests/e2e/offline_inference/test_ming_tts.py
+++ b/tests/e2e/offline_inference/test_ming_tts.py
@@ -46,6 +46,18 @@ def ming_engine():
         yield runner.omni
 
 
+@pytest.fixture(scope="module")
+def async_omni_engine():
+    engine = AsyncOmni(
+        model=MODEL,
+        deploy_config=DEPLOY_CONFIG,
+        stage_init_timeout=300,
+        enforce_eager=True,
+    )
+    yield engine
+    engine.shutdown()
+
+
 def _build_prompt(
     tokenizer,
     *,
@@ -234,53 +246,44 @@ def test_ming_tts_multiple_prompts_queued(ming_engine, ming_tokenizer) -> None:
 @pytest.mark.advanced_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "L4"}, num_cards=1)
-def test_ming_tts_offline_streaming(ming_tokenizer) -> None:
+def test_ming_tts_offline_streaming(async_omni_engine, ming_tokenizer) -> None:
     """Test async_chunk streaming Ming generation through AsyncOmni."""
 
     async def _run() -> None:
-        async_omni = AsyncOmni(
-            model=MODEL,
-            deploy_config=DEPLOY_CONFIG,
-            stage_init_timeout=300,
-            enforce_eager=True,
-        )
-        try:
-            all_audio_chunks = []
-            accumulated_samples = 0
-            chunk_idx = 0
-            sample_rate = None
-            async for stage_output in async_omni.generate(
-                prompt=_build_prompt(ming_tokenizer),
-                request_id=str(uuid.uuid4()),
-                sampling_params_list=_sampling_params_list(),
-            ):
-                multimodal_output = stage_output.multimodal_output or {}
-                audio = multimodal_output.get("audio")
-                if "sr" in multimodal_output:
-                    sample_rate = _extract_sample_rate(multimodal_output)
-                if audio is None:
-                    continue
-                finished = stage_output.finished
-                if isinstance(audio, torch.Tensor):
-                    if finished:
-                        audio_chunk = audio[accumulated_samples:].float().detach().cpu()
-                    else:
-                        audio_chunk = audio.float().detach().cpu()
-                elif isinstance(audio, list):
-                    audio_chunk = torch.as_tensor(audio[chunk_idx], dtype=torch.float32).reshape(-1).cpu()
+        all_audio_chunks = []
+        accumulated_samples = 0
+        chunk_idx = 0
+        sample_rate = None
+        async for stage_output in async_omni_engine.generate(
+            prompt=_build_prompt(ming_tokenizer),
+            request_id=str(uuid.uuid4()),
+            sampling_params_list=_sampling_params_list(),
+        ):
+            multimodal_output = stage_output.multimodal_output or {}
+            audio = multimodal_output.get("audio")
+            if "sr" in multimodal_output:
+                sample_rate = _extract_sample_rate(multimodal_output)
+            if audio is None:
+                continue
+            finished = stage_output.finished
+            if isinstance(audio, torch.Tensor):
+                if finished:
+                    audio_chunk = audio[accumulated_samples:].float().detach().cpu()
                 else:
-                    audio_chunk = torch.as_tensor(audio, dtype=torch.float32).reshape(-1).cpu()
-                accumulated_samples += int(audio_chunk.numel())
-                chunk_idx += 1
-                if audio_chunk.numel() > 0:
-                    all_audio_chunks.append(audio_chunk)
-            assert all_audio_chunks, "No streaming audio chunks received"
-            waveform = torch.cat(all_audio_chunks, dim=0)
-            assert waveform.numel() > MIN_AUDIO_SAMPLES
-            assert np.max(np.abs(waveform.numpy())) > 0.01, "Audio appears silent"
-            assert sample_rate is not None, "Streaming path did not return a sample rate"
-            assert sample_rate == SAMPLE_RATE, f"Expected Ming output sample rate {SAMPLE_RATE}, got {sample_rate}"
-        finally:
-            async_omni.shutdown()
+                    audio_chunk = audio.float().detach().cpu()
+            elif isinstance(audio, list):
+                audio_chunk = torch.as_tensor(audio[chunk_idx], dtype=torch.float32).reshape(-1).cpu()
+            else:
+                audio_chunk = torch.as_tensor(audio, dtype=torch.float32).reshape(-1).cpu()
+            accumulated_samples += int(audio_chunk.numel())
+            chunk_idx += 1
+            if audio_chunk.numel() > 0:
+                all_audio_chunks.append(audio_chunk)
+        assert all_audio_chunks, "No streaming audio chunks received"
+        waveform = torch.cat(all_audio_chunks, dim=0)
+        assert waveform.numel() > MIN_AUDIO_SAMPLES
+        assert np.max(np.abs(waveform.numpy())) > 0.01, "Audio appears silent"
+        assert sample_rate is not None, "Streaming path did not return a sample rate"
+        assert sample_rate == SAMPLE_RATE, f"Expected Ming output sample rate {SAMPLE_RATE}, got {sample_rate}"
 
     asyncio.run(_run())
diff --git a/tests/e2e/online_serving/test_ming_tts.py b/tests/e2e/online_serving/test_ming_tts.py
index 28c04cc74b1..4204d4c5462 100644
--- a/tests/e2e/online_serving/test_ming_tts.py
+++ b/tests/e2e/online_serving/test_ming_tts.py
@@ -7,7 +7,6 @@
 import io
 import os
 import wave
-from pathlib import Path
 
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0"
@@ -16,10 +15,11 @@
 
 from tests.helpers.mark import hardware_test
 from tests.helpers.runtime import OmniServerParams
+from tests.helpers.stage_config import get_deploy_config_path
 from vllm_omni.model_executor.models.ming_tts.config_ming_tts import SAMPLE_RATE
 
 MODEL = "inclusionAI/Ming-omni-tts-0.5B"
-DEPLOY_CONFIG = str(Path(__file__).parent.parent.parent.parent / "vllm_omni" / "deploy" / "ming_tts.yaml")
+DEPLOY_CONFIG = get_deploy_config_path("ming_tts.yaml")
 
 SERVER_PARAMS = [
     pytest.param(

From a781827c649a28e4d8e7c83b8ffd161cfa8f4990 Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Tue, 28 Apr 2026 23:01:04 +0530
Subject: [PATCH 17/54] fix(ming_tts): align async chunk payload with
 generation adapter

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 .../test_ming_tts_async_chunk.py                     | 12 ++++++++++++
 .../stage_input_processors/ming_tts.py               |  4 ++++
 2 files changed, 16 insertions(+)

diff --git a/tests/model_executor/stage_input_processors/test_ming_tts_async_chunk.py b/tests/model_executor/stage_input_processors/test_ming_tts_async_chunk.py
index 1a7acd04263..cfd0e43045f 100644
--- a/tests/model_executor/stage_input_processors/test_ming_tts_async_chunk.py
+++ b/tests/model_executor/stage_input_processors/test_ming_tts_async_chunk.py
@@ -131,6 +131,8 @@ def test_llm2audio_vae_async_chunk_emits_full_chunk():
     )
 
     assert payload is not None
+    assert payload["codes"]["audio"] == [0]
+    assert payload["meta"]["finished"].item() is False
     assert payload["finished"].item() is False
     assert payload["stream_finished"].item() is False
     assert payload[KEY_REQUEST_ID] == request_id
@@ -210,6 +212,8 @@ def test_llm2audio_vae_async_chunk_finish_after_full_chunk_only_emits_eof():
     )
 
     assert finish_payload == {
+        "codes": {"audio": []},
+        "meta": {"finished": torch.tensor(True, dtype=torch.bool)},
         "code_predictor_codes": [],
         "finished": torch.tensor(True, dtype=torch.bool),
         "stream_finished": torch.tensor(True, dtype=torch.bool),
@@ -238,6 +242,8 @@ def test_llm2audio_vae_async_chunk_flushes_tail_on_finish_without_new_patch():
     )
 
     assert payload is not None
+    assert payload["codes"]["audio"] == [0]
+    assert payload["meta"]["finished"].item() is True
     assert payload["finished"].item() is True
     assert payload["stream_finished"].item() is True
     assert payload[KEY_REQUEST_ID] == request_id
@@ -262,6 +268,8 @@ def test_llm2audio_vae_async_chunk_final_flush_emits_partial_chunk_with_new_patc
     )
 
     assert payload is not None
+    assert payload["codes"]["audio"] == [0]
+    assert payload["meta"]["finished"].item() is True
     assert payload["finished"].item() is True
     assert payload["stream_finished"].item() is True
     assert payload[MING_EMIT_PATCH_COUNT_KEY] == 2
@@ -283,6 +291,8 @@ def test_llm2audio_vae_async_chunk_emits_eof_when_finished_without_frames():
     )
 
     assert payload == {
+        "codes": {"audio": []},
+        "meta": {"finished": torch.tensor(True, dtype=torch.bool)},
         "code_predictor_codes": [],
         "finished": torch.tensor(True, dtype=torch.bool),
         "stream_finished": torch.tensor(True, dtype=torch.bool),
@@ -308,6 +318,8 @@ def test_llm2audio_vae_async_chunk_zero_latent_final_flush_returns_empty_payload
     )
 
     assert payload == {
+        "codes": {"audio": []},
+        "meta": {"finished": torch.tensor(True, dtype=torch.bool)},
         "code_predictor_codes": [],
         "finished": torch.tensor(True, dtype=torch.bool),
         "stream_finished": torch.tensor(True, dtype=torch.bool),
diff --git a/vllm_omni/model_executor/stage_input_processors/ming_tts.py b/vllm_omni/model_executor/stage_input_processors/ming_tts.py
index f66794cfb3e..cfbf0e401aa 100644
--- a/vllm_omni/model_executor/stage_input_processors/ming_tts.py
+++ b/vllm_omni/model_executor/stage_input_processors/ming_tts.py
@@ -207,6 +207,8 @@ def llm2audio_vae_async_chunk(
         if finished and not bool(state.get("terminal_sent", False)):
             observability = _build_chunk_observability(None, final_flush=True)
             payload = {
+                "codes": {"audio": []},
+                "meta": {"finished": torch.tensor(True, dtype=torch.bool)},
                 "code_predictor_codes": [],
                 "finished": torch.tensor(True, dtype=torch.bool),
                 "stream_finished": torch.tensor(True, dtype=torch.bool),
@@ -233,6 +235,8 @@ def llm2audio_vae_async_chunk(
     observability = _build_chunk_observability(latent_patches, final_flush=finished)
 
     payload = {
+        "codes": {"audio": [0]},
+        "meta": {"finished": torch.tensor(finished, dtype=torch.bool)},
         "code_predictor_codes": [0],
         "ming_latent_patches": latent_patches,
         "finished": torch.tensor(finished, dtype=torch.bool),

From 0508d2bf9b9b1670d386809a0fccef52868bf415 Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Wed, 29 Apr 2026 01:33:26 +0530
Subject: [PATCH 18/54] Fix Ming TTS codec frame rate derivation for online
 serving

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 .../openai_api/test_serving_speech.py         | 26 +++++++++++++++++++
 .../entrypoints/openai/serving_speech.py      | 26 +++++++++++++++++++
 2 files changed, 52 insertions(+)

diff --git a/tests/entrypoints/openai_api/test_serving_speech.py b/tests/entrypoints/openai_api/test_serving_speech.py
index 2f6ab5213d3..7ec6358a8e6 100644
--- a/tests/entrypoints/openai_api/test_serving_speech.py
+++ b/tests/entrypoints/openai_api/test_serving_speech.py
@@ -2439,6 +2439,32 @@ def test_prepare_speech_generation_extracts_ming_single_ref_audio_speaker_embedd
         )
         assert request.speaker_embedding == [0.3] * 192
 
+    def test_load_codec_frame_rate_derives_ming_rate_from_hf_config(self, mocker: MockerFixture):
+        server = object.__new__(OmniOpenAIServingSpeech)
+        server._tts_model_type = "ming_tts"
+        server.engine_client = mocker.MagicMock()
+        server.engine_client.model_config = mocker.MagicMock()
+        server.engine_client.model_config.model = "inclusionAI/Ming-omni-tts-0.5B"
+        server.engine_client.model_config.hf_config = SimpleNamespace(
+            llm_config={},
+            ditar_config={"patch_size": 4},
+            aggregator_config={},
+            audio_tokenizer_config={
+                "sample_rate": 44100,
+                "enc_kwargs": {
+                    "hop_size": 882,
+                    "input_dim": 882,
+                    "latent_dim": 80,
+                },
+                "dec_kwargs": {"output_dim": 882},
+                "patch_size": 16,
+            },
+        )
+
+        rate = OmniOpenAIServingSpeech._load_codec_frame_rate(server)
+
+        assert rate == pytest.approx(12.5)
+
 
 class TestWAVStreaming:
     """Integration tests for WAV format streaming."""
diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py
index 5e994a77ef6..5ae99c23cce 100644
--- a/vllm_omni/entrypoints/openai/serving_speech.py
+++ b/vllm_omni/entrypoints/openai/serving_speech.py
@@ -256,6 +256,32 @@ def __init__(self, *args, **kwargs):
 
     def _load_codec_frame_rate(self) -> float | None:
         """Load codec frame rate from speech tokenizer config for prompt length estimation."""
+        if self._tts_model_type == "ming_tts":
+            try:
+                from vllm_omni.model_executor.models.ming_tts.config_ming_tts import MingTTSConfig
+
+                hf_config = self.engine_client.model_config.hf_config
+                ming_cfg = MingTTSConfig.from_hf_config(hf_config)
+                patch_size = int(ming_cfg.patch_size)
+                audio_frame_hop = int(ming_cfg.audio_frame_hop)
+                sample_rate = int(ming_cfg.sample_rate)
+                if patch_size <= 0 or audio_frame_hop <= 0 or sample_rate <= 0:
+                    raise ValueError(
+                        "Ming config has invalid tokenizer timing values: "
+                        f"patch_size={patch_size}, audio_frame_hop={audio_frame_hop}, sample_rate={sample_rate}"
+                    )
+                rate = float(sample_rate) / float(audio_frame_hop * patch_size)
+                logger.info(
+                    "Derived Ming codec frame rate: %.1f Hz (sample_rate=%s, audio_frame_hop=%s, patch_size=%s)",
+                    rate,
+                    sample_rate,
+                    audio_frame_hop,
+                    patch_size,
+                )
+                return rate
+            except Exception as e:
+                logger.warning(f"Failed to derive Ming codec frame rate from hf_config: {e}")
+
         try:
             model_path = self.engine_client.model_config.model
             st_config_path = os.path.join(model_path, "speech_tokenizer", "config.json")

From 88199f42b9e99bf56ad333fe6ebefa99148e0a20 Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Wed, 13 May 2026 19:06:54 +0530
Subject: [PATCH 19/54] refactor(ming-tts): flatten prompt helpers and remove
 legacy dense scaffolding

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 .../offline_inference/ming_tts/end2end.py     |   2 +-
 tests/e2e/offline_inference/test_ming_tts.py  |   2 +-
 .../openai_api/test_serving_speech.py         | 401 -----------
 .../ming_tts/test_ming_tts_components.py      | 505 -------------
 .../ming_tts/test_ming_tts_config_shim.py     |  51 --
 .../models/ming_tts/test_ming_tts_loaders.py  | 524 --------------
 .../ming_tts/test_ming_tts_prompt_builder.py  | 375 ----------
 .../test_ming_tts_async_chunk.py              | 433 -----------
 tests/worker/test_ming_tts_runner.py          | 676 ------------------
 vllm_omni/engine/arg_utils.py                 |   4 +-
 .../entrypoints/openai/serving_speech.py      |  14 +-
 .../models/ming_tts/__init__.py               |   2 +-
 .../models/ming_tts/backbone.py               |  62 --
 .../models/ming_tts/config_ming_tts.py        |  49 ++
 .../ming_tts/configuration_ming_dense.py      |  57 --
 .../model_executor/models/ming_tts/fm/cfm.py  |  58 --
 .../model_executor/models/ming_tts/fm/dit.py  |  21 +-
 .../model_executor/models/ming_tts/ingress.py |   2 +-
 .../model_executor/models/ming_tts/loader.py  | 270 -------
 .../models/ming_tts/ming_tts.py               |   7 +-
 .../models/ming_tts/ming_tts_llm.py           |  77 +-
 .../ming_tts/prompt_builder/__init__.py       |  37 -
 .../models/ming_tts/prompt_builder/_base.py   | 210 ------
 .../ming_tts/prompt_builder/builders.py       | 210 ------
 .../stage_configs/ming_tts.yaml               |  65 --
 .../stage_configs/ming_tts_async_chunk.yaml   |  86 ---
 26 files changed, 73 insertions(+), 4127 deletions(-)
 delete mode 100644 tests/model_executor/models/ming_tts/test_ming_tts_components.py
 delete mode 100644 tests/model_executor/models/ming_tts/test_ming_tts_config_shim.py
 delete mode 100644 tests/model_executor/models/ming_tts/test_ming_tts_loaders.py
 delete mode 100644 tests/model_executor/models/ming_tts/test_ming_tts_prompt_builder.py
 delete mode 100644 tests/model_executor/stage_input_processors/test_ming_tts_async_chunk.py
 delete mode 100644 tests/worker/test_ming_tts_runner.py
 delete mode 100644 vllm_omni/model_executor/models/ming_tts/backbone.py
 delete mode 100644 vllm_omni/model_executor/models/ming_tts/configuration_ming_dense.py
 delete mode 100644 vllm_omni/model_executor/models/ming_tts/prompt_builder/__init__.py
 delete mode 100644 vllm_omni/model_executor/models/ming_tts/prompt_builder/_base.py
 delete mode 100644 vllm_omni/model_executor/models/ming_tts/prompt_builder/builders.py
 delete mode 100644 vllm_omni/model_executor/stage_configs/ming_tts.yaml
 delete mode 100644 vllm_omni/model_executor/stage_configs/ming_tts_async_chunk.yaml

diff --git a/examples/offline_inference/ming_tts/end2end.py b/examples/offline_inference/ming_tts/end2end.py
index e86c3b969f1..43d693b7675 100644
--- a/examples/offline_inference/ming_tts/end2end.py
+++ b/examples/offline_inference/ming_tts/end2end.py
@@ -20,7 +20,7 @@
     KEY_TEMPERATURE,
     SAMPLE_RATE,
 )
-from vllm_omni.model_executor.models.ming_tts.prompt_builder import build_ming_dense_prompt
+from vllm_omni.model_executor.models.ming_tts.prompt_utils import build_ming_dense_prompt
 from vllm_omni.model_executor.models.ming_tts.speaker_extractor import MingSpeakerEmbeddingExtractor
 
 try:
diff --git a/tests/e2e/offline_inference/test_ming_tts.py b/tests/e2e/offline_inference/test_ming_tts.py
index 9672dcdae1d..b6562b19c3e 100644
--- a/tests/e2e/offline_inference/test_ming_tts.py
+++ b/tests/e2e/offline_inference/test_ming_tts.py
@@ -21,7 +21,7 @@
     SAMPLE_RATE,
     TEXT_EOS_TOKEN_ID,
 )
-from vllm_omni.model_executor.models.ming_tts.prompt_builder import build_ming_dense_prompt
+from vllm_omni.model_executor.models.ming_tts.prompt_utils import build_ming_dense_prompt
 
 MODEL = "inclusionAI/Ming-omni-tts-0.5B"
 DEPLOY_CONFIG = get_deploy_config_path("ming_tts.yaml")
diff --git a/tests/entrypoints/openai_api/test_serving_speech.py b/tests/entrypoints/openai_api/test_serving_speech.py
index 59db25c051b..72767ec02d0 100644
--- a/tests/entrypoints/openai_api/test_serving_speech.py
+++ b/tests/entrypoints/openai_api/test_serving_speech.py
@@ -6,7 +6,6 @@
 from inspect import Signature, signature
 from pathlib import Path
 from types import SimpleNamespace
-from unittest.mock import AsyncMock, MagicMock
 
 import numpy as np
 import pytest
@@ -1995,48 +1994,6 @@ def fish_speech_server(mocker: MockerFixture):
     server.shutdown()
 
 
-@pytest.fixture
-def ming_speech_server(mocker: MockerFixture):
-    mocker.patch.object(OmniOpenAIServingSpeech, "_load_supported_speakers", return_value={"灵小甄"})
-    mocker.patch.object(OmniOpenAIServingSpeech, "_load_codec_frame_rate", return_value=None)
-
-    mock_engine_client = mocker.MagicMock()
-    mock_engine_client.errored = False
-    mock_engine_client.model_config = mocker.MagicMock(model="inclusionAI/Ming-omni-tts-0.5B")
-    mock_engine_client.default_sampling_params_list = [
-        SimpleNamespace(max_tokens=512, stop_token_ids=[]),
-        SimpleNamespace(max_tokens=1, stop_token_ids=[]),
-    ]
-    mock_engine_client.tts_batch_max_items = 32
-    mock_engine_client.generate = mocker.MagicMock(return_value="generator")
-    mock_engine_client.stage_configs = [
-        SimpleNamespace(
-            engine_args=SimpleNamespace(
-                model_stage="llm",
-                model_arch="MingTTSForConditionalGeneration",
-                worker_type="ar",
-            ),
-            tts_args={},
-        )
-    ]
-
-    mock_models = mocker.MagicMock()
-    mock_models.is_base_model.return_value = True
-
-    server = OmniOpenAIServingSpeech(
-        engine_client=mock_engine_client,
-        models=mock_models,
-        request_logger=mocker.MagicMock(),
-    )
-    server._build_ming_prompt = MagicMock(
-        return_value={
-            "prompt_token_ids": [1, 2, 3],
-            "additional_information": {},
-        }
-    )
-    return server
-
-
 class TestFishSpeechServing:
     def test_build_fish_prompt_normalizes_legacy_speaker_tags(self, fish_speech_server):
         tokenizer = _FakeFishTokenizer()
@@ -2173,364 +2130,6 @@ def test_create_speech_batch_allows_fish_text_only_items(self, fish_speech_serve
         fish_speech_server._generate_audio_bytes.assert_awaited_once()
 
 
-class TestMingSpeechServing:
-    class _FakeMingTokenizer:
-        def __init__(self):
-            self._token_to_id = {
-                "<audioPatch>": 9001,
-                "<|vision_start|>": 9002,
-                "<|vision_pad|>": 9003,
-                "<|vision_end|>\n": 9004,
-            }
-            self._next = 100
-
-        def encode(self, text):
-            if text not in self._token_to_id:
-                self._token_to_id[text] = self._next
-                self._next += 1
-            return [self._token_to_id[text]]
-
-        def convert_tokens_to_ids(self, token):
-            if token not in self._token_to_id:
-                self._token_to_id[token] = self._next
-                self._next += 1
-            return self._token_to_id[token]
-
-    def test_protocol_accepts_ming_podcast_ref_audio_and_nested_embeddings(self):
-        request = OpenAICreateSpeechRequest(
-            input=" speaker_1:你好。\n speaker_2:你好。\n",
-            ref_audio=["data:audio/wav;base64,aaa", "data:audio/wav;base64,bbb"],
-            ref_text=" speaker_1:参考一。\n speaker_2:参考二。\n",
-            speaker_embedding=[[0.1] * 192, [0.2] * 192],
-        )
-
-        assert request.ref_audio == ["data:audio/wav;base64,aaa", "data:audio/wav;base64,bbb"]
-        assert request.speaker_embedding == [[0.1] * 192, [0.2] * 192]
-
-    def test_protocol_preserves_single_ming_ref_audio_and_flat_embedding(self):
-        single_ref = OpenAICreateSpeechRequest(
-            input="Hello",
-            ref_audio="data:audio/wav;base64,aaa",
-            ref_text="reference",
-        )
-        single_embedding = OpenAICreateSpeechRequest(
-            input="Hello",
-            speaker_embedding=[0.1] * 192,
-        )
-
-        assert single_ref.ref_audio == "data:audio/wav;base64,aaa"
-        assert single_embedding.speaker_embedding == [0.1] * 192
-
-    def test_validate_ming_podcast_rules(self, ming_speech_server):
-        valid = OpenAICreateSpeechRequest(
-            input=" speaker_1:你好。\n speaker_2:你好。\n",
-            ref_audio=["data:audio/wav;base64,aaa", "data:audio/wav;base64,bbb"],
-            ref_text=" speaker_1:参考一。\n speaker_2:参考二。\n",
-        )
-        one_clip = OpenAICreateSpeechRequest(
-            input=" speaker_1:你好。\n",
-            ref_audio=["data:audio/wav;base64,aaa"],
-            ref_text=" speaker_1:参考一。\n",
-        )
-        missing_ref_text = OpenAICreateSpeechRequest(
-            input=" speaker_1:你好。\n speaker_2:你好。\n",
-            ref_audio=["data:audio/wav;base64,aaa", "data:audio/wav;base64,bbb"],
-        )
-        mismatched_embeddings = OpenAICreateSpeechRequest(
-            input=" speaker_1:你好。\n speaker_2:你好。\n",
-            ref_audio=["data:audio/wav;base64,aaa", "data:audio/wav;base64,bbb"],
-            ref_text=" speaker_1:参考一。\n speaker_2:参考二。\n",
-            speaker_embedding=[[0.1] * 192],
-        )
-
-        assert ming_speech_server._validate_ming_tts_request(valid) is None
-        assert "at least two" in ming_speech_server._validate_ming_tts_request(one_clip)
-        assert "ref_text" in ming_speech_server._validate_ming_tts_request(missing_ref_text)
-        assert "one speaker embedding per ref_audio" in ming_speech_server._validate_ming_tts_request(
-            mismatched_embeddings
-        )
-
-    def test_validate_ming_single_speaker_clone_still_accepts_existing_shape(self, ming_speech_server):
-        request = OpenAICreateSpeechRequest(
-            input="Hello",
-            ref_audio="data:audio/wav;base64,aaa",
-            ref_text="reference text",
-        )
-
-        assert ming_speech_server._validate_ming_tts_request(request) is None
-
-    def test_resolve_ref_audio_many_preserves_order(self, ming_speech_server):
-        ming_speech_server._resolve_ref_audio = AsyncMock(
-            side_effect=[
-                ([0.1, 0.2], 24000),
-                ([0.3, 0.4], 44100),
-            ]
-        )
-
-        resolved = asyncio.run(
-            ming_speech_server._resolve_ref_audio_many(["data:audio/wav;base64,aaa", "data:audio/wav;base64,bbb"])
-        )
-
-        assert resolved == [([0.1, 0.2], 24000), ([0.3, 0.4], 44100)]
-        ming_speech_server._resolve_ref_audio.assert_any_await("data:audio/wav;base64,aaa")
-        ming_speech_server._resolve_ref_audio.assert_any_await("data:audio/wav;base64,bbb")
-
-    def test_extract_ming_speaker_embeddings_uses_one_call_per_wav(self, ming_speech_server, mocker: MockerFixture):
-        calls = []
-
-        class _FakeExtractor:
-            def __init__(self, model, target_sr=16000):
-                self.model = model
-                self.target_sr = target_sr
-
-            def extract_from_waveform(self, waveform, sample_rate):
-                calls.append(
-                    {
-                        "model": self.model,
-                        "target_sr": self.target_sr,
-                        "shape": tuple(waveform.shape),
-                        "sample_rate": int(sample_rate),
-                    }
-                )
-                return torch.full((192,), float(len(calls)), dtype=torch.float32)
-
-        mocker.patch(
-            "vllm_omni.model_executor.models.ming_tts.speaker_extractor.MingSpeakerEmbeddingExtractor",
-            _FakeExtractor,
-        )
-
-        embeddings = ming_speech_server._extract_ming_speaker_embeddings_from_ref_audio(
-            [
-                ([0.1, 0.2], 22050),
-                ([0.3, 0.4, 0.5], 44100),
-            ]
-        )
-
-        assert len(embeddings) == 2
-        assert embeddings[0] == [1.0] * 192
-        assert embeddings[1] == [2.0] * 192
-        assert calls == [
-            {
-                "model": "inclusionAI/Ming-omni-tts-0.5B",
-                "target_sr": 16000,
-                "shape": (1, 2),
-                "sample_rate": 22050,
-            },
-            {
-                "model": "inclusionAI/Ming-omni-tts-0.5B",
-                "target_sr": 16000,
-                "shape": (1, 3),
-                "sample_rate": 44100,
-            },
-        ]
-
-    def test_build_ming_prompt_handles_multi_speaker_podcast_inputs(self, ming_speech_server):
-        from vllm_omni.model_executor.models.ming_tts.config_ming_tts import KEY_SPEAKER_EMBEDDING
-
-        ming_speech_server._tts_tokenizer = self._FakeMingTokenizer()
-        request = OpenAICreateSpeechRequest(
-            input=" speaker_1:你好。\n speaker_2:你好。\n",
-            ref_audio=["data:audio/wav;base64,aaa", "data:audio/wav;base64,bbb"],
-            ref_text=" speaker_1:参考一。\n speaker_2:参考二。\n",
-            speaker_embedding=[[0.1] * 192, [0.2] * 192],
-        )
-
-        prompt = OmniOpenAIServingSpeech._build_ming_dense_prompt(
-            ming_speech_server,
-            request,
-            ref_audio_data=[
-                ([0.1] * 10, 44100),
-                ([0.2] * 20, 44100),
-            ],
-        )
-
-        info = prompt["additional_information"]
-        assert tuple(info[KEY_SPEAKER_EMBEDDING].shape) == (2, 192)
-        assert int(info["prompt_waveform_length"].item()) >= 30
-        assert info["prompt_text"] == " speaker_1:参考一。\n speaker_2:参考二。\n"
-        assert (
-            prompt["prompt_token_ids"].count(
-                ming_speech_server._tts_tokenizer.convert_tokens_to_ids("<|vision_start|>")
-            )
-            == 2
-        )
-
-    def test_build_ming_prompt_concatenates_podcast_waveforms_before_builder(
-        self, ming_speech_server, mocker: MockerFixture
-    ):
-        captured = {}
-
-        def _fake_build_ming_dense_prompt(*args, **kwargs):
-            captured.update(kwargs)
-            return {"prompt_token_ids": [1], "additional_information": {}}
-
-        mocker.patch(
-            "vllm_omni.model_executor.models.ming_tts.prompt_builder.build_ming_dense_prompt",
-            side_effect=_fake_build_ming_dense_prompt,
-        )
-        ming_speech_server._tts_tokenizer = object()
-        request = OpenAICreateSpeechRequest(
-            input=" speaker_1:你好。\n speaker_2:你好。\n",
-            ref_audio=["data:audio/wav;base64,aaa", "data:audio/wav;base64,bbb"],
-            ref_text=" speaker_1:参考一。\n speaker_2:参考二。\n",
-            speaker_embedding=[[0.1] * 192, [0.2] * 192],
-        )
-
-        OmniOpenAIServingSpeech._build_ming_dense_prompt(
-            ming_speech_server,
-            request,
-            ref_audio_data=[
-                ([0.1] * 10, 44100),
-                ([0.2] * 20, 44100),
-            ],
-        )
-
-        assert tuple(captured["prompt_waveform"].shape) == (1, 30)
-        assert captured["speaker_embedding"] == [[0.1] * 192, [0.2] * 192]
-        assert captured["prompt_text"] == " speaker_1:参考一。\n speaker_2:参考二。\n"
-
-    def test_build_ming_prompt_omits_prompt_waveform_without_ref_text(self, ming_speech_server, mocker: MockerFixture):
-        captured = {}
-
-        def _fake_build_ming_dense_prompt(*args, **kwargs):
-            captured.update(kwargs)
-            return {"prompt_token_ids": [1], "additional_information": {}}
-
-        mocker.patch(
-            "vllm_omni.model_executor.models.ming_tts.prompt_builder.build_ming_dense_prompt",
-            side_effect=_fake_build_ming_dense_prompt,
-        )
-        ming_speech_server._tts_tokenizer = object()
-        request = OpenAICreateSpeechRequest(
-            input="我竟然抢到了陈奕迅的演唱会门票！",
-            ref_audio="data:audio/wav;base64,aaa",
-            instructions='{"情感":"高兴"}',
-        )
-
-        OmniOpenAIServingSpeech._build_ming_dense_prompt(
-            ming_speech_server,
-            request,
-            ref_audio_data=([0.1] * 10, 44100),
-        )
-
-        assert captured["prompt_waveform"] is None
-        assert captured["prompt_text"] is None
-        assert captured["speaker_embedding"] is None
-
-    def test_build_ming_prompt_keeps_single_ref_audio_waveform_with_ref_text(
-        self, ming_speech_server, mocker: MockerFixture
-    ):
-        captured = {}
-
-        def _fake_build_ming_dense_prompt(*args, **kwargs):
-            captured.update(kwargs)
-            return {"prompt_token_ids": [1], "additional_information": {}}
-
-        mocker.patch(
-            "vllm_omni.model_executor.models.ming_tts.prompt_builder.build_ming_dense_prompt",
-            side_effect=_fake_build_ming_dense_prompt,
-        )
-        ming_speech_server._tts_tokenizer = object()
-        request = OpenAICreateSpeechRequest(
-            input="我们的愿景是构建未来服务业的数字化基础设施。",
-            ref_audio="data:audio/wav;base64,aaa",
-            ref_text="在此奉劝大家别乱打美白针。",
-        )
-
-        OmniOpenAIServingSpeech._build_ming_dense_prompt(
-            ming_speech_server,
-            request,
-            ref_audio_data=([0.1] * 10, 44100),
-        )
-
-        assert tuple(captured["prompt_waveform"].shape) == (1, 10)
-        assert captured["prompt_text"] == "在此奉劝大家别乱打美白针。"
-        assert captured["speaker_embedding"] is None
-
-    def test_prepare_speech_generation_sets_ming_stop_token(self, ming_speech_server):
-        from vllm_omni.model_executor.models.ming_tts.config_ming_tts import TEXT_EOS_TOKEN_ID
-
-        request = OpenAICreateSpeechRequest(
-            input="这款产品的名字，叫变态坑爹牛肉丸。",
-            voice="灵小甄",
-        )
-
-        request_id, generator, _ = asyncio.run(ming_speech_server._prepare_speech_generation(request))
-
-        assert request_id.startswith("speech-")
-        assert generator == "generator"
-        sampling_params_list = ming_speech_server.engine_client.generate.call_args.kwargs["sampling_params_list"]
-        assert sampling_params_list[0].stop_token_ids == [int(TEXT_EOS_TOKEN_ID)]
-        assert sampling_params_list[0].max_tokens == 512
-        assert ming_speech_server.engine_client.default_sampling_params_list[0].stop_token_ids == []
-        assert ming_speech_server.engine_client.default_sampling_params_list[0].max_tokens == 512
-
-    def test_prepare_speech_generation_overrides_ming_stage_max_tokens(self, ming_speech_server):
-        from vllm_omni.model_executor.models.ming_tts.config_ming_tts import TEXT_EOS_TOKEN_ID
-
-        request = OpenAICreateSpeechRequest(
-            input="这款产品的名字，叫变态坑爹牛肉丸。",
-            voice="灵小甄",
-            max_new_tokens=16,
-        )
-
-        request_id, generator, _ = asyncio.run(ming_speech_server._prepare_speech_generation(request))
-
-        assert request_id.startswith("speech-")
-        assert generator == "generator"
-        sampling_params_list = ming_speech_server.engine_client.generate.call_args.kwargs["sampling_params_list"]
-        assert sampling_params_list[0].stop_token_ids == [int(TEXT_EOS_TOKEN_ID)]
-        assert sampling_params_list[0].max_tokens == 17
-        assert ming_speech_server.engine_client.default_sampling_params_list[0].max_tokens == 512
-
-    def test_prepare_speech_generation_extracts_ming_single_ref_audio_speaker_embedding(
-        self, ming_speech_server, mocker: MockerFixture
-    ):
-        request = OpenAICreateSpeechRequest(
-            input="我竟然抢到了陈奕迅的演唱会门票！",
-            ref_audio="data:audio/wav;base64,aaa",
-            instructions='{"情感":"高兴"}',
-        )
-        ming_speech_server._max_instructions_length = 500
-        ming_speech_server._resolve_ref_audio = AsyncMock(return_value=([0.1, 0.2], 44100))
-        ming_speech_server._extract_ming_speaker_embeddings_from_ref_audio = mocker.MagicMock(
-            return_value=[[0.3] * 192]
-        )
-
-        asyncio.run(ming_speech_server._prepare_speech_generation(request))
-
-        ming_speech_server._extract_ming_speaker_embeddings_from_ref_audio.assert_called_once_with(
-            [([0.1, 0.2], 44100)]
-        )
-        assert request.speaker_embedding == [0.3] * 192
-
-    def test_load_codec_frame_rate_derives_ming_rate_from_hf_config(self, mocker: MockerFixture):
-        server = object.__new__(OmniOpenAIServingSpeech)
-        server._tts_model_type = "ming_tts"
-        server.engine_client = mocker.MagicMock()
-        server.engine_client.model_config = mocker.MagicMock()
-        server.engine_client.model_config.model = "inclusionAI/Ming-omni-tts-0.5B"
-        server.engine_client.model_config.hf_config = SimpleNamespace(
-            llm_config={},
-            ditar_config={"patch_size": 4},
-            aggregator_config={},
-            audio_tokenizer_config={
-                "sample_rate": 44100,
-                "enc_kwargs": {
-                    "hop_size": 882,
-                    "input_dim": 882,
-                    "latent_dim": 80,
-                },
-                "dec_kwargs": {"output_dim": 882},
-                "patch_size": 16,
-            },
-        )
-
-        rate = OmniOpenAIServingSpeech._load_codec_frame_rate(server)
-
-        assert rate == pytest.approx(12.5)
-
-
 class TestWAVStreaming:
     """Integration tests for WAV format streaming."""
 
diff --git a/tests/model_executor/models/ming_tts/test_ming_tts_components.py b/tests/model_executor/models/ming_tts/test_ming_tts_components.py
deleted file mode 100644
index 866ef4b3f29..00000000000
--- a/tests/model_executor/models/ming_tts/test_ming_tts_components.py
+++ /dev/null
@@ -1,505 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from types import SimpleNamespace
-
-import pytest
-import torch
-import torch.nn as nn
-
-from vllm_omni.model_executor.models.ming_tts.aggregator import Aggregator
-from vllm_omni.model_executor.models.ming_tts.audio_tokenizer.configuration_audio_vae import AudioVAEconfig
-from vllm_omni.model_executor.models.ming_tts.audio_tokenizer.istft import ISTFT, ISTFTHead
-from vllm_omni.model_executor.models.ming_tts.audio_tokenizer.modeling_audio_vae import AudioVAE
-from vllm_omni.model_executor.models.ming_tts.audio_tokenizer.vae_modules import StreamingLinearUpsample
-from vllm_omni.model_executor.models.ming_tts.flowloss_head import FlowLoss
-from vllm_omni.model_executor.models.ming_tts.fm.cfm import CFM, Solver, get_epss_timesteps
-from vllm_omni.model_executor.models.ming_tts.fm.dit import (
-    CondEmbedder,
-    DiT,
-    SinusPositionEmbedding,
-    TimestepEmbedder,
-)
-from vllm_omni.model_executor.models.ming_tts.fm.modules import Attention, DiTBlock, RMSNorm
-from vllm_omni.model_executor.models.ming_tts.ming_tts import (
-    _coerce_prompt_latents,
-    _find_audio_placeholder_positions,
-    _initial_history,
-)
-from vllm_omni.model_executor.models.ming_tts.ming_tts_audio_vae import _coerce_finished, _coerce_latent_chunk
-from vllm_omni.model_executor.models.ming_tts.ming_tts_llm import _coerce_latent_history
-from vllm_omni.model_executor.stage_input_processors.ming_tts import llm2audio_vae_async_chunk
-
-pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
-
-
-def _tiny_qwen_config(hidden_size=8):
-    return {
-        "hidden_size": hidden_size,
-        "intermediate_size": hidden_size * 2,
-        "num_hidden_layers": 1,
-        "num_attention_heads": 2,
-        "num_key_value_heads": 2,
-        "vocab_size": 32,
-        "max_position_embeddings": 64,
-    }
-
-
-def _tiny_audio_vae_config():
-    return AudioVAEconfig(
-        sample_rate=16000,
-        patch_size=2,
-        enc_kwargs={
-            "backbone": _tiny_qwen_config(),
-            "input_dim": 4,
-            "hop_size": 4,
-            "latent_dim": 2,
-        },
-        dec_kwargs={
-            "backbone": _tiny_qwen_config(),
-            "output_dim": 4,
-            "latent_dim": 2,
-        },
-        semantic_module_kwargs=None,
-    )
-
-
-class _DummyCFMModel(nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.anchor = nn.Parameter(torch.zeros(()))
-
-    def forward(self, x, t, c, latent_history, mask=None):
-        del t, c, latent_history
-        if mask is not None:
-            x = x.masked_fill(~mask.unsqueeze(-1), 0.0)
-        return x
-
-    def forward_with_cfg(self, x, t, c, cfg_scale, latent_history, patch_size):
-        del t, c, cfg_scale, latent_history
-        cond = x[:, -patch_size:, :] + 1.0
-        uncond = x[:, -patch_size:, :]
-        return torch.cat([cond, uncond], dim=0)
-
-
-def test_rmsnorm_preserves_shape_and_dtype():
-    norm = RMSNorm(dim=8, eps=1e-6)
-    x = torch.randn(2, 3, 8, dtype=torch.float32)
-
-    out = norm(x)
-
-    assert out.shape == x.shape
-    assert out.dtype == x.dtype
-
-
-def test_attention_forward_shape_and_mask():
-    attn = Attention(dim=8, heads=2, dim_head=4, dropout=0.0)
-    x = torch.randn(1, 5, 8)
-    mask = torch.tensor([[True, True, True, True, False]])
-
-    out = attn(x, mask=mask)
-
-    assert out.shape == x.shape
-    assert torch.allclose(out[:, -1], torch.zeros_like(out[:, -1]))
-
-
-def test_attention_rejects_bad_mask_shape():
-    attn = Attention(dim=8, heads=2, dim_head=4, dropout=0.0)
-    x = torch.randn(1, 5, 8)
-
-    with pytest.raises(ValueError, match="Mask shape mismatch"):
-        attn(x, mask=torch.ones(1, 4, dtype=torch.bool))
-
-
-def test_dit_block_forward_shape():
-    block = DiTBlock(hidden_size=8, num_heads=2, mlp_ratio=2.0, dropout=0.0)
-    x = torch.randn(1, 5, 8)
-    mask = torch.ones(1, 5, dtype=torch.bool)
-
-    out = block(x, mask, rope=None)
-
-    assert out.shape == x.shape
-
-
-def test_sinus_position_embedding_shape():
-    embed = SinusPositionEmbedding(dim=8)
-    t = torch.tensor([0.0, 1.0], dtype=torch.float32)
-
-    out = embed(t)
-
-    assert out.shape == (2, 8)
-
-
-def test_timestep_embedder_distinguishes_steps():
-    embedder = TimestepEmbedder(dim=8, freq_embed_dim=8)
-
-    out_a = embedder(torch.tensor([0.0], dtype=torch.float32))
-    out_b = embedder(torch.tensor([1.0], dtype=torch.float32))
-
-    assert out_a.shape == (1, 8)
-    assert not torch.allclose(out_a, out_b)
-
-
-def test_cond_embedder_rejects_bad_rank():
-    embedder = CondEmbedder(input_feature_size=4, hidden_size=8, dropout_prob=0.0)
-
-    with pytest.raises(ValueError, match="rank-3"):
-        embedder(torch.randn(1, 4), train=False)
-
-
-def test_cond_drop_preserves_conditioning_dtype():
-    embedder = CondEmbedder(input_feature_size=4, hidden_size=8, dropout_prob=1.0)
-    llm_cond = torch.randn(1, 1, 4, dtype=torch.float16)
-
-    out = embedder.cond_drop(llm_cond)
-
-    assert out.dtype == llm_cond.dtype
-
-
-def test_dit_forward_shape():
-    model = DiT(
-        in_channels=2,
-        hidden_size=8,
-        depth=1,
-        num_heads=2,
-        mlp_ratio=2.0,
-        llm_cond_dim=4,
-        cfg_dropout_prob=0.0,
-    )
-    x = torch.randn(1, 2, 2)
-    latent_history = torch.randn(1, 4, 2)
-    c = torch.randn(1, 1, 4)
-    mask = torch.ones(1, 2, dtype=torch.bool)
-
-    out = model(x=x, t=torch.tensor([0.5]), c=c, latent_history=latent_history, mask=mask)
-
-    assert out.shape == (1, 7, 2)
-
-
-def test_dit_forward_with_cfg_preserves_conditioning_dtype(monkeypatch):
-    model = DiT(
-        in_channels=2,
-        hidden_size=8,
-        depth=1,
-        num_heads=2,
-        mlp_ratio=2.0,
-        llm_cond_dim=4,
-        cfg_dropout_prob=0.0,
-    )
-    seen = {}
-
-    def _fake_forward(x, t, c, latent_history, mask=None):
-        del x, t, latent_history, mask
-        seen["dtype"] = c.dtype
-        return torch.zeros((c.shape[0], 7, 2), dtype=torch.float32)
-
-    monkeypatch.setattr(model, "forward", _fake_forward)
-    x = torch.randn(1, 2, 2, dtype=torch.float16)
-    latent_history = torch.randn(1, 4, 2, dtype=torch.float16)
-    c = torch.randn(1, 1, 4, dtype=torch.float16)
-
-    model.forward_with_cfg(
-        x=x,
-        t=torch.tensor([0.5], dtype=torch.float16),
-        c=c,
-        cfg_scale=2.0,
-        latent_history=latent_history,
-        patch_size=2,
-    )
-
-    assert seen["dtype"] == c.dtype
-
-
-def test_aggregator_forward_shape():
-    agg = Aggregator(
-        in_channels=2,
-        hidden_size=8,
-        depth=1,
-        num_heads=2,
-        mlp_ratio=2.0,
-        llm_input_dim=4,
-    )
-    x = torch.randn(2, 3, 2)
-    mask = torch.ones(2, 3, dtype=torch.bool)
-
-    out = agg(x, mask=mask)
-
-    assert out.shape == (2, 1, 4)
-
-
-def test_get_epss_timesteps_predefined_and_fallback():
-    predefined = get_epss_timesteps(10, device=torch.device("cpu"), dtype=torch.float32)
-    fallback = get_epss_timesteps(9, device=torch.device("cpu"), dtype=torch.float32)
-
-    assert predefined.shape == (11,)
-    assert torch.allclose(predefined[-1], torch.tensor(1.0))
-    assert fallback.shape == (10,)
-    assert torch.allclose(fallback, torch.linspace(0, 1, 10))
-
-
-def test_solver_integrate_zero_function_is_stable():
-    y0 = torch.ones(1, 2, 2)
-    solver = Solver(lambda t, y: torch.zeros_like(y), y0=y0, sigma=0.0, temperature=0.0)
-    t = torch.linspace(0, 1, 4)
-
-    out = solver.integrate(t)
-
-    assert out.shape == (4, 1, 2, 2)
-    assert torch.allclose(out[0], y0)
-    assert torch.allclose(out[-1], y0)
-
-
-def test_cfm_forward_returns_scalar_loss():
-    torch.manual_seed(0)
-    cfm = CFM(model=_DummyCFMModel())
-    cond = torch.randn(1, 1, 4)
-    target = torch.randn(1, 2, 2)
-    latent_history = torch.randn(1, 4, 2)
-    mask = torch.ones(1, 2, dtype=torch.bool)
-
-    loss = cfm(cond=cond, target=target, latent_history=latent_history, mask=mask, patch_size=2)
-
-    assert loss.ndim == 0
-    assert torch.isfinite(loss)
-
-
-def test_cfm_sample_returns_sample_and_trajectory():
-    torch.manual_seed(0)
-    cfm = CFM(model=_DummyCFMModel())
-    noise = torch.randn(1, 2, 2)
-    cond = torch.randn(1, 1, 4)
-    latent_history = torch.randn(1, 4, 2)
-
-    out, trajectory = cfm.sample(noise=noise, c=cond, latent_history=latent_history, steps=4, patch_size=2)
-
-    assert out.shape == (1, 2, 2)
-    assert trajectory.shape == (5, 1, 2, 2)
-
-
-def test_cfm_sample_rejects_low_cfg_scale():
-    cfm = CFM(model=_DummyCFMModel())
-    noise = torch.randn(1, 2, 2)
-    cond = torch.randn(1, 1, 4)
-    latent_history = torch.randn(1, 4, 2)
-
-    out, trajectory = cfm.sample(
-        noise=noise,
-        c=cond,
-        latent_history=latent_history,
-        cfg_scale=0.0,
-        patch_size=2,
-    )
-
-    assert out.shape == (1, 2, 2)
-    assert trajectory.ndim == 4
-
-
-def test_flowloss_sample_returns_tensor_shape_and_dtype(monkeypatch):
-    flow = FlowLoss(
-        z_channels=2,
-        llm_cond_dim=4,
-        hidden_size=8,
-        depth=1,
-        num_heads=2,
-        mlp_ratio=2.0,
-        cfg_dropout_prob=0.0,
-    )
-
-    def _fake_sample(**kwargs):
-        noise = kwargs["noise"]
-        return noise.transpose(1, 2), torch.zeros(1)
-
-    monkeypatch.setattr(flow.cfm, "sample", _fake_sample)
-    z = torch.randn(1, 1, 4, dtype=torch.float32)
-    latent_history = torch.randn(1, 4, 2, dtype=torch.float32)
-
-    out = flow.sample(z=z, latent_history=latent_history, patch_size=3)
-
-    assert out.shape == (1, 3, 2)
-    assert out.dtype == z.dtype
-
-
-def test_streaming_linear_upsample_rejects_empty_final_flush():
-    upsample = StreamingLinearUpsample(scale_factor=2)
-
-    with pytest.raises(ValueError, match="end-of-stream"):
-        upsample(None, state=None, is_last=True)
-
-
-def test_streaming_linear_upsample_streams_and_flushes():
-    upsample = StreamingLinearUpsample(scale_factor=2)
-    chunk_a = torch.randn(1, 2, 3)
-    chunk_b = torch.randn(1, 2, 3)
-
-    out_a, state = upsample(chunk_a, state=None, is_last=False)
-    out_b, state = upsample(chunk_b, state=state, is_last=True)
-
-    assert out_a is None
-    assert out_b is not None
-    assert out_b.shape[0] == 1
-    assert out_b.shape[-1] == 3
-    assert state is None
-
-
-def test_istft_rejects_bad_rank():
-    istft = ISTFT(n_fft=16, hop_length=4, win_length=16, padding="same")
-
-    with pytest.raises(ValueError, match="rank-3"):
-        istft(torch.randn(1, 9))
-
-
-def test_istft_head_output_shape():
-    head = ISTFTHead(dim=8, n_fft=16, hop_length=4, padding="same")
-    x = torch.randn(1, 3, 8)
-
-    audio, spec, audio_buffer, window_buffer = head(x)
-
-    assert audio.shape[0] == 1
-    assert audio.shape[1] == 1
-    assert spec.shape == (1, 18, 3)
-    assert audio_buffer is None
-    assert window_buffer is None
-
-
-def test_audio_vae_encode_and_decode_shapes():
-    torch.manual_seed(0)
-    vae = AudioVAE(_tiny_audio_vae_config())
-    waveform = torch.randn(1, 12)
-    waveform_length = torch.tensor([12], dtype=torch.int32)
-
-    latent, frame_num = vae.encode_latent(waveform, waveform_length)
-    audio, stream_state, past_key_values = vae.decode(latent, use_cache=False)
-
-    assert latent.ndim == 3
-    assert latent.shape[0] == 1
-    assert latent.shape[-1] == 2
-    assert frame_num.tolist() == [2]
-    assert audio.ndim == 3
-    assert audio.shape[0] == 1
-    assert audio.shape[1] == 1
-    assert stream_state == (None, None, None)
-    assert past_key_values is None
-
-
-def test_audio_vae_rejects_invalid_inputs():
-    vae = AudioVAE(_tiny_audio_vae_config())
-
-    with pytest.raises(ValueError, match="waveform rank-2"):
-        vae.encode_latent(torch.randn(12), torch.tensor([12], dtype=torch.int32))
-
-    with pytest.raises(ValueError, match="Latent dim mismatch"):
-        vae.decode(torch.randn(1, 2, 3))
-
-
-def test_coerce_prompt_latents_supports_frames_and_patch_groups():
-    frames = torch.arange(8, dtype=torch.float32).reshape(4, 2)
-    patches = torch.arange(16, dtype=torch.float32).reshape(2, 2, 4)
-
-    out_frames = _coerce_prompt_latents(frames, patch_size=2, latent_dim=2)
-    out_patches = _coerce_prompt_latents(patches, patch_size=2, latent_dim=4)
-
-    assert out_frames["patches"].shape == (2, 2, 2)
-    assert out_frames["frames"].shape == (4, 2)
-    assert out_patches["patches"].shape == (2, 2, 4)
-    assert out_patches["frames"].shape == (4, 4)
-
-
-def test_initial_history_keeps_tail():
-    frames = torch.arange(12, dtype=torch.float32).reshape(6, 2)
-
-    history = _initial_history(
-        frames,
-        history_size=4,
-        latent_dim=2,
-        device=torch.device("cpu"),
-        dtype=torch.float32,
-    )
-
-    assert history.shape == (4, 2)
-    assert torch.allclose(history, frames[-4:])
-
-
-def test_find_audio_placeholder_positions_uses_audio_span():
-    cfg = SimpleNamespace(
-        audio_dummy_token_id=151705,
-        audio_start_token_id=151706,
-        audio_end_token_id=151707,
-    )
-    input_ids = torch.tensor([151705, 1, 151706, 151705, 151705, 151707, 151705], dtype=torch.long)
-
-    out = _find_audio_placeholder_positions(input_ids, cfg)
-
-    assert out.tolist() == [3, 4]
-
-
-def test_helper_coercions_fail_loudly():
-    cfg = SimpleNamespace(history_patch_size=4, latent_dim=2)
-
-    assert _coerce_finished(torch.tensor([1], dtype=torch.bool)) is True
-    latent_chunk = _coerce_latent_chunk(
-        torch.ones(4, 2),
-        device=torch.device("cpu"),
-        dtype=torch.float32,
-        latent_dim=2,
-        patch_size=4,
-    )
-    assert latent_chunk.shape == (1, 4, 2)
-
-    grouped_chunk = _coerce_latent_chunk(
-        torch.ones(2, 4, 2),
-        device=torch.device("cpu"),
-        dtype=torch.float32,
-        latent_dim=2,
-        patch_size=4,
-    )
-    assert grouped_chunk.shape == (2, 4, 2)
-
-    with pytest.raises(RuntimeError, match="latent_history shape mismatch"):
-        _coerce_latent_history(torch.ones(3, 2), device=torch.device("cpu"), dtype=torch.float32, cfg=cfg)
-
-    with pytest.raises(ValueError, match="Latent patch size mismatch"):
-        _coerce_latent_chunk(
-            torch.ones(1, 3, 2),
-            device=torch.device("cpu"),
-            dtype=torch.float32,
-            latent_dim=2,
-            patch_size=4,
-        )
-
-    with pytest.raises(ValueError, match="Latent dim mismatch"):
-        _coerce_latent_chunk(
-            torch.ones(4, 3),
-            device=torch.device("cpu"),
-            dtype=torch.float32,
-            latent_dim=2,
-            patch_size=4,
-        )
-
-
-def test_ming_async_chunk_rejects_left_context_replay():
-    transfer_manager = SimpleNamespace(
-        connector=SimpleNamespace(config={"extra": {"latent_chunk_size": 10, "latent_left_context": 1}}),
-        put_req_chunk={"req-1": 0},
-        request_payload={},
-    )
-    request = SimpleNamespace(external_req_id="req-1", is_finished=lambda: False)
-
-    with pytest.raises(ValueError, match="latent_left_context replay"):
-        llm2audio_vae_async_chunk(
-            transfer_manager=transfer_manager,
-            pooling_output=None,
-            request=request,
-            is_finished=False,
-        )
-
-
-def test_coerce_latent_history_casts_to_requested_dtype():
-    cfg = SimpleNamespace(history_patch_size=4, latent_dim=2)
-
-    history = _coerce_latent_history(
-        torch.ones(1, 4, 2, dtype=torch.float16),
-        device=torch.device("cpu"),
-        dtype=torch.float32,
-        cfg=cfg,
-    )
-
-    assert history.dtype == torch.float32
diff --git a/tests/model_executor/models/ming_tts/test_ming_tts_config_shim.py b/tests/model_executor/models/ming_tts/test_ming_tts_config_shim.py
deleted file mode 100644
index 06cd4a8a787..00000000000
--- a/tests/model_executor/models/ming_tts/test_ming_tts_config_shim.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from transformers import AutoConfig
-
-from vllm_omni.engine.arg_utils import _register_omni_hf_configs
-from vllm_omni.model_executor.models.ming_tts.configuration_ming_dense import MingDenseConfig
-
-
-def test_ming_dense_autoconfig_registration_uses_local_config(tmp_path):
-    _register_omni_hf_configs()
-    model_dir = tmp_path / "ming"
-    model_dir.mkdir()
-    (model_dir / "config.json").write_text(
-        """
-{
-  "model_type": "dense",
-  "auto_map": {"AutoConfig": "configuration_bailingmm.BailingMMConfig"},
-  "llm_config": {
-    "model_type": "qwen2",
-    "hidden_size": 896,
-    "intermediate_size": 4864,
-    "num_hidden_layers": 24,
-    "num_attention_heads": 14,
-    "num_key_value_heads": 2,
-    "vocab_size": 151936
-  },
-  "audio_tokenizer_config": {
-    "sample_rate": 44100,
-    "patch_size": 4,
-    "enc_kwargs": {
-      "latent_dim": 64,
-      "input_dim": 882,
-      "hop_size": 882,
-      "backbone": {"attn_implementation": "flash_attention_2"}
-    },
-    "dec_kwargs": {
-      "latent_dim": 64,
-      "output_dim": 882,
-      "backbone": {"_attn_implementation": "flash_attention_2"}
-    }
-  }
-}
-""".strip()
-    )
-
-    cfg = AutoConfig.from_pretrained(model_dir, trust_remote_code=False, local_files_only=True)
-
-    assert isinstance(cfg, MingDenseConfig)
-    assert cfg.get_text_config().num_attention_heads == 14
-    assert cfg.audio_tokenizer_config.sample_rate == 44100
-    assert cfg.audio_tokenizer_config.patch_size == 4
diff --git a/tests/model_executor/models/ming_tts/test_ming_tts_loaders.py b/tests/model_executor/models/ming_tts/test_ming_tts_loaders.py
deleted file mode 100644
index b7f95469bf4..00000000000
--- a/tests/model_executor/models/ming_tts/test_ming_tts_loaders.py
+++ /dev/null
@@ -1,524 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from types import SimpleNamespace
-
-import pytest
-import torch
-from vllm.v1.outputs import SamplerOutput
-
-from vllm_omni.model_executor.models.ming_tts.config_ming_tts import KEY_PROMPT_LATENTS, KEY_REQUEST_ID, MingTTSConfig
-from vllm_omni.model_executor.models.ming_tts.ming_tts import MingTTSForConditionalGeneration
-from vllm_omni.model_executor.models.ming_tts.ming_tts_audio_vae import MingAudioVAEModel
-from vllm_omni.model_executor.models.ming_tts.ming_tts_llm import MingLLMModel
-
-
-class _DummyBackbone(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.model = torch.nn.Module()
-        self.model.layers = torch.nn.ModuleList([torch.nn.Linear(2, 2, bias=False)])
-        self.last_forward_kwargs = None
-
-    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return torch.zeros((input_ids.shape[0], 2), dtype=torch.float32)
-
-    def forward(self, *args, **kwargs):
-        del args
-        self.last_forward_kwargs = dict(kwargs)
-        return torch.zeros((1, 2), dtype=torch.float32)
-
-
-class _DummyAggregator(torch.nn.Module):
-    def __init__(self, in_channels: int, llm_input_dim: int, **kwargs):
-        super().__init__()
-        del kwargs
-        self.proj_in = torch.nn.Linear(in_channels, llm_input_dim, bias=False)
-
-    def forward(self, patch: torch.Tensor) -> torch.Tensor:
-        return self.proj_in(patch.mean(dim=1)).unsqueeze(1)
-
-
-class _DummyFlowLoss(torch.nn.Module):
-    def __init__(self, z_channels: int, llm_cond_dim: int, **kwargs):
-        super().__init__()
-        del z_channels, kwargs
-        self.dummy = torch.nn.Linear(llm_cond_dim, 64, bias=False)
-
-    def sample(self, **kwargs):
-        del kwargs
-        return torch.zeros((1, 4, 64), dtype=torch.float32)
-
-
-class _DummyAudioVAE(torch.nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        del config
-        self.encoder = torch.nn.Linear(2, 2, bias=False)
-        self.decoder = torch.nn.Linear(2, 2, bias=False)
-        self.last_chunk_values = []
-
-    def encode_latent(self, waveform: torch.Tensor, waveform_length: torch.Tensor):
-        del waveform_length
-        batch = int(waveform.shape[0])
-        return torch.zeros((batch, 8, 64), dtype=torch.float32), None
-
-    def decode(
-        self,
-        latent_patch: torch.Tensor,
-        *,
-        past_key_values=None,
-        use_cache=True,
-        stream_state=None,
-        last_chunk=False,
-    ):
-        del past_key_values, use_cache, stream_state
-        self.last_chunk_values.append(last_chunk)
-        samples = int(latent_patch.shape[1]) * 8
-        waveform = torch.ones((1, 1, samples), dtype=torch.float32)
-        return waveform, (None, None, None), None
-
-
-def _make_audio_cfg():
-    return SimpleNamespace(
-        enc_kwargs={
-            "backbone": {"hidden_size": 2},
-            "input_dim": 882,
-            "hop_size": 882,
-            "latent_dim": 64,
-        },
-        dec_kwargs={
-            "backbone": {"hidden_size": 2},
-            "output_dim": 882,
-            "latent_dim": 64,
-        },
-        patch_size=4,
-        sample_rate=44100,
-        semantic_module_kwargs=None,
-    )
-
-
-def _make_config() -> MingTTSConfig:
-    cfg = MingTTSConfig(audio_tokenizer_config=_make_audio_cfg())
-    cfg.validate()
-    return cfg
-
-
-def _make_vllm_config(model_stage: str):
-    return SimpleNamespace(
-        model_config=SimpleNamespace(hf_config=SimpleNamespace(), model_stage=model_stage),
-        quant_config=None,
-        device_config=SimpleNamespace(device=torch.device("cpu")),
-    )
-
-
-def test_ming_llm_load_weights_maps_and_loads_expected_prefixes(monkeypatch):
-    import vllm_omni.model_executor.models.ming_tts.config_ming_tts as cfg_mod
-    import vllm_omni.model_executor.models.ming_tts.ming_tts_llm as llm_mod
-
-    cfg = _make_config()
-    monkeypatch.setattr(cfg_mod.MingTTSConfig, "from_hf_config", classmethod(lambda cls, hf: cfg))
-    monkeypatch.setattr(llm_mod, "init_vllm_registered_model", lambda **kwargs: _DummyBackbone())
-    monkeypatch.setattr(llm_mod, "Aggregator", _DummyAggregator)
-    monkeypatch.setattr(llm_mod, "FlowLoss", _DummyFlowLoss)
-
-    model = MingLLMModel(vllm_config=_make_vllm_config("llm"))
-    weights = [
-        ("model.model.layers.0.weight", torch.full((2, 2), 1.0, dtype=torch.float32)),
-        ("linear_proj_audio.proj_in.weight", torch.full((896, 64), 2.0, dtype=torch.float32)),
-        ("flowloss.dummy.weight", torch.full((64, 896), 3.0, dtype=torch.float32)),
-        ("stop_head.weight", torch.full((2, 896), 4.0, dtype=torch.float32)),
-        ("stop_head.bias", torch.full((2,), 5.0, dtype=torch.float32)),
-        ("spk_head.weight", torch.full((896, 192), 6.0, dtype=torch.float32)),
-        ("spk_head.bias", torch.full((896,), 7.0, dtype=torch.float32)),
-    ]
-
-    loaded = model.load_weights(weights)
-
-    assert "model.model.layers.0.weight" in loaded
-    assert "linear_proj_audio.proj_in.weight" in loaded
-    assert "flowloss.dummy.weight" in loaded
-    assert "stop_head.weight" in loaded
-    assert "spk_head.weight" in loaded
-    assert torch.allclose(model.model.model.layers[0].weight, torch.full((2, 2), 1.0))
-    assert torch.allclose(model.linear_proj_audio.proj_in.weight, torch.full((896, 64), 2.0))
-    assert torch.allclose(model.flowloss.dummy.weight, torch.full((64, 896), 3.0))
-
-
-def test_ming_llm_load_weights_accepts_complete_checkpoint_and_forward_shape(monkeypatch):
-    import vllm_omni.model_executor.models.ming_tts.config_ming_tts as cfg_mod
-    import vllm_omni.model_executor.models.ming_tts.ming_tts_llm as llm_mod
-
-    cfg = _make_config()
-    monkeypatch.setattr(cfg_mod.MingTTSConfig, "from_hf_config", classmethod(lambda cls, hf: cfg))
-    monkeypatch.setattr(llm_mod, "init_vllm_registered_model", lambda **kwargs: _DummyBackbone())
-    monkeypatch.setattr(llm_mod, "Aggregator", _DummyAggregator)
-    monkeypatch.setattr(llm_mod, "FlowLoss", _DummyFlowLoss)
-
-    model = MingLLMModel(vllm_config=_make_vllm_config("llm"))
-    model.load_weights(
-        [
-            ("model.layers.0.weight", torch.ones((2, 2), dtype=torch.float32)),
-            ("linear_proj_audio.proj_in.weight", torch.ones((896, 64), dtype=torch.float32)),
-            ("flowloss.dummy.weight", torch.ones((64, 896), dtype=torch.float32)),
-            ("stop_head.weight", torch.ones((2, 896), dtype=torch.float32)),
-            ("stop_head.bias", torch.ones((2,), dtype=torch.float32)),
-            ("spk_head.weight", torch.ones((896, 192), dtype=torch.float32)),
-            ("spk_head.bias", torch.ones((896,), dtype=torch.float32)),
-        ]
-    )
-
-    output = model.forward(
-        input_ids=torch.tensor([1], dtype=torch.long),
-        positions=torch.tensor([0], dtype=torch.long),
-    )
-
-    assert output.text_hidden_states.shape == (1, 2)
-    assert output.multimodal_outputs is None
-
-
-def test_ming_llm_load_weights_fails_when_custom_heads_missing(monkeypatch):
-    import vllm_omni.model_executor.models.ming_tts.config_ming_tts as cfg_mod
-    import vllm_omni.model_executor.models.ming_tts.ming_tts_llm as llm_mod
-
-    cfg = _make_config()
-    monkeypatch.setattr(cfg_mod.MingTTSConfig, "from_hf_config", classmethod(lambda cls, hf: cfg))
-    monkeypatch.setattr(llm_mod, "init_vllm_registered_model", lambda **kwargs: _DummyBackbone())
-    monkeypatch.setattr(llm_mod, "Aggregator", _DummyAggregator)
-    monkeypatch.setattr(llm_mod, "FlowLoss", _DummyFlowLoss)
-
-    model = MingLLMModel(vllm_config=_make_vllm_config("llm"))
-    weights = [
-        ("model.layers.0.weight", torch.full((2, 2), 1.0, dtype=torch.float32)),
-        ("stop_head.weight", torch.full((2, 896), 4.0, dtype=torch.float32)),
-        ("stop_head.bias", torch.full((2,), 5.0, dtype=torch.float32)),
-        ("spk_head.weight", torch.full((896, 192), 6.0, dtype=torch.float32)),
-        ("spk_head.bias", torch.full((896,), 7.0, dtype=torch.float32)),
-    ]
-
-    with pytest.raises(RuntimeError, match="flowloss|linear_proj_audio"):
-        model.load_weights(weights)
-
-
-def test_ming_llm_load_weights_rejects_incomplete_checkpoint(monkeypatch):
-    import vllm_omni.model_executor.models.ming_tts.config_ming_tts as cfg_mod
-    import vllm_omni.model_executor.models.ming_tts.ming_tts_llm as llm_mod
-
-    cfg = _make_config()
-    monkeypatch.setattr(cfg_mod.MingTTSConfig, "from_hf_config", classmethod(lambda cls, hf: cfg))
-    monkeypatch.setattr(llm_mod, "init_vllm_registered_model", lambda **kwargs: _DummyBackbone())
-    monkeypatch.setattr(llm_mod, "Aggregator", _DummyAggregator)
-    monkeypatch.setattr(llm_mod, "FlowLoss", _DummyFlowLoss)
-
-    model = MingLLMModel(vllm_config=_make_vllm_config("llm"))
-
-    with pytest.raises(RuntimeError, match="flowloss|linear_proj_audio|stop_head|spk_head"):
-        model.load_weights(
-            [
-                ("model.layers.0.weight", torch.ones((2, 2), dtype=torch.float32)),
-                ("stop_head.weight", torch.ones((2, 896), dtype=torch.float32)),
-                ("stop_head.bias", torch.ones((2,), dtype=torch.float32)),
-            ]
-        )
-
-
-def test_ming_audio_vae_load_weights_fails_when_audio_params_missing(monkeypatch):
-    import vllm_omni.model_executor.models.ming_tts.config_ming_tts as cfg_mod
-    import vllm_omni.model_executor.models.ming_tts.ming_tts_audio_vae as vae_mod
-
-    cfg = _make_config()
-    monkeypatch.setattr(cfg_mod.MingTTSConfig, "from_hf_config", classmethod(lambda cls, hf: cfg))
-    monkeypatch.setattr(vae_mod, "AudioVAE", _DummyAudioVAE)
-
-    model = MingAudioVAEModel(vllm_config=_make_vllm_config("audio_vae"))
-
-    with pytest.raises(RuntimeError, match="params not loaded"):
-        model.load_weights(
-            [
-                ("audio.encoder.weight", torch.full((2, 2), 1.0, dtype=torch.float32)),
-            ]
-        )
-
-
-def test_ming_audio_vae_load_weights_accepts_complete_checkpoint_and_forward_shape(monkeypatch):
-    import vllm_omni.model_executor.models.ming_tts.config_ming_tts as cfg_mod
-    import vllm_omni.model_executor.models.ming_tts.ming_tts_audio_vae as vae_mod
-
-    cfg = _make_config()
-    monkeypatch.setattr(cfg_mod.MingTTSConfig, "from_hf_config", classmethod(lambda cls, hf: cfg))
-    monkeypatch.setattr(vae_mod, "AudioVAE", _DummyAudioVAE)
-
-    model = MingAudioVAEModel(vllm_config=_make_vllm_config("audio_vae"))
-    model.load_weights(
-        [
-            ("audio.encoder.weight", torch.ones((2, 2), dtype=torch.float32)),
-            ("audio.decoder.weight", torch.ones((2, 2), dtype=torch.float32)),
-        ]
-    )
-
-    output = model.forward(
-        runtime_additional_information=[
-            {
-                KEY_REQUEST_ID: "rid-audio",
-                "ming_latent_patches": torch.ones((1, 4, 64), dtype=torch.float32),
-                "stream_finished": torch.tensor(True, dtype=torch.bool),
-            }
-        ]
-    )
-
-    waveform = output.multimodal_outputs["model_outputs"][0]
-    sample_rate = output.multimodal_outputs["sr"][0]
-    assert waveform.ndim == 1
-    assert waveform.dtype == torch.float32
-    assert waveform.shape == (32,)
-    assert int(sample_rate.item()) == 44100
-    assert model.audio.last_chunk_values == [True]
-
-
-def test_ming_audio_vae_load_weights_rejects_incomplete_checkpoint(monkeypatch):
-    import vllm_omni.model_executor.models.ming_tts.config_ming_tts as cfg_mod
-    import vllm_omni.model_executor.models.ming_tts.ming_tts_audio_vae as vae_mod
-
-    cfg = _make_config()
-    monkeypatch.setattr(cfg_mod.MingTTSConfig, "from_hf_config", classmethod(lambda cls, hf: cfg))
-    monkeypatch.setattr(vae_mod, "AudioVAE", _DummyAudioVAE)
-
-    model = MingAudioVAEModel(vllm_config=_make_vllm_config("audio_vae"))
-
-    with pytest.raises(RuntimeError, match="params not loaded|no checkpoint weights"):
-        model.load_weights(
-            [
-                ("audio.encoder.weight", torch.ones((2, 2), dtype=torch.float32)),
-            ]
-        )
-
-
-def test_ming_audio_vae_load_weights_rejects_empty_input(monkeypatch):
-    import vllm_omni.model_executor.models.ming_tts.config_ming_tts as cfg_mod
-    import vllm_omni.model_executor.models.ming_tts.ming_tts_audio_vae as vae_mod
-
-    cfg = _make_config()
-    monkeypatch.setattr(cfg_mod.MingTTSConfig, "from_hf_config", classmethod(lambda cls, hf: cfg))
-    monkeypatch.setattr(vae_mod, "AudioVAE", _DummyAudioVAE)
-
-    model = MingAudioVAEModel(vllm_config=_make_vllm_config("audio_vae"))
-
-    with pytest.raises(RuntimeError, match="no checkpoint weights"):
-        model.load_weights([])
-
-
-def test_ming_llm_forward_drops_runner_only_kwargs(monkeypatch):
-    import vllm_omni.model_executor.models.ming_tts.config_ming_tts as cfg_mod
-    import vllm_omni.model_executor.models.ming_tts.ming_tts_llm as llm_mod
-
-    cfg = _make_config()
-    backbone = _DummyBackbone()
-    monkeypatch.setattr(cfg_mod.MingTTSConfig, "from_hf_config", classmethod(lambda cls, hf: cfg))
-    monkeypatch.setattr(llm_mod, "init_vllm_registered_model", lambda **kwargs: backbone)
-    monkeypatch.setattr(llm_mod, "Aggregator", _DummyAggregator)
-    monkeypatch.setattr(llm_mod, "FlowLoss", _DummyFlowLoss)
-
-    model = MingLLMModel(vllm_config=_make_vllm_config("llm"))
-    output = model.forward(
-        input_ids=torch.tensor([1], dtype=torch.long),
-        positions=torch.tensor([0], dtype=torch.long),
-        sampling_metadata=object(),
-        logits_index=0,
-        sampler=object(),
-        additional_information={"text": "hello"},
-    )
-
-    assert set(backbone.last_forward_kwargs) == {
-        "input_ids",
-        "positions",
-        "intermediate_tensors",
-        "inputs_embeds",
-    }
-    assert torch.equal(backbone.last_forward_kwargs["input_ids"], torch.tensor([1], dtype=torch.long))
-    assert torch.equal(backbone.last_forward_kwargs["positions"], torch.tensor([0], dtype=torch.long))
-    assert backbone.last_forward_kwargs["intermediate_tensors"] is None
-    assert torch.allclose(backbone.last_forward_kwargs["inputs_embeds"], torch.zeros((1, 2), dtype=torch.float32))
-    assert output.text_hidden_states.shape == (1, 2)
-    assert output.multimodal_outputs is None
-
-
-def test_ming_llm_forward_normalizes_runtime_additional_information(monkeypatch):
-    import vllm_omni.model_executor.models.ming_tts.config_ming_tts as cfg_mod
-    import vllm_omni.model_executor.models.ming_tts.ming_tts_llm as llm_mod
-
-    cfg = _make_config()
-    backbone = _DummyBackbone()
-    monkeypatch.setattr(cfg_mod.MingTTSConfig, "from_hf_config", classmethod(lambda cls, hf: cfg))
-    monkeypatch.setattr(llm_mod, "init_vllm_registered_model", lambda **kwargs: backbone)
-    monkeypatch.setattr(llm_mod, "Aggregator", _DummyAggregator)
-    monkeypatch.setattr(llm_mod, "FlowLoss", _DummyFlowLoss)
-
-    model = MingLLMModel(vllm_config=_make_vllm_config("llm"))
-    output = model.forward(
-        input_ids=torch.tensor([1], dtype=torch.long),
-        positions=torch.tensor([0], dtype=torch.long),
-        runtime_additional_information=[{"decode_step": 0}],
-    )
-
-    assert set(backbone.last_forward_kwargs) == {
-        "input_ids",
-        "positions",
-        "intermediate_tensors",
-        "inputs_embeds",
-    }
-    assert torch.equal(backbone.last_forward_kwargs["input_ids"], torch.tensor([1], dtype=torch.long))
-    assert torch.equal(backbone.last_forward_kwargs["positions"], torch.tensor([0], dtype=torch.long))
-    assert backbone.last_forward_kwargs["intermediate_tensors"] is None
-    assert torch.allclose(backbone.last_forward_kwargs["inputs_embeds"], torch.zeros((1, 2), dtype=torch.float32))
-    assert output.text_hidden_states.shape == (1, 2)
-    assert output.multimodal_outputs is None
-
-
-def test_ming_stage0_sampler_uses_model_sample(monkeypatch):
-    import vllm_omni.model_executor.models.ming_tts.config_ming_tts as cfg_mod
-    import vllm_omni.model_executor.models.ming_tts.ming_tts as ming_mod
-
-    class _DummyStage0(torch.nn.Module):
-        def sample(self, logits, sampling_metadata):
-            del logits, sampling_metadata
-            return SamplerOutput(
-                sampled_token_ids=torch.tensor([[151705]], dtype=torch.int32),
-                logprobs_tensors=None,
-            )
-
-    cfg = _make_config()
-    monkeypatch.setattr(cfg_mod.MingTTSConfig, "from_hf_config", classmethod(lambda cls, hf: cfg))
-    monkeypatch.setattr(ming_mod, "init_vllm_registered_model", lambda **kwargs: _DummyStage0())
-
-    model = MingTTSForConditionalGeneration(vllm_config=_make_vllm_config("llm"))
-    sampler_output = model.sampler(
-        torch.zeros((1, cfg.llm_vocab_size), dtype=torch.float32),
-        SimpleNamespace(seq_groups=[]),
-    )
-
-    assert isinstance(sampler_output, SamplerOutput)
-    assert sampler_output.sampled_token_ids.dtype == torch.int32
-    assert sampler_output.sampled_token_ids.tolist() == [[151705]]
-
-
-def test_ming_stage0_load_weights_does_not_load_audio_weights(monkeypatch):
-    import vllm_omni.model_executor.models.ming_tts.config_ming_tts as cfg_mod
-    import vllm_omni.model_executor.models.ming_tts.ming_tts as ming_mod
-
-    class _DummyStage0(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.loaded = None
-
-        def load_weights(self, weights):
-            self.loaded = list(weights)
-            return {name for name, _ in self.loaded}
-
-    cfg = _make_config()
-    stage0 = _DummyStage0()
-    monkeypatch.setattr(cfg_mod.MingTTSConfig, "from_hf_config", classmethod(lambda cls, hf: cfg))
-    monkeypatch.setattr(ming_mod, "init_vllm_registered_model", lambda **kwargs: stage0)
-
-    model = MingTTSForConditionalGeneration(vllm_config=_make_vllm_config("llm"))
-    loaded = model.load_weights(
-        [
-            ("model.layers.0.weight", torch.ones((2, 2), dtype=torch.float32)),
-            ("linear_proj_audio.proj_in.weight", torch.ones((896, 64), dtype=torch.float32)),
-            ("flowloss.dummy.weight", torch.ones((64, 896), dtype=torch.float32)),
-            ("stop_head.weight", torch.ones((2, 896), dtype=torch.float32)),
-            ("spk_head.weight", torch.ones((896, 192), dtype=torch.float32)),
-            ("audio.encoder.weight", torch.ones((2, 2), dtype=torch.float32)),
-        ]
-    )
-
-    assert "model.audio.encoder.weight" not in loaded
-    assert all(not name.startswith("audio.") for name, _ in stage0.loaded)
-    assert not hasattr(model, "_prompt_audio_encoder")
-
-
-def test_ming_resolve_prompt_latents_accepts_raw_waveform(monkeypatch):
-    import vllm_omni.model_executor.models.ming_tts.config_ming_tts as cfg_mod
-    import vllm_omni.model_executor.models.ming_tts.ming_tts as ming_mod
-
-    class _DummyStage0(torch.nn.Module):
-        def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
-            return torch.zeros((input_ids.shape[0], 2), dtype=torch.float32)
-
-    cfg = _make_config()
-    monkeypatch.setattr(cfg_mod.MingTTSConfig, "from_hf_config", classmethod(lambda cls, hf: cfg))
-    monkeypatch.setattr(ming_mod, "init_vllm_registered_model", lambda **kwargs: _DummyStage0())
-
-    model = MingTTSForConditionalGeneration(vllm_config=_make_vllm_config("llm"))
-    direct = torch.ones((8, 64), dtype=torch.float32)
-
-    resolved = model._resolve_prompt_latents({KEY_PROMPT_LATENTS: direct})
-    assert resolved is not None
-    assert torch.equal(resolved["frames"], direct)
-
-    model._encode_prompt_waveform_to_latents = lambda waveform, waveform_length=None: torch.ones(
-        (8, 64), dtype=torch.float32
-    )
-    resolved = model._resolve_prompt_latents(
-        {
-            "prompt_waveform": torch.ones((1, 1000), dtype=torch.float32),
-            "prompt_waveform_length": torch.tensor([1000], dtype=torch.int32),
-            "prompt_text": "Reference words.",
-        }
-    )
-    assert resolved is not None
-    assert resolved["patches"].shape == (2, 4, 64)
-
-
-def test_ming_resolve_prompt_latents_rejects_dual_truth_waveform_and_latents(monkeypatch):
-    import vllm_omni.model_executor.models.ming_tts.config_ming_tts as cfg_mod
-    import vllm_omni.model_executor.models.ming_tts.ming_tts as ming_mod
-
-    class _DummyStage0(torch.nn.Module):
-        def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
-            return torch.zeros((input_ids.shape[0], 2), dtype=torch.float32)
-
-    cfg = _make_config()
-    monkeypatch.setattr(cfg_mod.MingTTSConfig, "from_hf_config", classmethod(lambda cls, hf: cfg))
-    monkeypatch.setattr(ming_mod, "init_vllm_registered_model", lambda **kwargs: _DummyStage0())
-
-    model = MingTTSForConditionalGeneration(vllm_config=_make_vllm_config("llm"))
-
-    with pytest.raises(ValueError, match="Choose exactly one source of truth"):
-        model._resolve_prompt_latents(
-            {
-                KEY_PROMPT_LATENTS: torch.ones((8, 64), dtype=torch.float32),
-                "prompt_waveform": torch.ones((1, 1000), dtype=torch.float32),
-                "prompt_waveform_length": torch.tensor([1000], dtype=torch.int32),
-                "prompt_text": "Reference words.",
-            }
-        )
-
-
-def test_ming_prefill_overwrites_speaker_slot_embedding(monkeypatch):
-    import vllm_omni.model_executor.models.ming_tts.config_ming_tts as cfg_mod
-    import vllm_omni.model_executor.models.ming_tts.ming_tts as ming_mod
-
-    class _DummyStage0(torch.nn.Module):
-        def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
-            return torch.arange(int(input_ids.shape[0]) * 2, dtype=torch.float32).reshape(int(input_ids.shape[0]), 2)
-
-        def project_speaker_embedding(self, spk_emb: torch.Tensor) -> torch.Tensor:
-            del spk_emb
-            return torch.tensor([[101.0, 202.0]], dtype=torch.float32)
-
-    cfg = _make_config()
-    monkeypatch.setattr(cfg_mod.MingTTSConfig, "from_hf_config", classmethod(lambda cls, hf: cfg))
-    monkeypatch.setattr(ming_mod, "init_vllm_registered_model", lambda **kwargs: _DummyStage0())
-
-    vllm_config = _make_vllm_config("llm")
-    vllm_config.model_config.hf_config = SimpleNamespace(vision_start_token_id=10)
-    model = MingTTSForConditionalGeneration(vllm_config=vllm_config)
-
-    input_ids = torch.tensor([1, 10, 20, 2], dtype=torch.long)
-    input_embeds = model.model.embed_input_ids(input_ids)
-    _, updated_embeds, _ = model._prefill_preprocess(
-        input_ids,
-        input_embeds,
-        speaker_embedding=torch.ones((192,), dtype=torch.float32),
-    )
-
-    assert torch.allclose(updated_embeds[2], torch.tensor([101.0, 202.0], dtype=torch.float32))
diff --git a/tests/model_executor/models/ming_tts/test_ming_tts_prompt_builder.py b/tests/model_executor/models/ming_tts/test_ming_tts_prompt_builder.py
deleted file mode 100644
index 4381b913021..00000000000
--- a/tests/model_executor/models/ming_tts/test_ming_tts_prompt_builder.py
+++ /dev/null
@@ -1,375 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from types import SimpleNamespace
-
-import pytest
-import torch
-
-from vllm_omni.model_executor.models.ming_tts.config_ming_tts import (
-    AUDIO_FRAME_HOP,
-    KEY_CFG,
-    KEY_MAX_DECODE_STEPS,
-    KEY_MIN_DECODE_STEPS,
-    KEY_PROMPT_LATENTS,
-    KEY_SPEAKER_EMBEDDING,
-    PATCH_SIZE,
-    SAMPLE_RATE,
-)
-from vllm_omni.model_executor.models.ming_tts.ingress import MingIngressProcessor
-from vllm_omni.model_executor.models.ming_tts.prompt_builder import (
-    build_dense_prompt_token_ids,
-    build_ming_dense_prompt,
-    count_prompt_waveform_patches,
-    pad_prompt_waveform,
-)
-
-
-class _DummyTokenizer:
-    def __init__(self):
-        self._token_to_id = {"<audioPatch>": 9001, "<|vision_start|>": 9002}
-        self._id_to_token = {token_id: token for token, token_id in self._token_to_id.items()}
-        self._next = 100
-
-    def encode(self, text):
-        if text not in self._token_to_id:
-            self._token_to_id[text] = self._next
-            self._id_to_token[self._next] = text
-            self._next += 1
-        return [self._token_to_id[text]]
-
-    def convert_tokens_to_ids(self, token):
-        if token not in self._token_to_id:
-            self._token_to_id[token] = self._next
-            self._id_to_token[self._next] = token
-            self._next += 1
-        return self._token_to_id[token]
-
-    def decode(self, token_ids):
-        return "".join(self._id_to_token[int(token_id)] for token_id in token_ids)
-
-
-def _make_dummy_ingress_processor(tokenizer):
-    processor = MingIngressProcessor.__new__(MingIngressProcessor)
-    processor.tokenizer = tokenizer
-    processor.profile_ingress = False
-    processor.ming_config = SimpleNamespace(patch_size=4, latent_dim=64, vae_patch_size=4, audio_frame_hop=882)
-    return processor
-
-
-def test_build_dense_prompt_token_ids_matches_ming_dense_layout():
-    tokenizer = _DummyTokenizer()
-
-    prompt_ids = build_dense_prompt_token_ids(
-        tokenizer,
-        prompt="Prompt text.",
-        text="Target text.",
-        instruction="instruction-json",
-        prompt_text="reference transcript",
-        speaker_count=2,
-        prompt_patch_count=3,
-    )
-
-    assert prompt_ids.count(tokenizer.convert_tokens_to_ids("<audioPatch>")) == 3
-    assert prompt_ids.count(tokenizer.convert_tokens_to_ids("<|vision_start|>")) == 2
-    assert tokenizer.encode("instruction-json")[0] in prompt_ids
-    assert tokenizer.encode("reference transcript")[0] in prompt_ids
-
-
-def test_build_ming_dense_prompt_pads_prompt_waveform_and_zero_speaker():
-    tokenizer = _DummyTokenizer()
-    waveform = torch.ones((1, 1000), dtype=torch.float32)
-
-    prompt = build_ming_dense_prompt(
-        tokenizer,
-        prompt="Please imitate the reference speech.",
-        text="Hello world.",
-        prompt_text="Reference words.",
-        prompt_waveform=waveform,
-        use_zero_spk_emb=True,
-    )
-
-    info = prompt["additional_information"]
-    padded_waveform = info["prompt_waveform"]
-
-    assert padded_waveform.shape == (1, 14112)
-    assert int(info[KEY_SPEAKER_EMBEDDING].numel()) == 192
-    expected_patch_count = count_prompt_waveform_patches(waveform)
-    assert prompt["prompt_token_ids"].count(tokenizer.convert_tokens_to_ids("<audioPatch>")) == expected_patch_count
-
-
-def test_build_ming_dense_prompt_uses_patch_count_not_frame_count_for_zero_shot_waveform():
-    tokenizer = _DummyTokenizer()
-    waveform = torch.ones((1, 211680), dtype=torch.float32)
-
-    prompt = build_ming_dense_prompt(
-        tokenizer,
-        prompt="Please generate speech based on the following description.\n",
-        text="Target text.",
-        prompt_text="Reference words.",
-        prompt_waveform=waveform,
-        speaker_embedding=torch.ones((192,), dtype=torch.float32),
-    )
-
-    expected_patch_count = count_prompt_waveform_patches(waveform)
-    assert prompt["additional_information"].get(KEY_PROMPT_LATENTS) is None
-    assert prompt["prompt_token_ids"].count(tokenizer.convert_tokens_to_ids("<audioPatch>")) == expected_patch_count
-
-
-def test_build_ming_dense_prompt_accepts_flat_speaker_embedding_list():
-    tokenizer = _DummyTokenizer()
-    speaker_embedding = [0.1] * 192
-
-    prompt = build_ming_dense_prompt(
-        tokenizer,
-        prompt="Please imitate the reference speech.",
-        text="Hello world.",
-        speaker_embedding=speaker_embedding,
-    )
-
-    info = prompt["additional_information"]
-    assert tuple(info[KEY_SPEAKER_EMBEDDING].shape) == (192,)
-    assert prompt["prompt_token_ids"].count(tokenizer.convert_tokens_to_ids("<|vision_start|>")) == 1
-
-
-def test_build_ming_dense_prompt_uses_prompt_latents_to_set_patch_count():
-    tokenizer = _DummyTokenizer()
-    prompt_latents = torch.ones((15, 4, 64), dtype=torch.float32)
-
-    prompt = build_ming_dense_prompt(
-        tokenizer,
-        prompt="Please generate speech based on the following description.\n",
-        text="Target text.",
-        prompt_text="Reference words.",
-        prompt_latents=prompt_latents,
-        speaker_embedding=torch.ones((192,), dtype=torch.float32),
-    )
-
-    assert torch.equal(prompt["additional_information"][KEY_PROMPT_LATENTS], prompt_latents)
-    assert prompt["prompt_token_ids"].count(tokenizer.convert_tokens_to_ids("<audioPatch>")) == 15
-
-
-def test_build_ming_dense_prompt_allows_raw_waveform_shell_without_explicit_prompt_latents():
-    tokenizer = _DummyTokenizer()
-    waveform = torch.ones((1, 1000), dtype=torch.float32)
-
-    prompt = build_ming_dense_prompt(
-        tokenizer,
-        prompt="Please imitate the reference speech.",
-        text="Hello world.",
-        prompt_text="Reference words.",
-        prompt_waveform=waveform,
-        speaker_embedding=torch.ones((192,), dtype=torch.float32),
-    )
-
-    expected_patch_count = count_prompt_waveform_patches(waveform)
-    assert prompt["additional_information"].get(KEY_PROMPT_LATENTS) is None
-    assert prompt["prompt_token_ids"].count(tokenizer.convert_tokens_to_ids("<audioPatch>")) == expected_patch_count
-
-
-def test_build_ming_dense_prompt_rejects_dual_truth_waveform_and_prompt_latents():
-    tokenizer = _DummyTokenizer()
-    waveform = torch.ones((1, 1000), dtype=torch.float32)
-    prompt_latents = torch.ones((4, 64), dtype=torch.float32)
-
-    with pytest.raises(ValueError, match="Choose exactly one source of truth"):
-        build_ming_dense_prompt(
-            tokenizer,
-            prompt="Please imitate the reference speech.",
-            text="Hello world.",
-            prompt_text="Reference words.",
-            prompt_waveform=waveform,
-            prompt_latents=prompt_latents,
-        )
-
-
-def test_ming_ingress_processor_preserves_raw_waveform_for_stage0_encoding():
-    tokenizer = _DummyTokenizer()
-    waveform = torch.ones((1, 1000), dtype=torch.float32)
-    prompt_text = "Reference words."
-    prompt = build_ming_dense_prompt(
-        tokenizer,
-        prompt="Please imitate the reference speech.",
-        text="Hello world.",
-        prompt_text=prompt_text,
-        prompt_waveform=waveform,
-        speaker_embedding=torch.ones((192,), dtype=torch.float32),
-    )
-    prompt["prompt"] = "Please imitate the reference speech."
-    prompt["text"] = "Hello world."
-    prompt["prompt_text"] = prompt_text
-    prompt["prompt_waveform"] = waveform
-    prompt["prompt_waveform_length"] = torch.tensor([1000], dtype=torch.int32)
-
-    processor = _make_dummy_ingress_processor(tokenizer)
-    finalized = processor(prompt)
-
-    assert finalized["prompt_waveform"] is waveform
-    assert torch.equal(finalized["prompt_waveform_length"], torch.tensor([1000], dtype=torch.int32))
-    assert finalized["additional_information"]["prompt_waveform"] is prompt["additional_information"]["prompt_waveform"]
-    assert torch.equal(
-        finalized["additional_information"]["prompt_waveform_length"],
-        prompt["additional_information"]["prompt_waveform_length"],
-    )
-    assert KEY_PROMPT_LATENTS not in finalized["additional_information"]
-    expected_patch_count = count_prompt_waveform_patches(waveform)
-    assert finalized["prompt_token_ids"].count(tokenizer.convert_tokens_to_ids("<audioPatch>")) == expected_patch_count
-
-
-def test_build_ming_dense_prompt_rejects_prompt_waveform_without_prompt_text():
-    tokenizer = _DummyTokenizer()
-    waveform = torch.ones((1, 1000), dtype=torch.float32)
-
-    with pytest.raises(ValueError, match="prompt_waveform requires prompt_text"):
-        build_ming_dense_prompt(
-            tokenizer,
-            prompt="Please generate speech based on the following description.\n",
-            text="我竟然抢到了陈奕迅的演唱会门票！",
-            instruction={"情感": "高兴"},
-            prompt_waveform=waveform,
-        )
-
-
-def test_ming_ingress_processor_rejects_raw_prompt_waveform_without_prompt_text():
-    tokenizer = _DummyTokenizer()
-    waveform = torch.ones((1, 1000), dtype=torch.float32)
-    prompt = {
-        "prompt": "Please generate speech based on the following description.\n",
-        "text": "我竟然抢到了陈奕迅的演唱会门票！",
-        "prompt_token_ids": [1, 2, 3],
-        "additional_information": {
-            "prompt_waveform": waveform,
-            "prompt_waveform_length": torch.tensor([1000], dtype=torch.int32),
-        },
-    }
-
-    processor = _make_dummy_ingress_processor(tokenizer)
-
-    with pytest.raises(RuntimeError, match="prompt_waveform requires prompt_text"):
-        processor(prompt)
-
-
-def test_ming_ingress_processor_rebuilds_podcast_prompt_with_prompt_text_before_target_text():
-    tokenizer = _DummyTokenizer()
-    prompt_prefix = "Please generate speech based on the following description.\n"
-    prompt_text = " speaker_1:reference one\n speaker_2:reference two\n"
-    target_text = " speaker_1:target one\n speaker_2:target two\n"
-    speaker_embeddings = torch.ones((2, 192), dtype=torch.float32)
-    prompt_waveform = [
-        torch.ones((1, 1000), dtype=torch.float32),
-        torch.ones((1, 2000), dtype=torch.float32),
-    ]
-
-    prompt = build_ming_dense_prompt(
-        tokenizer,
-        prompt=prompt_prefix,
-        text=target_text,
-        prompt_text=prompt_text,
-        prompt_waveform=prompt_waveform,
-        speaker_embedding=speaker_embeddings,
-    )
-
-    processor = _make_dummy_ingress_processor(tokenizer)
-    finalized = processor(prompt)
-    decoded = tokenizer.decode(finalized["prompt_token_ids"])
-    expected_patch_count = count_prompt_waveform_patches(prompt_waveform)
-
-    assert decoded.index(prompt_text) < decoded.index(target_text)
-    assert finalized["prompt_token_ids"].count(tokenizer.convert_tokens_to_ids("<|vision_start|>")) == 2
-    assert finalized["prompt_token_ids"].count(tokenizer.convert_tokens_to_ids("<audioPatch>")) == expected_patch_count
-    assert "prompt_waveform" in finalized["additional_information"]
-    assert KEY_PROMPT_LATENTS not in finalized["additional_information"]
-
-
-def test_build_ming_dense_prompt_keeps_single_speaker_initial_payload_compatible():
-    tokenizer = _DummyTokenizer()
-    prompt_prefix = "Please imitate the reference speech."
-    target_text = "Hello world."
-    prompt_text = "Reference words."
-    waveform = torch.ones((1, 1000), dtype=torch.float32)
-
-    prompt = build_ming_dense_prompt(
-        tokenizer,
-        prompt=prompt_prefix,
-        text=target_text,
-        prompt_text=prompt_text,
-        prompt_waveform=waveform,
-        speaker_embedding=torch.ones((192,), dtype=torch.float32),
-    )
-    expected_patch_count = count_prompt_waveform_patches(waveform)
-    expected_prompt_token_ids = build_dense_prompt_token_ids(
-        tokenizer,
-        prompt=prompt_prefix,
-        text=target_text,
-        prompt_text=prompt_text,
-        speaker_count=1,
-        prompt_patch_count=expected_patch_count,
-    )
-
-    assert prompt["prompt"] == prompt_prefix
-    assert prompt["text"] == target_text
-    assert prompt["prompt_token_ids"] == expected_prompt_token_ids
-    assert prompt["prompt_token_ids"].count(tokenizer.convert_tokens_to_ids("<audioPatch>")) == expected_patch_count
-    assert prompt["additional_information"]["prompt_text"] == prompt_text
-
-
-def test_pad_prompt_waveform_matches_upstream_ming_alignment():
-    padded = pad_prompt_waveform(torch.ones((1, 3529), dtype=torch.float32))
-    assert int(padded.shape[-1]) == 14112
-    assert int(padded.shape[-1]) % int((float(SAMPLE_RATE) / 12.5) * int(PATCH_SIZE)) == 0
-    assert int(padded.shape[-1]) % int(AUDIO_FRAME_HOP * PATCH_SIZE) == 0
-
-
-def test_build_ming_dense_prompt_injects_duration_window_when_missing():
-    tokenizer = _DummyTokenizer()
-
-    prompt = build_ming_dense_prompt(
-        tokenizer,
-        prompt="Please generate music based on the following description.\n",
-        text=" Genre: electronic. Mood: confident. Instrument: drums. Theme: festival. Duration: 30s.",
-        runtime_controls={KEY_CFG: 2.0},
-    )
-
-    info = prompt["additional_information"]
-    assert float(info[KEY_CFG].item()) == 2.0
-    assert int(info[KEY_MIN_DECODE_STEPS].item()) == 91
-    assert int(info[KEY_MAX_DECODE_STEPS].item()) == 97
-
-
-def test_build_ming_dense_prompt_preserves_explicit_decode_window_overrides():
-    tokenizer = _DummyTokenizer()
-
-    prompt = build_ming_dense_prompt(
-        tokenizer,
-        prompt="Please generate music based on the following description.\n",
-        text=" Genre: electronic. Mood: confident. Instrument: drums. Theme: festival. Duration: 30s.",
-        runtime_controls={
-            KEY_MIN_DECODE_STEPS: 11,
-            KEY_MAX_DECODE_STEPS: 13,
-        },
-    )
-
-    info = prompt["additional_information"]
-    assert int(info[KEY_MIN_DECODE_STEPS].item()) == 11
-    assert int(info[KEY_MAX_DECODE_STEPS].item()) == 13
-
-
-def test_build_ming_dense_prompt_does_not_inject_duration_window_without_valid_duration():
-    tokenizer = _DummyTokenizer()
-
-    prompt_missing = build_ming_dense_prompt(
-        tokenizer,
-        prompt="Please generate music based on the following description.\n",
-        text=" Genre: electronic. Mood: confident. Instrument: drums. Theme: festival.",
-        runtime_controls={KEY_CFG: 2.0},
-    )
-    prompt_malformed = build_ming_dense_prompt(
-        tokenizer,
-        prompt="Please generate music based on the following description.\n",
-        text=" Genre: electronic. Mood: confident. Instrument: drums. Theme: festival. Duration: nope.",
-        runtime_controls={KEY_CFG: 2.0},
-    )
-
-    for prompt in (prompt_missing, prompt_malformed):
-        info = prompt["additional_information"]
-        assert KEY_MIN_DECODE_STEPS not in info
-        assert KEY_MAX_DECODE_STEPS not in info
diff --git a/tests/model_executor/stage_input_processors/test_ming_tts_async_chunk.py b/tests/model_executor/stage_input_processors/test_ming_tts_async_chunk.py
deleted file mode 100644
index cfd0e43045f..00000000000
--- a/tests/model_executor/stage_input_processors/test_ming_tts_async_chunk.py
+++ /dev/null
@@ -1,433 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from collections import defaultdict
-from types import SimpleNamespace
-
-import pytest
-import torch
-
-from vllm_omni.model_executor.models.ming_tts.config_ming_tts import (
-    KEY_REQUEST_ID,
-    LATENT_CHUNK_SIZE,
-    LATENT_LEFT_CONTEXT,
-    PATCH_SIZE,
-)
-from vllm_omni.model_executor.stage_input_processors.ming_tts import (
-    MING_EMIT_PATCH_COUNT_KEY,
-    MING_ESTIMATED_BYTES_KEY,
-    MING_FINAL_DECODE_STEP_KEY,
-    MING_FINAL_FLUSH_KEY,
-    MING_LATENT_SHAPE_KEY,
-    MING_STOP_REASON_KEY,
-    _extract_last_patch,
-    llm2audio_vae,
-    llm2audio_vae_async_chunk,
-)
-
-pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
-
-_LATENT_D = 64
-
-
-def _req(external_req_id: str, *, finished: bool):
-    return SimpleNamespace(
-        external_req_id=external_req_id,
-        is_finished=lambda: finished,
-    )
-
-
-def _manager(*, chunk_size: int | None = 2, left_context: int | None = 0, raw_config=None):
-    if raw_config is None:
-        extra = {}
-        if chunk_size is not None:
-            extra["latent_chunk_size"] = chunk_size
-        if left_context is not None:
-            extra["latent_left_context"] = left_context
-        raw_config = {"extra": extra}
-    return SimpleNamespace(
-        code_prompt_token_ids=defaultdict(list),
-        put_req_chunk=defaultdict(int),
-        request_payload={},
-        connector=SimpleNamespace(config=raw_config),
-    )
-
-
-def _patch(fill: float) -> torch.Tensor:
-    return torch.full((PATCH_SIZE, _LATENT_D), fill, dtype=torch.float32)
-
-
-def _payload(fill: float, *, has_patch=True, decode_step=None, stop_reason=None) -> dict[str, object]:
-    payload = {
-        "ming_has_patch": torch.tensor([has_patch]),
-        "ming_latent_patch": _patch(fill).unsqueeze(0),
-    }
-    if decode_step is not None:
-        payload["ming_decode_step"] = torch.tensor([decode_step], dtype=torch.int32)
-    if stop_reason is not None:
-        payload[MING_STOP_REASON_KEY] = (stop_reason,)
-    return payload
-
-
-def test_extract_last_patch_uses_active_mask():
-    patch = torch.arange(3 * PATCH_SIZE * _LATENT_D, dtype=torch.float16).reshape(3, PATCH_SIZE, _LATENT_D)
-    payload = {
-        "ming_has_patch": torch.tensor([False, True, False]),
-        "ming_latent_patch": patch,
-    }
-
-    out = _extract_last_patch(payload)
-
-    assert out is not None
-    assert out.shape == (PATCH_SIZE, _LATENT_D)
-    assert out.dtype == torch.float32
-    assert out.device.type == "cpu"
-    assert torch.allclose(out, patch[1].to(torch.float32).cpu())
-
-
-def test_llm2audio_vae_async_chunk_waits_for_full_chunk():
-    transfer_manager = _manager(chunk_size=2)
-    request = _req("rid-wait", finished=False)
-
-    payload = llm2audio_vae_async_chunk(
-        transfer_manager=transfer_manager,
-        pooling_output=_payload(1.0),
-        request=request,
-    )
-
-    assert payload is None
-    assert len(transfer_manager.code_prompt_token_ids["rid-wait"]) == 1
-
-
-def test_llm2audio_vae_async_chunk_partial_chunk_does_not_emit():
-    transfer_manager = _manager(chunk_size=3)
-    request = _req("rid-partial", finished=False)
-
-    first = llm2audio_vae_async_chunk(
-        transfer_manager=transfer_manager,
-        pooling_output=_payload(1.0),
-        request=request,
-    )
-    second = llm2audio_vae_async_chunk(
-        transfer_manager=transfer_manager,
-        pooling_output=_payload(2.0),
-        request=request,
-    )
-
-    assert first is None
-    assert second is None
-    assert len(transfer_manager.code_prompt_token_ids["rid-partial"]) == 2
-
-
-def test_llm2audio_vae_async_chunk_emits_full_chunk():
-    transfer_manager = _manager(chunk_size=2)
-    request_id = "rid-full"
-    request = _req(request_id, finished=False)
-    transfer_manager.code_prompt_token_ids[request_id].append(_patch(1.0))
-
-    payload = llm2audio_vae_async_chunk(
-        transfer_manager=transfer_manager,
-        pooling_output=_payload(2.0),
-        request=request,
-    )
-
-    assert payload is not None
-    assert payload["codes"]["audio"] == [0]
-    assert payload["meta"]["finished"].item() is False
-    assert payload["finished"].item() is False
-    assert payload["stream_finished"].item() is False
-    assert payload[KEY_REQUEST_ID] == request_id
-    assert payload["code_predictor_codes"] == [0]
-    assert payload["ming_latent_patches"].shape == (2, PATCH_SIZE, _LATENT_D)
-    assert payload[MING_EMIT_PATCH_COUNT_KEY] == 2
-    assert payload[MING_LATENT_SHAPE_KEY] == (2, PATCH_SIZE, _LATENT_D)
-    assert payload[MING_ESTIMATED_BYTES_KEY] == int(
-        payload["ming_latent_patches"].numel() * payload["ming_latent_patches"].element_size()
-    )
-    assert payload[MING_ESTIMATED_BYTES_KEY] > 0
-    assert payload[MING_FINAL_FLUSH_KEY] is False
-    assert torch.allclose(payload["ming_latent_patches"][0], _patch(1.0))
-    assert torch.allclose(payload["ming_latent_patches"][1], _patch(2.0))
-    assert transfer_manager.request_payload[request_id]["_ming_async_state"]["seen_patch_len"] == 2
-
-
-def test_llm2audio_vae_async_chunk_multi_request_interleaving_has_no_state_bleed():
-    transfer_manager = _manager(chunk_size=2)
-    req_a = _req("rid-a", finished=False)
-    req_b = _req("rid-b", finished=False)
-
-    assert (
-        llm2audio_vae_async_chunk(transfer_manager=transfer_manager, pooling_output=_payload(1.0), request=req_a)
-        is None
-    )
-    assert (
-        llm2audio_vae_async_chunk(transfer_manager=transfer_manager, pooling_output=_payload(10.0), request=req_b)
-        is None
-    )
-
-    payload_a = llm2audio_vae_async_chunk(
-        transfer_manager=transfer_manager,
-        pooling_output=_payload(2.0),
-        request=req_a,
-    )
-    assert payload_a is not None
-    assert payload_a[KEY_REQUEST_ID] == "rid-a"
-    assert torch.allclose(payload_a["ming_latent_patches"][0], _patch(1.0))
-    assert torch.allclose(payload_a["ming_latent_patches"][1], _patch(2.0))
-
-    assert len(transfer_manager.code_prompt_token_ids["rid-b"]) == 1
-
-    payload_b = llm2audio_vae_async_chunk(
-        transfer_manager=transfer_manager,
-        pooling_output=_payload(20.0),
-        request=req_b,
-    )
-    assert payload_b is not None
-    assert payload_b[KEY_REQUEST_ID] == "rid-b"
-    assert torch.allclose(payload_b["ming_latent_patches"][0], _patch(10.0))
-    assert torch.allclose(payload_b["ming_latent_patches"][1], _patch(20.0))
-
-    assert transfer_manager.request_payload["rid-a"]["_ming_async_state"]["seen_patch_len"] == 2
-    assert transfer_manager.request_payload["rid-b"]["_ming_async_state"]["seen_patch_len"] == 2
-
-
-def test_llm2audio_vae_async_chunk_finish_after_full_chunk_only_emits_eof():
-    transfer_manager = _manager(chunk_size=2)
-    request_id = "rid-drain"
-    request = _req(request_id, finished=False)
-    transfer_manager.code_prompt_token_ids[request_id].append(_patch(1.0))
-
-    payload = llm2audio_vae_async_chunk(
-        transfer_manager=transfer_manager,
-        pooling_output=_payload(2.0),
-        request=request,
-    )
-
-    assert payload is not None
-    assert transfer_manager.request_payload[request_id]["_ming_async_state"]["seen_patch_len"] == 2
-
-    finish_payload = llm2audio_vae_async_chunk(
-        transfer_manager=transfer_manager,
-        pooling_output=None,
-        request=_req(request_id, finished=True),
-    )
-
-    assert finish_payload == {
-        "codes": {"audio": []},
-        "meta": {"finished": torch.tensor(True, dtype=torch.bool)},
-        "code_predictor_codes": [],
-        "finished": torch.tensor(True, dtype=torch.bool),
-        "stream_finished": torch.tensor(True, dtype=torch.bool),
-        "ming_chunk_id": 0,
-        KEY_REQUEST_ID: request_id,
-        MING_EMIT_PATCH_COUNT_KEY: 0,
-        MING_LATENT_SHAPE_KEY: None,
-        MING_ESTIMATED_BYTES_KEY: 0,
-        MING_FINAL_FLUSH_KEY: True,
-    }
-
-
-def test_llm2audio_vae_async_chunk_flushes_tail_on_finish_without_new_patch():
-    transfer_manager = _manager(chunk_size=3)
-    request_id = "rid-tail"
-    request = _req(request_id, finished=True)
-    transfer_manager.code_prompt_token_ids[request_id] = [
-        _patch(1.0),
-        _patch(2.0),
-    ]
-
-    payload = llm2audio_vae_async_chunk(
-        transfer_manager=transfer_manager,
-        pooling_output=None,
-        request=request,
-    )
-
-    assert payload is not None
-    assert payload["codes"]["audio"] == [0]
-    assert payload["meta"]["finished"].item() is True
-    assert payload["finished"].item() is True
-    assert payload["stream_finished"].item() is True
-    assert payload[KEY_REQUEST_ID] == request_id
-    assert payload["ming_latent_patches"].shape == (2, PATCH_SIZE, _LATENT_D)
-    assert payload[MING_EMIT_PATCH_COUNT_KEY] == 2
-    assert payload[MING_LATENT_SHAPE_KEY] == (2, PATCH_SIZE, _LATENT_D)
-    assert payload[MING_ESTIMATED_BYTES_KEY] > 0
-    assert payload[MING_FINAL_FLUSH_KEY] is True
-    assert torch.allclose(payload["ming_latent_patches"][0], _patch(1.0))
-    assert torch.allclose(payload["ming_latent_patches"][1], _patch(2.0))
-
-
-def test_llm2audio_vae_async_chunk_final_flush_emits_partial_chunk_with_new_patch():
-    transfer_manager = _manager(chunk_size=3)
-    request_id = "rid-tail-new"
-
-    transfer_manager.code_prompt_token_ids[request_id].append(_patch(1.0))
-    payload = llm2audio_vae_async_chunk(
-        transfer_manager=transfer_manager,
-        pooling_output=_payload(2.0, decode_step=7, stop_reason="stop_head"),
-        request=_req(request_id, finished=True),
-    )
-
-    assert payload is not None
-    assert payload["codes"]["audio"] == [0]
-    assert payload["meta"]["finished"].item() is True
-    assert payload["finished"].item() is True
-    assert payload["stream_finished"].item() is True
-    assert payload[MING_EMIT_PATCH_COUNT_KEY] == 2
-    assert payload[MING_FINAL_FLUSH_KEY] is True
-    assert payload[MING_FINAL_DECODE_STEP_KEY] == 7
-    assert payload[MING_STOP_REASON_KEY] == "stop_head"
-    assert torch.allclose(payload["ming_latent_patches"][0], _patch(1.0))
-    assert torch.allclose(payload["ming_latent_patches"][1], _patch(2.0))
-
-
-def test_llm2audio_vae_async_chunk_emits_eof_when_finished_without_frames():
-    transfer_manager = _manager(chunk_size=2)
-    request = _req("rid-eof", finished=True)
-
-    payload = llm2audio_vae_async_chunk(
-        transfer_manager=transfer_manager,
-        pooling_output=None,
-        request=request,
-    )
-
-    assert payload == {
-        "codes": {"audio": []},
-        "meta": {"finished": torch.tensor(True, dtype=torch.bool)},
-        "code_predictor_codes": [],
-        "finished": torch.tensor(True, dtype=torch.bool),
-        "stream_finished": torch.tensor(True, dtype=torch.bool),
-        "ming_chunk_id": 0,
-        KEY_REQUEST_ID: "rid-eof",
-        MING_EMIT_PATCH_COUNT_KEY: 0,
-        MING_LATENT_SHAPE_KEY: None,
-        MING_ESTIMATED_BYTES_KEY: 0,
-        MING_FINAL_FLUSH_KEY: True,
-    }
-
-
-def test_llm2audio_vae_async_chunk_zero_latent_final_flush_returns_empty_payload_not_error():
-    transfer_manager = _manager(chunk_size=2)
-
-    payload = llm2audio_vae_async_chunk(
-        transfer_manager=transfer_manager,
-        pooling_output={
-            "ming_has_patch": torch.tensor([False]),
-            "ming_latent_patch": torch.zeros((1, PATCH_SIZE, _LATENT_D), dtype=torch.float32),
-        },
-        request=_req("rid-zero-final", finished=True),
-    )
-
-    assert payload == {
-        "codes": {"audio": []},
-        "meta": {"finished": torch.tensor(True, dtype=torch.bool)},
-        "code_predictor_codes": [],
-        "finished": torch.tensor(True, dtype=torch.bool),
-        "stream_finished": torch.tensor(True, dtype=torch.bool),
-        "ming_chunk_id": 0,
-        KEY_REQUEST_ID: "rid-zero-final",
-        MING_EMIT_PATCH_COUNT_KEY: 0,
-        MING_LATENT_SHAPE_KEY: None,
-        MING_ESTIMATED_BYTES_KEY: 0,
-        MING_FINAL_FLUSH_KEY: True,
-    }
-
-
-def test_llm2audio_vae_async_chunk_rejects_left_context_config():
-    transfer_manager = _manager(chunk_size=2, left_context=1)
-    request = _req("rid-bad-cfg", finished=False)
-
-    with pytest.raises(
-        ValueError,
-        match="does not support latent_left_context replay.*Got latent_left_context=1",
-    ):
-        llm2audio_vae_async_chunk(
-            transfer_manager=transfer_manager,
-            pooling_output=_payload(1.0),
-            request=request,
-        )
-
-
-def test_llm2audio_vae_async_chunk_rejects_non_positive_chunk_size():
-    transfer_manager = _manager(chunk_size=0, left_context=0)
-
-    with pytest.raises(ValueError, match="Invalid Ming latent_chunk_size=0"):
-        llm2audio_vae_async_chunk(
-            transfer_manager=transfer_manager,
-            pooling_output=_payload(1.0),
-            request=_req("rid-bad-chunk", finished=False),
-        )
-
-
-def test_llm2audio_vae_async_chunk_missing_config_uses_fallback_defaults():
-    transfer_manager = _manager(raw_config={"extra": {}})
-    request_id = "rid-fallback"
-
-    for idx in range(LATENT_CHUNK_SIZE - 1):
-        payload = llm2audio_vae_async_chunk(
-            transfer_manager=transfer_manager,
-            pooling_output=_payload(float(idx + 1)),
-            request=_req(request_id, finished=False),
-        )
-        assert payload is None
-
-    payload = llm2audio_vae_async_chunk(
-        transfer_manager=transfer_manager,
-        pooling_output=_payload(float(LATENT_CHUNK_SIZE)),
-        request=_req(request_id, finished=False),
-    )
-
-    assert payload is not None
-    assert payload[MING_EMIT_PATCH_COUNT_KEY] == LATENT_CHUNK_SIZE
-    assert payload[MING_LATENT_SHAPE_KEY] == (LATENT_CHUNK_SIZE, PATCH_SIZE, _LATENT_D)
-    assert LATENT_LEFT_CONTEXT == 0
-
-
-def test_llm2audio_vae_builds_generation_prompt_from_stage_output():
-    patches = torch.arange(2 * PATCH_SIZE * _LATENT_D, dtype=torch.float32).reshape(2, PATCH_SIZE, _LATENT_D)
-    stage_output = SimpleNamespace(
-        request_id="rid-stage",
-        finished=True,
-        outputs=[
-            SimpleNamespace(
-                multimodal_output={
-                    "ming_has_patch": torch.tensor([True, True]),
-                    "ming_latent_patch": patches,
-                    "ming_decode_step": torch.tensor([26, 27], dtype=torch.int32),
-                    "ming_stop_reason": ("continue", "stop_head"),
-                }
-            )
-        ],
-    )
-    stage = SimpleNamespace(engine_outputs=[stage_output])
-
-    prompts = llm2audio_vae(stage_list=[stage], engine_input_source=[0])
-
-    assert len(prompts) == 1
-    info = prompts[0]["additional_information"]
-    assert info[KEY_REQUEST_ID] == "rid-stage"
-    assert info["finished"].item() is True
-    assert info["ming_latent_patches"].shape == (2, PATCH_SIZE, _LATENT_D)
-    assert torch.allclose(info["ming_latent_patches"], patches)
-    assert info[MING_FINAL_DECODE_STEP_KEY] == 27
-    assert info[MING_STOP_REASON_KEY] == "stop_head"
-
-
-def test_llm2audio_vae_skips_unfinished_stage_output():
-    patch = torch.arange(PATCH_SIZE * _LATENT_D, dtype=torch.float32).reshape(1, PATCH_SIZE, _LATENT_D)
-    stage_output = SimpleNamespace(
-        request_id="rid-unfinished",
-        finished=False,
-        outputs=[
-            SimpleNamespace(
-                multimodal_output={
-                    "ming_has_patch": torch.tensor([True]),
-                    "ming_latent_patch": patch,
-                }
-            )
-        ],
-    )
-    stage = SimpleNamespace(engine_outputs=[stage_output])
-
-    prompts = llm2audio_vae(stage_list=[stage], engine_input_source=[0])
-
-    assert prompts == []
diff --git a/tests/worker/test_ming_tts_runner.py b/tests/worker/test_ming_tts_runner.py
deleted file mode 100644
index b964bf7659b..00000000000
--- a/tests/worker/test_ming_tts_runner.py
+++ /dev/null
@@ -1,676 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from types import SimpleNamespace
-
-import pytest
-import torch
-
-from vllm_omni.model_executor.models.ming_tts.config_ming_tts import (
-    KEY_LATENT_HISTORY,
-    KEY_NEXT_EMBEDS,
-    KEY_PROMPT_LATENTS,
-    KEY_REQUEST_ID,
-    KEY_SPEAKER_EMBEDDING,
-    MingTTSConfig,
-)
-from vllm_omni.model_executor.models.ming_tts.ming_tts import MingTTSForConditionalGeneration
-from vllm_omni.model_executor.models.ming_tts.ming_tts_audio_vae import MingAudioVAEModel
-from vllm_omni.model_executor.models.ming_tts.ming_tts_llm import (
-    MING_STOP_REASON_CONTINUE,
-    MING_STOP_REASON_KEY,
-    MING_STOP_REASON_MAX_DECODE_STEPS,
-    MING_STOP_REASON_STOP_HEAD,
-    MingLLMModel,
-    _resolve_ming_stop_decision,
-)
-from vllm_omni.model_executor.models.output_templates import OmniOutput
-
-pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
-
-
-class DummyBackbone(torch.nn.Module):
-    def __init__(self, hidden_size: int):
-        super().__init__()
-        self.hidden_size = hidden_size
-
-    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
-        ids = input_ids.to(torch.float32).reshape(-1, 1)
-        return ids.repeat(1, self.hidden_size) / 100.0
-
-    def get_input_embeddings(self):
-        return None
-
-    def forward(self, input_ids, positions, intermediate_tensors=None, inputs_embeds=None, **kwargs):
-        del input_ids, positions, intermediate_tensors, kwargs
-        return inputs_embeds
-
-
-class NaNOnSecondDecodeBackbone(DummyBackbone):
-    def __init__(self, hidden_size: int):
-        super().__init__(hidden_size)
-        self.decode_calls = 0
-
-    def forward(self, input_ids, positions, intermediate_tensors=None, inputs_embeds=None, **kwargs):
-        del input_ids, positions, intermediate_tensors, kwargs
-        self.decode_calls += 1
-        if self.decode_calls >= 2:
-            return torch.full_like(inputs_embeds, float("nan"))
-        return inputs_embeds
-
-
-class DummyAggregator(torch.nn.Module):
-    def __init__(self, in_channels: int, llm_input_dim: int, **kwargs):
-        super().__init__()
-        del in_channels, kwargs
-        self.hidden_size = llm_input_dim
-
-    def forward(self, patch: torch.Tensor) -> torch.Tensor:
-        pooled = patch.mean(dim=1)
-        repeats = self.hidden_size // pooled.shape[-1]
-        return pooled.repeat(1, repeats).reshape(pooled.shape[0], 1, self.hidden_size)
-
-
-class DummyFlowLoss(torch.nn.Module):
-    def __init__(self, z_channels: int, llm_cond_dim: int, **kwargs):
-        super().__init__()
-        del z_channels, llm_cond_dim, kwargs
-
-    def sample(self, z, latent_history, cfg, patch_size, sigma, temperature):
-        del latent_history, cfg, sigma, temperature
-        base = z[:, 0, :64]
-        return torch.stack([base + float(i + 1) for i in range(patch_size)], dim=1)
-
-
-class DummyAudioVAE(torch.nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.config = config
-        self.weight = torch.nn.Parameter(torch.tensor(1.0))
-        self.decode_calls: list[dict[str, object]] = []
-
-    def encode_latent(self, waveform: torch.Tensor, waveform_length: torch.Tensor):
-        if waveform.ndim == 2:
-            frames = waveform.shape[-1] // 64
-            latent = waveform[:, : frames * 64].reshape(waveform.shape[0], frames, 64)
-        else:
-            latent = waveform
-        frame_num = torch.full((latent.shape[0],), latent.shape[1], dtype=torch.int32, device=latent.device)
-        return latent.to(torch.float32), frame_num
-
-    def decode(self, latent, past_key_values=None, use_cache=False, stream_state=(None, None, None), last_chunk=False):
-        del use_cache, last_chunk
-        prev_frames = int((past_key_values or {}).get("frames", 0))
-        waveform = latent.sum(dim=-1).reshape(latent.shape[0], -1).to(torch.float32) + prev_frames * 10.0
-        new_stream_state = ("stream", prev_frames + latent.shape[1], tuple(latent.shape))
-        new_past = {"frames": prev_frames + int(latent.shape[1])}
-        self.decode_calls.append(
-            {
-                "stream_state": stream_state,
-                "past_key_values": past_key_values,
-                "latent_shape": tuple(latent.shape),
-            }
-        )
-        return waveform, new_stream_state, new_past
-
-
-class _DummySamplingMetadata:
-    def __init__(self, step: int):
-        self.output_token_ids = [[0] * int(step)]
-
-
-def _make_config() -> MingTTSConfig:
-    audio_cfg = SimpleNamespace(
-        enc_kwargs={"latent_dim": 64, "input_dim": 882, "hop_size": 882},
-        dec_kwargs={"latent_dim": 64, "output_dim": 882},
-        patch_size=4,
-        sample_rate=44100,
-    )
-    cfg = MingTTSConfig(audio_tokenizer_config=audio_cfg)
-    cfg.validate()
-    return cfg
-
-
-def _make_vllm_config(model_stage: str, **hf_overrides):
-    return SimpleNamespace(
-        model_config=SimpleNamespace(hf_config=SimpleNamespace(**hf_overrides), model_stage=model_stage),
-        quant_config=None,
-        device_config=SimpleNamespace(device=torch.device("cpu")),
-    )
-
-
-def _make_runner_for_ming(monkeypatch):
-    import vllm_omni.model_executor.models.ming_tts.config_ming_tts as cfg_mod
-    import vllm_omni.model_executor.models.ming_tts.ming_tts as wrapper_mod
-    import vllm_omni.model_executor.models.ming_tts.ming_tts_audio_vae as vae_mod
-    import vllm_omni.model_executor.models.ming_tts.ming_tts_llm as llm_mod
-
-    cfg = _make_config()
-    monkeypatch.setattr(cfg_mod.MingTTSConfig, "from_hf_config", classmethod(lambda cls, hf: cfg))
-
-    monkeypatch.setattr(llm_mod, "init_vllm_registered_model", lambda **kwargs: DummyBackbone(cfg.llm_hidden_size))
-    monkeypatch.setattr(llm_mod, "Aggregator", DummyAggregator)
-    monkeypatch.setattr(llm_mod, "FlowLoss", DummyFlowLoss)
-    monkeypatch.setattr(wrapper_mod, "AudioVAE", DummyAudioVAE, raising=False)
-    monkeypatch.setattr(vae_mod, "AudioVAE", DummyAudioVAE)
-
-    llm_model = MingLLMModel(vllm_config=_make_vllm_config("llm"))
-    vae_model = MingAudioVAEModel(vllm_config=_make_vllm_config("audio_vae"))
-
-    def _wrapper_loader(*, architectures, **kwargs):
-        arch = architectures[0]
-        if arch == "MingLLMModel":
-            return llm_model
-        if arch == "MingAudioVAEModel":
-            return vae_model
-        raise AssertionError(f"unexpected architecture {arch}")
-
-    monkeypatch.setattr(wrapper_mod, "init_vllm_registered_model", _wrapper_loader)
-
-    stage1 = MingTTSForConditionalGeneration(vllm_config=_make_vllm_config("llm"))
-    stage2 = MingTTSForConditionalGeneration(vllm_config=_make_vllm_config("audio_vae"))
-
-    return SimpleNamespace(config=cfg, llm=llm_model, vae=vae_model, stage1=stage1, stage2=stage2)
-
-
-def test_ming_llm_step_shapes(monkeypatch):
-    runner = _make_runner_for_ming(monkeypatch)
-    cfg = runner.config
-
-    prefill_ids = torch.tensor(
-        [1, cfg.audio_start_token_id, cfg.audio_dummy_token_id, cfg.audio_dummy_token_id, cfg.audio_end_token_id, 2],
-        dtype=torch.long,
-    )
-    prefill_embeds = torch.zeros((prefill_ids.shape[0], cfg.llm_hidden_size), dtype=torch.float32)
-    prompt_latents = torch.arange(8 * 64, dtype=torch.float32).reshape(1, 8, 64)
-
-    _, prefill_out_embeds, prefill_info = runner.stage1.preprocess_input(
-        prefill_ids,
-        prefill_embeds,
-        **{KEY_PROMPT_LATENTS: prompt_latents},
-        **{KEY_REQUEST_ID: "req-1"},
-    )
-
-    assert prefill_info[KEY_LATENT_HISTORY].shape == (32, 64)
-    assert torch.allclose(prefill_info[KEY_LATENT_HISTORY][-8:], prompt_latents.reshape(8, 64))
-    assert torch.count_nonzero(prefill_out_embeds[1]).item() > 0
-    assert torch.count_nonzero(prefill_out_embeds[2]).item() > 0
-
-    decode_ids = torch.tensor([cfg.audio_dummy_token_id], dtype=torch.long)
-    decode_embeds = torch.zeros((1, cfg.llm_hidden_size), dtype=torch.float32)
-    _, decode_embeds, decode_info = runner.stage1.preprocess_input(
-        decode_ids,
-        decode_embeds,
-        **prefill_info,
-    )
-
-    output = runner.llm.forward(
-        decode_ids,
-        positions=torch.tensor([0], dtype=torch.long),
-        inputs_embeds=decode_embeds,
-        model_intermediate_buffer=[decode_info],
-        seq_token_counts=[1],
-    )
-    mm = output.multimodal_outputs
-
-    assert mm["ming_latent_patch"].shape == (1, 4, 64)
-    assert mm["ming_next_embeds"].shape == (1, 1, cfg.llm_hidden_size)
-    assert mm["ming_new_history"].shape == (1, 32, 64)
-
-    update = runner.stage1.postprocess(output.text_hidden_states, multimodal_outputs=mm, **decode_info)
-    assert update[KEY_LATENT_HISTORY].shape == (1, 32, 64)
-    assert torch.allclose(update[KEY_LATENT_HISTORY][0, -4:], mm["ming_latent_patch"][0].cpu())
-    assert update[KEY_NEXT_EMBEDS].shape == (1, 1, cfg.llm_hidden_size)
-
-
-def test_ming_prefill_injects_speaker_into_dense_placeholder(monkeypatch):
-    import vllm_omni.model_executor.models.ming_tts.config_ming_tts as cfg_mod
-    import vllm_omni.model_executor.models.ming_tts.ming_tts as wrapper_mod
-    import vllm_omni.model_executor.models.ming_tts.ming_tts_llm as llm_mod
-
-    cfg = _make_config()
-    monkeypatch.setattr(cfg_mod.MingTTSConfig, "from_hf_config", classmethod(lambda cls, hf: cfg))
-    monkeypatch.setattr(llm_mod, "init_vllm_registered_model", lambda **kwargs: DummyBackbone(cfg.llm_hidden_size))
-    monkeypatch.setattr(llm_mod, "Aggregator", DummyAggregator)
-    monkeypatch.setattr(llm_mod, "FlowLoss", DummyFlowLoss)
-    monkeypatch.setattr(
-        wrapper_mod, "init_vllm_registered_model", lambda **kwargs: MingLLMModel(vllm_config=_make_vllm_config("llm"))
-    )
-
-    vision_start_token_id = 32001
-    stage1 = MingTTSForConditionalGeneration(
-        vllm_config=_make_vllm_config("llm", vision_start_token_id=vision_start_token_id)
-    )
-
-    input_ids = torch.tensor(
-        [
-            1,
-            vision_start_token_id,
-            77,
-            cfg.audio_start_token_id,
-            cfg.audio_dummy_token_id,
-            cfg.audio_end_token_id,
-        ],
-        dtype=torch.long,
-    )
-    input_embeds = torch.zeros((input_ids.shape[0], cfg.llm_hidden_size), dtype=torch.float32)
-    baseline_embeds = stage1.model.embed_input_ids(input_ids).clone()
-    speaker = torch.ones((192,), dtype=torch.float32)
-
-    _, out_embeds, _ = stage1.preprocess_input(
-        input_ids,
-        input_embeds,
-        **{KEY_SPEAKER_EMBEDDING: speaker},
-    )
-
-    assert torch.count_nonzero(out_embeds[2]).item() > 0
-    assert not torch.allclose(out_embeds[2], baseline_embeds[2])
-    assert torch.allclose(out_embeds[3], baseline_embeds[3])
-
-
-def test_ming_prefill_injects_multiple_speakers_into_multiple_dense_placeholders(monkeypatch):
-    import vllm_omni.model_executor.models.ming_tts.config_ming_tts as cfg_mod
-    import vllm_omni.model_executor.models.ming_tts.ming_tts as wrapper_mod
-    import vllm_omni.model_executor.models.ming_tts.ming_tts_llm as llm_mod
-
-    cfg = _make_config()
-    monkeypatch.setattr(cfg_mod.MingTTSConfig, "from_hf_config", classmethod(lambda cls, hf: cfg))
-    monkeypatch.setattr(llm_mod, "init_vllm_registered_model", lambda **kwargs: DummyBackbone(cfg.llm_hidden_size))
-    monkeypatch.setattr(llm_mod, "Aggregator", DummyAggregator)
-    monkeypatch.setattr(llm_mod, "FlowLoss", DummyFlowLoss)
-    monkeypatch.setattr(
-        wrapper_mod, "init_vllm_registered_model", lambda **kwargs: MingLLMModel(vllm_config=_make_vllm_config("llm"))
-    )
-
-    vision_start_token_id = 32001
-    stage1 = MingTTSForConditionalGeneration(
-        vllm_config=_make_vllm_config("llm", vision_start_token_id=vision_start_token_id)
-    )
-
-    input_ids = torch.tensor(
-        [
-            1,
-            vision_start_token_id,
-            77,
-            2,
-            vision_start_token_id,
-            88,
-            cfg.audio_start_token_id,
-            cfg.audio_dummy_token_id,
-            cfg.audio_end_token_id,
-        ],
-        dtype=torch.long,
-    )
-    input_embeds = torch.zeros((input_ids.shape[0], cfg.llm_hidden_size), dtype=torch.float32)
-    baseline_embeds = stage1.model.embed_input_ids(input_ids).clone()
-    speaker = torch.ones((2, 192), dtype=torch.float32)
-
-    _, out_embeds, _ = stage1.preprocess_input(
-        input_ids,
-        input_embeds,
-        **{KEY_SPEAKER_EMBEDDING: speaker},
-    )
-
-    assert torch.count_nonzero(out_embeds[2]).item() > 0
-    assert torch.count_nonzero(out_embeds[5]).item() > 0
-    assert not torch.allclose(out_embeds[2], baseline_embeds[2])
-    assert not torch.allclose(out_embeds[5], baseline_embeds[5])
-    assert torch.allclose(out_embeds[6], baseline_embeds[6])
-
-
-def test_ming_stop_logic_no_stop_before_min_required_decode_steps(monkeypatch):
-    runner = _make_runner_for_ming(monkeypatch)
-    cfg = runner.config
-
-    def _high_stop(_hidden_states):
-        return torch.tensor([[0.0, 10.0]], dtype=torch.float32)
-
-    monkeypatch.setattr(runner.llm.stop_head, "forward", _high_stop)
-    hidden = torch.zeros((1, cfg.llm_hidden_size), dtype=torch.float32)
-
-    stop_reason, stop_now, force_stop, min_required_decode_steps, next_token_id = _resolve_ming_stop_decision(
-        step=4,
-        stop_prob=1.0,
-        stop_threshold=float(cfg.stop_head_threshold),
-        min_stop_step=int(cfg.stop_head_min_steps),
-        min_decode_steps=7,
-        max_decode_steps=int(cfg.max_decode_steps),
-        audio_dummy_token_id=int(cfg.audio_dummy_token_id),
-        text_eos_token_id=int(cfg.text_eos_token_id),
-    )
-    assert stop_reason == MING_STOP_REASON_CONTINUE
-    assert stop_now is False
-    assert force_stop is False
-    assert min_required_decode_steps == 7
-    assert next_token_id == cfg.audio_dummy_token_id
-
-    logits_step3 = runner.llm.compute_logits(
-        OmniOutput(
-            text_hidden_states=hidden,
-            multimodal_outputs={"ming_min_decode_steps": torch.tensor([7], dtype=torch.int32)},
-        ),
-        _DummySamplingMetadata(step=3),
-    )
-    out_step3 = runner.llm.sample(logits_step3, _DummySamplingMetadata(step=3))
-    assert int(out_step3.sampled_token_ids[0, 0]) == cfg.audio_dummy_token_id
-    assert torch.isfinite(logits_step3[0, int(cfg.audio_dummy_token_id)])
-    assert not torch.isfinite(logits_step3[0, int(cfg.text_eos_token_id)])
-
-
-def test_ming_stop_logic_stop_head_inside_window(monkeypatch):
-    runner = _make_runner_for_ming(monkeypatch)
-    cfg = runner.config
-
-    def _high_stop(_hidden_states):
-        return torch.tensor([[0.0, 10.0]], dtype=torch.float32)
-
-    monkeypatch.setattr(runner.llm.stop_head, "forward", _high_stop)
-    hidden = torch.zeros((1, cfg.llm_hidden_size), dtype=torch.float32)
-
-    stop_reason, stop_now, force_stop, min_required_decode_steps, next_token_id = _resolve_ming_stop_decision(
-        step=4,
-        stop_prob=1.0,
-        stop_threshold=float(cfg.stop_head_threshold),
-        min_stop_step=int(cfg.stop_head_min_steps),
-        min_decode_steps=0,
-        max_decode_steps=int(cfg.max_decode_steps),
-        audio_dummy_token_id=int(cfg.audio_dummy_token_id),
-        text_eos_token_id=int(cfg.text_eos_token_id),
-    )
-    assert stop_reason == MING_STOP_REASON_STOP_HEAD
-    assert stop_now is True
-    assert force_stop is False
-    assert min_required_decode_steps == int(cfg.stop_head_min_steps) + 1
-    assert next_token_id == cfg.text_eos_token_id
-
-    logits_step4 = runner.llm.compute_logits(hidden, _DummySamplingMetadata(step=4))
-    out_step4 = runner.llm.sample(logits_step4, _DummySamplingMetadata(step=4))
-    assert int(out_step4.sampled_token_ids[0, 0]) == cfg.text_eos_token_id
-
-
-def test_ming_stop_logic_rejects_impossible_decode_window(monkeypatch):
-    runner = _make_runner_for_ming(monkeypatch)
-    cfg = runner.config
-    hidden = torch.zeros((1, cfg.llm_hidden_size), dtype=torch.float32)
-
-    with pytest.raises(RuntimeError, match="Invalid Ming decode window"):
-        runner.llm.compute_logits(
-            OmniOutput(
-                text_hidden_states=hidden,
-                multimodal_outputs={
-                    "ming_min_decode_steps": torch.tensor([7], dtype=torch.int32),
-                    "ming_max_decode_steps": torch.tensor([5], dtype=torch.int32),
-                },
-            ),
-            _DummySamplingMetadata(step=4),
-        )
-
-
-def test_ming_stop_logic_max_decode_guard(monkeypatch):
-    runner = _make_runner_for_ming(monkeypatch)
-    cfg = runner.config
-    cfg.max_decode_steps = 5
-
-    def _high_stop(_hidden_states):
-        return torch.tensor([[0.0, 10.0]], dtype=torch.float32)
-
-    monkeypatch.setattr(runner.llm.stop_head, "forward", _high_stop)
-    hidden = torch.zeros((1, cfg.llm_hidden_size), dtype=torch.float32)
-
-    stop_reason, stop_now, force_stop, min_required_decode_steps, next_token_id = _resolve_ming_stop_decision(
-        step=4,
-        stop_prob=1.0,
-        stop_threshold=float(cfg.stop_head_threshold),
-        min_stop_step=int(cfg.stop_head_min_steps),
-        min_decode_steps=0,
-        max_decode_steps=int(cfg.max_decode_steps),
-        audio_dummy_token_id=int(cfg.audio_dummy_token_id),
-        text_eos_token_id=int(cfg.text_eos_token_id),
-    )
-    assert stop_reason == MING_STOP_REASON_MAX_DECODE_STEPS
-    assert stop_now is True
-    assert force_stop is True
-    assert min_required_decode_steps == int(cfg.stop_head_min_steps) + 1
-    assert next_token_id == cfg.text_eos_token_id
-
-    logits = runner.llm.compute_logits(hidden, _DummySamplingMetadata(step=4))
-    out = runner.llm.sample(logits, _DummySamplingMetadata(step=4))
-    assert int(out.sampled_token_ids[0, 0]) == cfg.text_eos_token_id
-
-
-def test_ming_compute_logits_uses_forward_stop_prob_payload(monkeypatch):
-    runner = _make_runner_for_ming(monkeypatch)
-    cfg = runner.config
-
-    def _low_stop(_hidden_states):
-        return torch.tensor([[10.0, 0.0]], dtype=torch.float32)
-
-    monkeypatch.setattr(runner.llm.stop_head, "forward", _low_stop)
-    hidden = torch.zeros((1, cfg.llm_hidden_size), dtype=torch.float32)
-
-    logits = runner.llm.compute_logits(
-        OmniOutput(
-            text_hidden_states=hidden,
-            multimodal_outputs={
-                "ming_stop_prob": torch.tensor([1.0], dtype=torch.float32),
-                "ming_decode_step": torch.tensor([4], dtype=torch.int32),
-            },
-        ),
-        _DummySamplingMetadata(step=4),
-    )
-    out = runner.llm.sample(logits, _DummySamplingMetadata(step=4))
-    assert int(out.sampled_token_ids[0, 0]) == cfg.text_eos_token_id
-
-
-def test_ming_compute_logits_uses_cached_forward_stop_prob_for_tensor_path(monkeypatch):
-    runner = _make_runner_for_ming(monkeypatch)
-    cfg = runner.config
-
-    def _low_stop(_hidden_states):
-        return torch.tensor([[10.0, 0.0]], dtype=torch.float32)
-
-    monkeypatch.setattr(runner.llm.stop_head, "forward", _low_stop)
-    runner.llm._last_sample_stop_probs = torch.tensor([1.0], dtype=torch.float32)
-    runner.llm._last_sample_decode_steps = torch.tensor([4], dtype=torch.int32)
-    hidden = torch.zeros((1, cfg.llm_hidden_size), dtype=torch.float32)
-
-    logits = runner.llm.compute_logits(hidden, _DummySamplingMetadata(step=4))
-    out = runner.llm.sample(logits, _DummySamplingMetadata(step=4))
-    assert int(out.sampled_token_ids[0, 0]) == cfg.text_eos_token_id
-
-
-def test_ming_forward_exposes_stop_reason_in_outputs_and_pending_state(monkeypatch):
-    runner = _make_runner_for_ming(monkeypatch)
-    cfg = runner.config
-
-    def _low_stop(_hidden_states):
-        return torch.tensor([[10.0, 0.0]], dtype=torch.float32)
-
-    monkeypatch.setattr(runner.llm.stop_head, "forward", _low_stop)
-    decode_ids = torch.tensor([cfg.audio_dummy_token_id], dtype=torch.long)
-    decode_embeds = torch.zeros((1, cfg.llm_hidden_size), dtype=torch.float32)
-    output = runner.llm.forward(
-        decode_ids,
-        positions=torch.tensor([0], dtype=torch.long),
-        inputs_embeds=decode_embeds,
-        model_intermediate_buffer=[
-            {
-                KEY_LATENT_HISTORY: torch.zeros((cfg.history_patch_size, cfg.latent_dim), dtype=torch.float32),
-                KEY_REQUEST_ID: "req-stop-reason",
-            }
-        ],
-        seq_token_counts=[1],
-    )
-
-    stop_reason_codes = output.multimodal_outputs[MING_STOP_REASON_KEY]
-    assert isinstance(stop_reason_codes, torch.Tensor)
-    assert int(stop_reason_codes.reshape(-1)[0].item()) == 0
-    pending = runner.llm.pop_postprocess_update("req-stop-reason")
-    assert pending[MING_STOP_REASON_KEY] == MING_STOP_REASON_CONTINUE
-
-
-def test_ming_postprocess_forwards_stop_reason(monkeypatch):
-    runner = _make_runner_for_ming(monkeypatch)
-    cfg = runner.config
-
-    decode_ids = torch.tensor([cfg.audio_dummy_token_id], dtype=torch.long)
-    decode_embeds = torch.zeros((1, cfg.llm_hidden_size), dtype=torch.float32)
-    decode_info = {
-        KEY_LATENT_HISTORY: torch.zeros((cfg.history_patch_size, cfg.latent_dim), dtype=torch.float32),
-        KEY_REQUEST_ID: "req-postprocess-stop-reason",
-    }
-
-    output = runner.llm.forward(
-        decode_ids,
-        positions=torch.tensor([0], dtype=torch.long),
-        inputs_embeds=decode_embeds,
-        model_intermediate_buffer=[decode_info],
-        seq_token_counts=[1],
-    )
-    update = runner.stage1.postprocess(output.text_hidden_states, **decode_info)
-
-    assert update[MING_STOP_REASON_KEY] == MING_STOP_REASON_CONTINUE
-
-
-def test_ming_vae_incremental_decode(monkeypatch):
-    runner = _make_runner_for_ming(monkeypatch)
-
-    chunk_a = torch.stack(
-        [
-            torch.ones((4, 64), dtype=torch.float32),
-            torch.full((4, 64), 2.0, dtype=torch.float32),
-        ],
-        dim=0,
-    )
-    out_a = runner.stage2.forward(
-        model_intermediate_buffer=[
-            {
-                "ming_latent_patches": chunk_a,
-                "finished": torch.tensor(False),
-                "stream_finished": torch.tensor(False),
-                KEY_REQUEST_ID: "r1",
-            }
-        ]
-    )
-    wav_a = out_a.multimodal_outputs["model_outputs"][0]
-    state_a = runner.vae._stream_state["r1"]
-    past_a = runner.vae._past_key_values["r1"]
-
-    chunk_b = torch.full((1, 4, 64), 3.0, dtype=torch.float32)
-    out_b = runner.stage2.forward(
-        model_intermediate_buffer=[
-            {
-                "ming_latent_patches": chunk_b,
-                "finished": torch.tensor(False),
-                "stream_finished": torch.tensor(False),
-                KEY_REQUEST_ID: "r1",
-            }
-        ]
-    )
-    wav_b = out_b.multimodal_outputs["model_outputs"][0]
-    state_b = runner.vae._stream_state["r1"]
-
-    assert len(runner.vae.audio.decode_calls) == 3
-    assert runner.vae.audio.decode_calls[1]["latent_shape"] == (1, 4, 64)
-    assert runner.vae.audio.decode_calls[1]["past_key_values"] == {"frames": 4}
-    assert runner.vae.audio.decode_calls[2]["stream_state"] == state_a
-    assert runner.vae.audio.decode_calls[2]["past_key_values"] == past_a
-    assert state_b != state_a
-
-    expected_a = torch.cat(
-        [
-            chunk_a[0].sum(dim=-1),
-            chunk_a[1].sum(dim=-1) + 4 * 10.0,
-        ]
-    )
-    expected_b = chunk_b[0].sum(dim=-1) + 8 * 10.0
-    assert torch.allclose(wav_a, expected_a)
-    assert torch.allclose(wav_b, expected_b)
-    assert torch.allclose(torch.cat([wav_a, wav_b]), torch.cat([expected_a, expected_b]))
-
-
-def test_ming_vae_finalizes_when_stream_finished_is_absent(monkeypatch):
-    runner = _make_runner_for_ming(monkeypatch)
-    chunk = torch.stack(
-        [
-            torch.ones((4, 64), dtype=torch.float32),
-            torch.full((4, 64), 2.0, dtype=torch.float32),
-        ],
-        dim=0,
-    )
-
-    out = runner.stage2.forward(
-        model_intermediate_buffer=[
-            {
-                "ming_latent_patches": chunk,
-                "finished": torch.tensor(True),
-                KEY_REQUEST_ID: "r-sequential",
-            }
-        ]
-    )
-
-    wav = out.multimodal_outputs["model_outputs"][0]
-    assert wav.numel() > 0
-    assert "r-sequential" not in runner.vae._stream_state
-    assert "r-sequential" not in runner.vae._past_key_values
-
-
-def test_ming_recurrent_backbone_can_poison_hidden_states_before_flowloss(monkeypatch):
-    import vllm_omni.model_executor.models.ming_tts.config_ming_tts as cfg_mod
-    import vllm_omni.model_executor.models.ming_tts.ming_tts as wrapper_mod
-    import vllm_omni.model_executor.models.ming_tts.ming_tts_audio_vae as vae_mod
-    import vllm_omni.model_executor.models.ming_tts.ming_tts_llm as llm_mod
-
-    cfg = _make_config()
-    monkeypatch.setattr(cfg_mod.MingTTSConfig, "from_hf_config", classmethod(lambda cls, hf: cfg))
-    monkeypatch.setattr(
-        llm_mod, "init_vllm_registered_model", lambda **kwargs: NaNOnSecondDecodeBackbone(cfg.llm_hidden_size)
-    )
-    monkeypatch.setattr(llm_mod, "Aggregator", DummyAggregator)
-    monkeypatch.setattr(llm_mod, "FlowLoss", DummyFlowLoss)
-    monkeypatch.setattr(vae_mod, "AudioVAE", DummyAudioVAE)
-
-    llm_model = MingLLMModel(vllm_config=_make_vllm_config("llm"))
-
-    def _wrapper_loader(*, architectures, **kwargs):
-        arch = architectures[0]
-        if arch == "MingLLMModel":
-            return llm_model
-        raise AssertionError(f"unexpected architecture {arch}")
-
-    monkeypatch.setattr(wrapper_mod, "init_vllm_registered_model", _wrapper_loader)
-    stage1 = MingTTSForConditionalGeneration(vllm_config=_make_vllm_config("llm"))
-
-    decode_ids = torch.tensor([cfg.audio_dummy_token_id], dtype=torch.long)
-    decode_embeds = torch.zeros((1, cfg.llm_hidden_size), dtype=torch.float32)
-    decode_info = {
-        KEY_LATENT_HISTORY: torch.zeros((cfg.history_patch_size, cfg.latent_dim), dtype=torch.float32),
-        KEY_REQUEST_ID: "req-nan",
-    }
-
-    _, decode_embeds, decode_info = stage1.preprocess_input(decode_ids, decode_embeds, **decode_info)
-    output = llm_model.forward(
-        decode_ids,
-        positions=torch.tensor([0], dtype=torch.long),
-        inputs_embeds=decode_embeds,
-        model_intermediate_buffer=[decode_info],
-        seq_token_counts=[1],
-    )
-    mm = output.multimodal_outputs
-    assert torch.isfinite(mm["ming_next_embeds"]).all()
-
-    update = stage1.postprocess(output.text_hidden_states, multimodal_outputs=mm, **decode_info)
-    _, next_decode_embeds, next_decode_info = stage1.preprocess_input(
-        decode_ids,
-        torch.zeros((1, cfg.llm_hidden_size), dtype=torch.float32),
-        **update,
-    )
-    assert torch.isfinite(next_decode_embeds).all()
-
-    with pytest.raises(RuntimeError, match="Non-finite z_diff_cond before FlowLoss.sample"):
-        llm_model.forward(
-            decode_ids,
-            positions=torch.tensor([1], dtype=torch.long),
-            inputs_embeds=next_decode_embeds,
-            model_intermediate_buffer=[next_decode_info],
-            seq_token_counts=[1],
-        )
diff --git a/vllm_omni/engine/arg_utils.py b/vllm_omni/engine/arg_utils.py
index c31f812e848..32053b28cf7 100644
--- a/vllm_omni/engine/arg_utils.py
+++ b/vllm_omni/engine/arg_utils.py
@@ -36,9 +36,7 @@ def _register_omni_hf_configs() -> None:
         from transformers import AutoConfig
 
         from vllm_omni.model_executor.models.cosyvoice3.config import CosyVoice3Config
-        from vllm_omni.model_executor.models.ming_tts.configuration_ming_dense import (
-            MingDenseConfig,
-        )
+        from vllm_omni.model_executor.models.ming_tts.config_ming_tts import MingDenseConfig
         from vllm_omni.model_executor.models.omnivoice.config import OmniVoiceConfig
         from vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts import (
             Qwen3TTSConfig,
diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py
index 37a1c86c55b..4e2ba0e9b79 100644
--- a/vllm_omni/entrypoints/openai/serving_speech.py
+++ b/vllm_omni/entrypoints/openai/serving_speech.py
@@ -101,7 +101,7 @@
 _TTS_MAX_INSTRUCTIONS_LENGTH = 500
 _TTS_MAX_NEW_TOKENS_MIN = 1
 _TTS_MAX_NEW_TOKENS_MAX = 4096
-_MING_DEFAULT_PROMPT = "Please generate speech based on the following description.\n"
+_MING_DEFAULT_PROMPT = MING_DEFAULT_PROMPT
 
 
 def _create_wav_header(sample_rate: int, num_channels: int = 1, bits_per_sample: int = 16) -> bytes:
@@ -1738,7 +1738,7 @@ def _build_ming_dense_prompt(
         from transformers import AutoTokenizer
 
         from vllm_omni.model_executor.models.ming_tts.config_ming_tts import KEY_MAX_DECODE_STEPS
-        from vllm_omni.model_executor.models.ming_tts.prompt_builder import build_ming_dense_prompt
+        from vllm_omni.model_executor.models.ming_tts.prompt_utils import build_ming_dense_prompt
 
         if self._tts_tokenizer is None:
             model_name = self.engine_client.model_config.model
@@ -2945,14 +2945,4 @@ async def _run_item(idx: int, req: OpenAICreateSpeechRequest) -> SpeechBatchItem
         )
 
 
-ServingSpeech = OmniOpenAIServingSpeech
-     return BatchSpeechResponse(
-            id=batch_id,
-            results=final_results,
-            total=len(final_results),
-            succeeded=succeeded,
-            failed=len(final_results) - succeeded,
-        )
-
-
 ServingSpeech = OmniOpenAIServingSpeech
diff --git a/vllm_omni/model_executor/models/ming_tts/__init__.py b/vllm_omni/model_executor/models/ming_tts/__init__.py
index 5c945c5410e..35afa2ce51a 100644
--- a/vllm_omni/model_executor/models/ming_tts/__init__.py
+++ b/vllm_omni/model_executor/models/ming_tts/__init__.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from .configuration_ming_dense import MingDenseConfig
+from .config_ming_tts import MingDenseConfig
 from .ming_tts import MingTTSForConditionalGeneration
 from .ming_tts_audio_vae import MingAudioVAEModel
 from .ming_tts_llm import MingLLMModel
diff --git a/vllm_omni/model_executor/models/ming_tts/backbone.py b/vllm_omni/model_executor/models/ming_tts/backbone.py
deleted file mode 100644
index 76a3033fe66..00000000000
--- a/vllm_omni/model_executor/models/ming_tts/backbone.py
+++ /dev/null
@@ -1,62 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
-from typing import Any
-
-import torch
-import torch.nn as nn
-from vllm.config import VllmConfig
-from vllm.model_executor.models.qwen2 import Qwen2Model
-from vllm.model_executor.models.utils import maybe_prefix
-from vllm.sequence import IntermediateTensors
-
-
-class MingQwen2Backbone(nn.Module):
-    """Thin Ming wrapper around upstream vLLM Qwen2Model."""
-
-    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        super().__init__()
-        self.model = Qwen2Model(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model"))
-
-    def get_input_embeddings(self) -> nn.Module:
-        if hasattr(self.model, "embed_tokens"):
-            return self.model.embed_tokens
-        if hasattr(self.model, "model") and hasattr(self.model.model, "embed_tokens"):
-            return self.model.model.embed_tokens
-        raise AttributeError("Could not locate token embeddings on Ming Qwen2 backbone.")
-
-    def embed_input_ids(
-        self,
-        input_ids: torch.Tensor,
-        inputs_embeds: torch.Tensor | None = None,
-        **_: Any,
-    ) -> torch.Tensor:
-        if inputs_embeds is not None:
-            return inputs_embeds
-        if hasattr(self.model, "embed_input_ids"):
-            return self.model.embed_input_ids(input_ids)
-        return self.get_input_embeddings()(input_ids)
-
-    def forward(
-        self,
-        *,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        intermediate_tensors: IntermediateTensors | None = None,
-        inputs_embeds: torch.Tensor | None = None,
-    ) -> torch.Tensor | IntermediateTensors:
-        if inputs_embeds is None:
-            inputs_embeds = self.embed_input_ids(input_ids)
-        return self.model(
-            input_ids=input_ids,
-            positions=positions,
-            intermediate_tensors=intermediate_tensors,
-            inputs_embeds=inputs_embeds,
-        )
-
-    def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor | None:
-        return self.model.compute_logits(hidden_states)
-
-    def sample(self, logits: torch.Tensor, sampling_metadata: Any):
-        return self.model.sample(logits, sampling_metadata)
diff --git a/vllm_omni/model_executor/models/ming_tts/config_ming_tts.py b/vllm_omni/model_executor/models/ming_tts/config_ming_tts.py
index 0261b39b5c6..2c24b438890 100644
--- a/vllm_omni/model_executor/models/ming_tts/config_ming_tts.py
+++ b/vllm_omni/model_executor/models/ming_tts/config_ming_tts.py
@@ -53,6 +53,54 @@
 from .validation import _coerce_audio_vae_config, _nested_get, _to_plain_dict, validate_ming_tts_config
 
 
+def _coerce_qwen2_config(value: Any) -> Qwen2Config:
+    if isinstance(value, Qwen2Config):
+        return value
+    if isinstance(value, PretrainedConfig):
+        return Qwen2Config.from_dict(value.to_dict())
+    if isinstance(value, dict):
+        return Qwen2Config.from_dict(dict(value))
+    raise TypeError(f"Unsupported llm_config type for Ming dense config: {type(value)!r}")
+
+
+def _coerce_ming_dense_audio_vae_config(value: Any) -> AudioVAEconfig | None:
+    if value is None:
+        return None
+    if isinstance(value, AudioVAEconfig):
+        value = value.to_dict()
+    elif isinstance(value, PretrainedConfig):
+        value = value.to_dict()
+    elif isinstance(value, dict):
+        value = dict(value)
+    else:
+        raise TypeError(f"Unsupported audio_tokenizer_config type for Ming dense config: {type(value)!r}")
+
+    return AudioVAEconfig(**value)
+
+
+class MingDenseConfig(PretrainedConfig):
+    model_type = "dense"
+
+    def __init__(
+        self,
+        llm_config: Qwen2Config | dict[str, Any] | None = None,
+        ditar_config: dict[str, Any] | None = None,
+        aggregator_config: dict[str, Any] | None = None,
+        audio_tokenizer_config: AudioVAEconfig | dict[str, Any] | None = None,
+        architectures: list[str] | None = None,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__(architectures=architectures, **kwargs)
+        self.llm_config = _coerce_qwen2_config(llm_config or {})
+        self.ditar_config = dict(ditar_config or {})
+        self.aggregator_config = dict(aggregator_config or {})
+        self.audio_tokenizer_config = _coerce_ming_dense_audio_vae_config(audio_tokenizer_config)
+
+    def get_text_config(self, decoder: bool = False, **kwargs: Any) -> Qwen2Config:
+        del decoder, kwargs
+        return self.llm_config
+
+
 @dataclass
 class MingTTSConfig:
     """Flat config object shared by Stage-1 and Stage-2. Build via from_hf_config()."""
@@ -186,6 +234,7 @@ def approx_chunk_seconds(self) -> float:
     "LLM_HIDDEN_SIZE",
     "LLM_VOCAB_SIZE",
     "MAX_DECODE_STEPS",
+    "MingDenseConfig",
     "MingTTSConfig",
     "PAD_TOKEN_ID",
     "PATCH_SIZE",
diff --git a/vllm_omni/model_executor/models/ming_tts/configuration_ming_dense.py b/vllm_omni/model_executor/models/ming_tts/configuration_ming_dense.py
deleted file mode 100644
index d6f5c8182e9..00000000000
--- a/vllm_omni/model_executor/models/ming_tts/configuration_ming_dense.py
+++ /dev/null
@@ -1,57 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
-from typing import Any
-
-from transformers import PretrainedConfig, Qwen2Config
-
-from .audio_tokenizer.configuration_audio_vae import AudioVAEconfig
-
-
-def _coerce_qwen2_config(value: Any) -> Qwen2Config:
-    if isinstance(value, Qwen2Config):
-        return value
-    if isinstance(value, PretrainedConfig):
-        return Qwen2Config.from_dict(value.to_dict())
-    if isinstance(value, dict):
-        return Qwen2Config.from_dict(dict(value))
-    raise TypeError(f"Unsupported llm_config type for Ming dense config: {type(value)!r}")
-
-
-def _coerce_audio_vae_config(value: Any) -> AudioVAEconfig | None:
-    if value is None:
-        return None
-    if isinstance(value, AudioVAEconfig):
-        value = value.to_dict()
-    elif isinstance(value, PretrainedConfig):
-        value = value.to_dict()
-    elif isinstance(value, dict):
-        value = dict(value)
-    else:
-        raise TypeError(f"Unsupported audio_tokenizer_config type for Ming dense config: {type(value)!r}")
-
-    return AudioVAEconfig(**value)
-
-
-class MingDenseConfig(PretrainedConfig):
-    model_type = "dense"
-
-    def __init__(
-        self,
-        llm_config: Qwen2Config | dict[str, Any] | None = None,
-        ditar_config: dict[str, Any] | None = None,
-        aggregator_config: dict[str, Any] | None = None,
-        audio_tokenizer_config: AudioVAEconfig | dict[str, Any] | None = None,
-        architectures: list[str] | None = None,
-        **kwargs: Any,
-    ) -> None:
-        super().__init__(architectures=architectures, **kwargs)
-        self.llm_config = _coerce_qwen2_config(llm_config or {})
-        self.ditar_config = dict(ditar_config or {})
-        self.aggregator_config = dict(aggregator_config or {})
-        self.audio_tokenizer_config = _coerce_audio_vae_config(audio_tokenizer_config)
-
-    def get_text_config(self, decoder: bool = False, **kwargs: Any) -> Qwen2Config:
-        del decoder, kwargs
-        return self.llm_config
diff --git a/vllm_omni/model_executor/models/ming_tts/fm/cfm.py b/vllm_omni/model_executor/models/ming_tts/fm/cfm.py
index b1924973b47..601f5f72a6e 100644
--- a/vllm_omni/model_executor/models/ming_tts/fm/cfm.py
+++ b/vllm_omni/model_executor/models/ming_tts/fm/cfm.py
@@ -4,7 +4,6 @@
 
 
 import torch
-import torch.nn.functional as F
 from torch import nn
 
 
@@ -74,51 +73,6 @@ def __init__(
     def device(self):
         return next(self.parameters()).device
 
-    def forward(
-        self,
-        cond,
-        target,
-        latent_history,
-        mask,
-        patch_size,
-    ):
-        if patch_size <= 0:
-            raise ValueError(f"patch_size must be positive, got {patch_size}")
-        if cond.ndim != 3:
-            raise ValueError(f"Expected cond rank-3 [Batch, Time, Dimension], got {tuple(cond.shape)}")
-        if target.ndim != 3:
-            raise ValueError(f"Expected target rank-3 [Batch, Time, Dimension], got {tuple(target.shape)}")
-        if latent_history.ndim != 3:
-            raise ValueError(
-                f"Expected latent_history rank-3 [Batch, Time, Dimension], got {tuple(latent_history.shape)}"
-            )
-        if cond.shape[0] != target.shape[0] or cond.shape[0] != latent_history.shape[0]:
-            raise ValueError(
-                "Batch mismatch across cond, target, and latent_history: "
-                f"{cond.shape[0]}, {target.shape[0]}, {latent_history.shape[0]}"
-            )
-        token_mask = _coerce_token_mask(
-            mask, batch_size=target.shape[0], target_steps=target.shape[1], device=target.device
-        )
-
-        x1 = target
-        batch, dtype = x1.shape[0], x1.dtype
-        x0 = torch.randn_like(x1)
-        time = torch.rand((batch,), dtype=dtype, device=self.device)
-        # sample xt (φ_t(x) in the paper)
-        t = time.unsqueeze(-1).unsqueeze(-1)
-        x = (1 - t) * x0 + t * x1
-        flow = x1 - x0
-
-        pred = self.model(x=x, t=time, c=cond, latent_history=latent_history, mask=token_mask)
-        pred = pred[:, -patch_size:, :]
-
-        loss = F.mse_loss(pred, flow, reduction="none")
-        loss_mask = token_mask.unsqueeze(-1).expand_as(loss)
-        loss = loss[loss_mask]
-
-        return loss.mean()
-
     @torch.no_grad()
     def sample(
         self,
@@ -193,15 +147,3 @@ def fn(t, x):
         out = sampled
 
         return out, trajectory
-
-
-def _coerce_token_mask(mask, *, batch_size, target_steps, device):
-    if not isinstance(mask, torch.Tensor):
-        mask = torch.as_tensor(mask, device=device)
-    if mask.ndim == 3 and mask.shape[-1] == 1:
-        mask = mask.squeeze(-1)
-    if mask.ndim != 2:
-        raise ValueError(f"Expected mask rank-2 [Batch, Time] or rank-3 [Batch, Time, 1], got {tuple(mask.shape)}")
-    if mask.shape[0] != batch_size or mask.shape[1] != target_steps:
-        raise ValueError(f"Mask shape mismatch: got {tuple(mask.shape)}, expected {(batch_size, target_steps)}")
-    return mask.to(device=device, dtype=torch.bool)
diff --git a/vllm_omni/model_executor/models/ming_tts/fm/dit.py b/vllm_omni/model_executor/models/ming_tts/fm/dit.py
index 39cc5693507..a45a2db81bc 100644
--- a/vllm_omni/model_executor/models/ming_tts/fm/dit.py
+++ b/vllm_omni/model_executor/models/ming_tts/fm/dit.py
@@ -46,27 +46,12 @@ def forward(self, timestep):
 class CondEmbedder(nn.Module):
     def __init__(self, input_feature_size, hidden_size, dropout_prob):
         super().__init__()
-        self.dropout_prob = dropout_prob
+        del dropout_prob
         self.cond_embedder = nn.Linear(input_feature_size, hidden_size)
 
-    def cond_drop(self, llm_cond):
+    def forward(self, llm_cond):
         if llm_cond.ndim != 3:
             raise ValueError(f"Expected conditioning rank-3 [Batch, Time, Dimension], got {tuple(llm_cond.shape)}")
-        bsz = llm_cond.shape[0]
-        drop_latent_mask = torch.rand(bsz) < self.dropout_prob
-        drop_latent_mask = drop_latent_mask.unsqueeze(-1).unsqueeze(-1).to(llm_cond.dtype).to(llm_cond.device)
-        fake_latent = torch.zeros_like(llm_cond)
-        llm_cond = drop_latent_mask * fake_latent + (1 - drop_latent_mask) * llm_cond
-
-        return llm_cond
-
-    def forward(self, llm_cond, train):
-        if llm_cond.ndim != 3:
-            raise ValueError(f"Expected conditioning rank-3 [Batch, Time, Dimension], got {tuple(llm_cond.shape)}")
-        use_dropout = self.dropout_prob > 0
-        if train and use_dropout:
-            llm_cond = self.cond_drop(llm_cond)
-
         llm_cond = self.cond_embedder(llm_cond)
 
         return llm_cond
@@ -130,7 +115,7 @@ def forward(self, x, t, c, latent_history, mask=None):
         x_now = self.x_embedder(x)
         x_history = self.x_embedder(latent_history)
         x = torch.cat([x_history, x_now], dim=1)
-        c = self.c_embedder(c, self.training)
+        c = self.c_embedder(c)
         y = t + c
         x = torch.cat([y, x], dim=1)
         rope = self.rotary_embed.forward_from_seq_len(x.shape[1])
diff --git a/vllm_omni/model_executor/models/ming_tts/ingress.py b/vllm_omni/model_executor/models/ming_tts/ingress.py
index 77428164d5a..92ab2860232 100644
--- a/vllm_omni/model_executor/models/ming_tts/ingress.py
+++ b/vllm_omni/model_executor/models/ming_tts/ingress.py
@@ -17,7 +17,7 @@
     KEY_TEXT_MODE,
     MingTTSConfig,
 )
-from .prompt_builder import (
+from .prompt_utils import (
     build_dense_prompt_token_ids,
     coerce_speaker_embeddings,
     count_prompt_waveform_patches,
diff --git a/vllm_omni/model_executor/models/ming_tts/loader.py b/vllm_omni/model_executor/models/ming_tts/loader.py
index dc1f25fcb5a..2e80e93e9cd 100644
--- a/vllm_omni/model_executor/models/ming_tts/loader.py
+++ b/vllm_omni/model_executor/models/ming_tts/loader.py
@@ -2,27 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from __future__ import annotations
 
-import json
-from io import BytesIO
-from pathlib import Path
 from typing import Any
 
 import torch
-from safetensors import safe_open
-
-from vllm_omni.engine.stage_init_utils import _resolve_model_to_local_path
-
-from .audio_tokenizer.modeling_audio_vae import AudioVAE
-from .config_ming_tts import (
-    KEY_PROMPT_LATENTS,
-    VISION_START_TOKEN_ID,
-    MingTTSConfig,
-)
-from .prompt_builder import (
-    coerce_prompt_waveform,
-    count_prompt_latent_patches,
-    pad_prompt_waveform,
-)
 
 
 def load_weights(model_stage: str, model: Any, weights: list[tuple[str, torch.Tensor]]):
@@ -42,255 +24,3 @@ def load_weights(model_stage: str, model: Any, weights: list[tuple[str, torch.Te
         raise RuntimeError("Ming Stage-1 received no loadable checkpoint weights. Expected prefix: audio.*")
     loaded = model.load_weights(audio_weights)
     return {f"model.{name}" for name in loaded}
-
-
-def _resolve_prompt_latents(wrapper: Any, info_dict: dict[str, Any]) -> dict[str, torch.Tensor] | None:
-    raw_latents = info_dict.get(KEY_PROMPT_LATENTS, info_dict.get("prompt_latents"))
-    raw_waveform = info_dict.get("prompt_waveform", info_dict.get("prompt_waveforms"))
-    if raw_latents is not None and raw_waveform is not None:
-        raise ValueError(
-            "Ming waveform cloning request provided both raw prompt_waveform and explicit prompt_latents. "
-            "Choose exactly one source of truth."
-        )
-
-    direct_latents = _coerce_prompt_latents(
-        raw_latents,
-        patch_size=wrapper.ming_config.patch_size,
-        latent_dim=wrapper.ming_config.latent_dim,
-    )
-    if direct_latents is not None:
-        return direct_latents
-    if raw_waveform is None:
-        return None
-
-    encode_fn = getattr(wrapper, "_encode_prompt_waveform_to_latents", None)
-    if callable(encode_fn):
-        latents = encode_fn(raw_waveform, info_dict.get("prompt_waveform_length"))
-    else:
-        latents = _encode_prompt_waveform_to_latents(
-            wrapper,
-            raw_waveform,
-            info_dict.get("prompt_waveform_length"),
-        )
-    return _coerce_prompt_latents(
-        latents,
-        patch_size=wrapper.ming_config.patch_size,
-        latent_dim=wrapper.ming_config.latent_dim,
-    )
-
-
-def _load_prompt_encoder(wrapper: Any) -> AudioVAE:
-    if wrapper._prompt_encoder is not None:
-        return wrapper._prompt_encoder
-    if wrapper.ming_config.audio_tokenizer_config is None:
-        raise RuntimeError("Ming Stage-0 requires audio_tokenizer_config to encode prompt audio.")
-
-    encoder = AudioVAE(wrapper.ming_config.audio_tokenizer_config).eval()
-    state_dict = encoder.state_dict()
-    loaded = 0
-    loaded_encoder_params = set()
-    with torch.no_grad():
-        for shard_path in _iter_model_safetensors(
-            _resolve_model_to_local_path(str(wrapper.vllm_config.model_config.model))
-        ):
-            with safe_open(str(shard_path), framework="pt", device="cpu") as handle:
-                for key in handle.keys():
-                    if not key.startswith("audio.encoder."):
-                        continue
-                    name = key[len("audio.") :]
-                    if name not in state_dict:
-                        continue
-                    target = state_dict[name]
-                    target.copy_(handle.get_tensor(key).to(device=target.device, dtype=target.dtype))
-                    loaded += 1
-                    loaded_encoder_params.add(name)
-    if loaded == 0:
-        raise RuntimeError("Ming prompt encoder received no audio.encoder.* weights from checkpoint.")
-
-    expected_encoder_params = {f"encoder.{name}" for name, _ in encoder.encoder.named_parameters()}
-    missing = expected_encoder_params - loaded_encoder_params
-    if missing:
-        raise RuntimeError(f"Ming prompt encoder: {len(missing)} params not loaded. First few: {sorted(missing)[:5]}")
-
-    dev = next(wrapper.parameters()).device
-    try:
-        del encoder.decoder
-        encoder.decoder = None
-        if dev.type != "cpu":
-            encoder.encoder.to(dev, dtype=getattr(wrapper.model, "fm_dtype", torch.bfloat16))
-        else:
-            encoder.encoder.to(dev)
-    except Exception as exc:
-        raise RuntimeError(f"Failed to move Ming prompt encoder to {dev}: {exc}") from exc
-    wrapper._prompt_encoder = encoder
-    return encoder
-
-
-@torch.inference_mode()
-def _encode_prompt_waveform_to_latents(wrapper: Any, waveform: Any, waveform_length: Any = None) -> torch.Tensor:
-    encoder = _load_prompt_encoder(wrapper)
-    waveform = _normalize_prompt_waveform(waveform, target_sr=wrapper.ming_config.sample_rate)
-    waveform = pad_prompt_waveform(
-        waveform,
-        patch_size=wrapper.ming_config.patch_size,
-        sample_rate=wrapper.ming_config.sample_rate,
-        frame_hop=wrapper.ming_config.audio_frame_hop,
-    )
-    dev = next(encoder.encoder.parameters()).device
-    waveform = waveform.to(device=dev, dtype=next(encoder.encoder.parameters()).dtype)
-    if waveform_length is None:
-        waveform_length = torch.full((waveform.shape[0],), waveform.shape[-1], dtype=torch.int32, device=dev)
-    elif not isinstance(waveform_length, torch.Tensor):
-        waveform_length = torch.as_tensor(waveform_length, dtype=torch.int32, device=dev)
-    else:
-        waveform_length = waveform_length.to(device=dev, dtype=torch.int32)
-
-    latents, _ = encoder.encode_latent(waveform, waveform_length)
-    if latents.ndim == 3 and latents.shape[0] == 1:
-        latents = latents.squeeze(0)
-    count_prompt_latent_patches(
-        latents,
-        patch_size=wrapper.ming_config.patch_size,
-        latent_dim=wrapper.ming_config.latent_dim,
-    )
-    return latents.detach().to(dtype=torch.float32).contiguous()
-
-
-def _iter_model_safetensors(local_model_path: str) -> list[Path]:
-    model_root = Path(local_model_path)
-    index_path = model_root / "model.safetensors.index.json"
-    if index_path.exists():
-        with index_path.open("r", encoding="utf-8") as handle:
-            index_data = json.load(handle)
-        filenames = sorted(set(index_data.get("weight_map", {}).values()))
-        if not filenames:
-            raise RuntimeError(f"No checkpoint shards listed in {index_path}")
-        return [model_root / filename for filename in filenames]
-
-    single_file = model_root / "model.safetensors"
-    if single_file.exists():
-        return [single_file]
-
-    files = sorted(model_root.glob("*.safetensors"))
-    if not files:
-        raise RuntimeError(f"No .safetensors checkpoint found under {local_model_path}")
-    return files
-
-
-def _normalize_prompt_waveform(value: Any, *, target_sr: int) -> torch.Tensor:
-    if isinstance(value, bytes):
-        import torchaudio
-
-        waveform, sr = torchaudio.load(BytesIO(value))
-        waveform = waveform[:1].to(torch.float32)
-        if int(sr) != int(target_sr):
-            from torchaudio.functional import resample as resample_audio
-
-            waveform = resample_audio(waveform, int(sr), int(target_sr))
-        return waveform
-
-    if isinstance(value, tuple) and len(value) == 2 and isinstance(value[1], int):
-        waveform = coerce_prompt_waveform(value[0])
-        if int(value[1]) != int(target_sr):
-            from torchaudio.functional import resample as resample_audio
-
-            waveform = resample_audio(waveform, int(value[1]), int(target_sr))
-        return waveform
-
-    if isinstance(value, dict):
-        samples = value.get("samples", value.get("array", value.get("waveform")))
-        sr = value.get("sample_rate", value.get("sr", target_sr))
-        return _normalize_prompt_waveform((samples, int(sr)), target_sr=target_sr)
-
-    return coerce_prompt_waveform(value)
-
-
-def _coerce_prompt_latents(
-    value: Any,
-    *,
-    patch_size: int,
-    latent_dim: int,
-) -> dict[str, torch.Tensor] | None:
-    if value is None:
-        return None
-    if not isinstance(value, torch.Tensor):
-        value = torch.as_tensor(value)
-
-    latents = value.detach()
-    if latents.ndim == 3 and latents.shape[0] == 1:
-        latents = latents.squeeze(0)
-
-    if latents.ndim == 3 and latents.shape[-2:] == (patch_size, latent_dim):
-        patches = latents
-        frames = patches.reshape(-1, latent_dim)
-        return {"patches": patches, "frames": frames}
-
-    if latents.ndim != 2 or latents.shape[-1] != latent_dim:
-        raise ValueError(f"Unsupported prompt latent shape: {tuple(latents.shape)}")
-    if latents.shape[0] % patch_size != 0:
-        raise ValueError(
-            f"Prompt latent frame count must be divisible by patch_size={patch_size}, "
-            f"got frames={int(latents.shape[0])}"
-        )
-    patches = latents.reshape(-1, patch_size, latent_dim) if latents.shape[0] > 0 else None
-    return {"patches": patches, "frames": latents}
-
-
-def _initial_history(
-    frames: torch.Tensor | None,
-    *,
-    history_size: int,
-    latent_dim: int,
-    device: torch.device,
-    dtype: torch.dtype,
-) -> torch.Tensor:
-    history = torch.zeros((history_size, latent_dim), device=device, dtype=dtype)
-    if frames is None or frames.numel() == 0:
-        return history
-    frames = frames.to(device=device, dtype=dtype)
-    take = min(history_size, int(frames.shape[0]))
-    history[-take:] = frames[-take:]
-    return history
-
-
-def _take_index(value: Any, idx: int) -> torch.Tensor | None:
-    if not isinstance(value, torch.Tensor) or value.numel() == 0:
-        return None
-    return value[idx]
-
-
-def _take_scalar(value: Any, idx: int) -> float | None:
-    if not isinstance(value, torch.Tensor) or value.numel() == 0:
-        return None
-    return float(value.reshape(-1)[idx].item())
-
-
-def _find_audio_placeholder_positions(input_ids: torch.Tensor, cfg: MingTTSConfig) -> torch.Tensor:
-    dummy_pos = (input_ids == cfg.audio_dummy_token_id).nonzero(as_tuple=True)[0]
-    if dummy_pos.numel() == 0:
-        return dummy_pos
-
-    audio_start_pos = (input_ids == cfg.audio_start_token_id).nonzero(as_tuple=True)[0]
-    audio_end_pos = (input_ids == cfg.audio_end_token_id).nonzero(as_tuple=True)[0]
-    if audio_start_pos.numel() == 0:
-        return dummy_pos
-
-    start = int(audio_start_pos[0].item())
-    end = int(audio_end_pos[0].item()) if audio_end_pos.numel() > 0 else int(input_ids.shape[0])
-    keep = (dummy_pos > start) & (dummy_pos < end)
-    filtered = dummy_pos[keep]
-    return filtered if filtered.numel() > 0 else dummy_pos
-
-
-def _find_speaker_placeholder_positions(input_ids: torch.Tensor, hf_config: Any) -> list[int]:
-    vision_start_token_id = getattr(hf_config, "vision_start_token_id", VISION_START_TOKEN_ID)
-    vision_start_pos = (input_ids == int(vision_start_token_id)).nonzero(as_tuple=True)[0]
-    if vision_start_pos.numel() == 0:
-        return []
-
-    slots = []
-    for pos in vision_start_pos:
-        slot = int(pos.item()) + 1
-        if slot < int(input_ids.shape[0]):
-            slots.append(slot)
-    return slots
diff --git a/vllm_omni/model_executor/models/ming_tts/ming_tts.py b/vllm_omni/model_executor/models/ming_tts/ming_tts.py
index cf4550cb895..d86c50fe6c2 100644
--- a/vllm_omni/model_executor/models/ming_tts/ming_tts.py
+++ b/vllm_omni/model_executor/models/ming_tts/ming_tts.py
@@ -32,13 +32,16 @@
     MingTTSConfig,
 )
 from .loader import (
+    load_weights,
+)
+from .prompt_utils import (
     _coerce_prompt_latents,
     _find_audio_placeholder_positions,
     _find_speaker_placeholder_positions,
     _initial_history,
     _resolve_prompt_latents,
     _take_scalar,
-    load_weights,
+    coerce_speaker_embeddings,
 )
 
 MING_STOP_REASON_KEY = "ming_stop_reason"
@@ -167,8 +170,6 @@ def _prefill_preprocess(self, input_ids: torch.Tensor, input_embeds: torch.Tenso
         speaker_embedding = info_dict.get(KEY_SPEAKER_EMBEDDING, info_dict.get("speaker_embedding"))
         speaker_embeddings = None
         if speaker_embedding is not None:
-            from .prompt_builder import coerce_speaker_embeddings
-
             speaker_embeddings = coerce_speaker_embeddings(
                 speaker_embedding,
                 use_zero_spk_emb=bool(info_dict.get("use_zero_spk_emb", False)),
diff --git a/vllm_omni/model_executor/models/ming_tts/ming_tts_llm.py b/vllm_omni/model_executor/models/ming_tts/ming_tts_llm.py
index 7ffa524f140..007dac25b06 100644
--- a/vllm_omni/model_executor/models/ming_tts/ming_tts_llm.py
+++ b/vllm_omni/model_executor/models/ming_tts/ming_tts_llm.py
@@ -10,8 +10,8 @@
 import torch.nn as nn
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader, maybe_remap_kv_scale_name
-from vllm.model_executor.models.utils import init_vllm_registered_model, is_pp_missing_parameter
+from vllm.model_executor.models.qwen2 import Qwen2Model
+from vllm.model_executor.models.utils import AutoWeightsLoader, WeightsMapper, maybe_prefix
 from vllm.sequence import IntermediateTensors
 from vllm.v1.outputs import SamplerOutput
 from vllm.v1.sample.metadata import SamplingMetadata
@@ -19,7 +19,6 @@
 from vllm_omni.model_executor.models.output_templates import OmniOutput
 
 from .aggregator import Aggregator
-from .backbone import MingQwen2Backbone
 from .config_ming_tts import (
     KEY_CFG,
     KEY_DECODE_STEP,
@@ -53,23 +52,23 @@
 )
 
 logger = init_logger(__name__)
-_ORIGINAL_INIT_VLLM_REGISTERED_MODEL = init_vllm_registered_model
 
 
 class MingLLMModel(nn.Module):
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            "model.model.": "model.",
+        }
+    )
+
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         self.ming_config = MingTTSConfig.from_hf_config(vllm_config.model_config.hf_config)
         self.ming_config.validate()
         self.vllm_config = vllm_config
         self.prefix = prefix
-        self.quant_config = vllm_config.quant_config
         self.fm_dtype = _resolve_ming_runtime_dtype(vllm_config)
-        self.model = (
-            init_vllm_registered_model(vllm_config=vllm_config, architectures=["Qwen2ForCausalLM"])
-            if init_vllm_registered_model is not _ORIGINAL_INIT_VLLM_REGISTERED_MODEL
-            else MingQwen2Backbone(vllm_config=vllm_config, prefix=prefix)
-        )
+        self.model = Qwen2Model(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model"))
         self.linear_proj_audio = Aggregator(
             in_channels=self.ming_config.latent_dim,
             llm_input_dim=self.ming_config.llm_hidden_size,
@@ -465,67 +464,11 @@ def _get_decode_steps(
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         params_dict = dict(self.named_parameters(remove_duplicate=False))
-        loaded_params: set[str] = set()
-        skipped: list[str] = []
-        mapping = [
-            ("qkv_proj", "q_proj", "q"),
-            ("qkv_proj", "k_proj", "k"),
-            ("qkv_proj", "v_proj", "v"),
-            ("gate_up_proj", "gate_proj", 0),
-            ("gate_up_proj", "up_proj", 1),
-        ]
-        for ckpt_name, loaded_weight in weights:
-            name = ckpt_name
-            if self.quant_config is not None and (scale_name := self.quant_config.get_cache_scale(name)):
-                if scale_name not in params_dict:
-                    skipped.append(ckpt_name)
-                    continue
-                param = params_dict[scale_name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0])
-                loaded_params.add(scale_name)
-                continue
-            mapped_name = None
-            for param_name, weight_name, shard_id in mapping:
-                if weight_name not in name:
-                    continue
-                mapped_name = name.replace(weight_name, param_name)
-                if mapped_name.endswith(".bias") and mapped_name not in params_dict:
-                    mapped_name = None
-                    break
-                if is_pp_missing_parameter(mapped_name, self) or mapped_name not in params_dict:
-                    mapped_name = None
-                    continue
-                param = params_dict[mapped_name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight) if weight_loader == default_weight_loader else weight_loader(
-                    param, loaded_weight, shard_id
-                )
-                loaded_params.add(mapped_name)
-                break
-            if mapped_name in loaded_params or name.endswith(".bias") and name not in params_dict:
-                continue
-            name = maybe_remap_kv_scale_name(name, params_dict)
-            if name is None:
-                continue
-            if name.startswith("model.") and name not in params_dict and f"model.{name}" in params_dict:
-                name = f"model.{name}"
-            if is_pp_missing_parameter(name, self):
-                continue
-            if name not in params_dict:
-                skipped.append(ckpt_name)
-                continue
-            getattr(params_dict[name], "weight_loader", default_weight_loader)(params_dict[name], loaded_weight)
-            loaded_params.add(name)
+        loaded_params = AutoWeightsLoader(self).load_weights(weights, mapper=self.hf_to_vllm_mapper)
         _warn_missing_prefix("flowloss", params_dict, loaded_params, prefix="flowloss.", fatal=True)
         _warn_missing_prefix("linear_proj_audio", params_dict, loaded_params, prefix="linear_proj_audio.", fatal=True)
         _warn_missing_prefix("stop_head", params_dict, loaded_params, prefix="stop_head.", fatal=True)
         _warn_missing_prefix("spk_head", params_dict, loaded_params, prefix="spk_head.", fatal=True)
-        if skipped:
-            warnings.warn(
-                f"MingLLMModel: skipped {len(skipped)} checkpoint keys during load. First few: {skipped[:8]}",
-                stacklevel=2,
-            )
         return loaded_params
 
 
diff --git a/vllm_omni/model_executor/models/ming_tts/prompt_builder/__init__.py b/vllm_omni/model_executor/models/ming_tts/prompt_builder/__init__.py
deleted file mode 100644
index dcabecc838b..00000000000
--- a/vllm_omni/model_executor/models/ming_tts/prompt_builder/__init__.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
-from ._base import (
-    coerce_prompt_waveform,
-    coerce_speaker_embeddings,
-    count_prompt_latent_patches,
-    count_prompt_waveform_patches,
-    create_instruction,
-    estimate_decode_step_window_for_duration,
-    estimate_decode_steps_for_duration,
-    pad_prompt_waveform,
-    parse_duration_seconds,
-)
-from .builders import (
-    build_dense_prompt_token_ids,
-    build_ming_dense_prompt,
-    build_runtime_controls,
-    resolve_effective_runtime_controls,
-)
-
-__all__ = [
-    "build_dense_prompt_token_ids",
-    "build_ming_dense_prompt",
-    "build_runtime_controls",
-    "coerce_prompt_waveform",
-    "coerce_speaker_embeddings",
-    "count_prompt_latent_patches",
-    "count_prompt_waveform_patches",
-    "create_instruction",
-    "estimate_decode_step_window_for_duration",
-    "estimate_decode_steps_for_duration",
-    "pad_prompt_waveform",
-    "parse_duration_seconds",
-    "resolve_effective_runtime_controls",
-]
diff --git a/vllm_omni/model_executor/models/ming_tts/prompt_builder/_base.py b/vllm_omni/model_executor/models/ming_tts/prompt_builder/_base.py
deleted file mode 100644
index 8627cbb4c33..00000000000
--- a/vllm_omni/model_executor/models/ming_tts/prompt_builder/_base.py
+++ /dev/null
@@ -1,210 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
-import copy
-import json
-import math
-import re
-from typing import Any
-
-import torch
-
-from ..config_ming_tts import AUDIO_FRAME_HOP, LATENT_DIM, PATCH_SIZE, SAMPLE_RATE, VAE_PATCH_SIZE
-
-BASE_CAPTION_TEMPLATE = {
-    "audio_sequence": [
-        {
-            "序号": 1,
-            "说话人": "speaker_1",
-            "方言": None,
-            "风格": None,
-            "语速": None,
-            "基频": None,
-            "音量": None,
-            "情感": None,
-            "BGM": {
-                "Genre": None,
-                "Mood": None,
-                "Instrument": None,
-                "Theme": None,
-                "ENV": None,
-                "SNR": None,
-            },
-            "IP": None,
-        }
-    ]
-}
-
-_DURATION_SECONDS_RE = re.compile(r"Duration:\s*([0-9]+(?:\.[0-9]+)?)\s*s\b", re.IGNORECASE)
-
-
-def create_instruction(value: Any) -> str | None:
-    if value is None:
-        return None
-    if isinstance(value, str):
-        return value
-    if not isinstance(value, dict):
-        raise ValueError(f"Ming instruction must be str or dict, got {type(value).__name__}")
-
-    caption = copy.deepcopy(BASE_CAPTION_TEMPLATE)
-    target = caption["audio_sequence"][0]
-    for key, item in value.items():
-        if key in target:
-            target[key] = item
-
-    if target["BGM"].get("SNR") is not None:
-        order = ["序号", "说话人", "BGM", "情感", "方言", "风格", "语速", "基频", "音量", "IP"]
-        caption["audio_sequence"][0] = {key: target[key] for key in order if key in target}
-    return json.dumps(caption, ensure_ascii=False)
-
-
-def parse_duration_seconds(text: str | None) -> float | None:
-    if not isinstance(text, str):
-        return None
-    match = _DURATION_SECONDS_RE.search(text)
-    if match is None:
-        return None
-    try:
-        value = float(match.group(1))
-    except ValueError:
-        return None
-    if value <= 0.0:
-        return None
-    return value
-
-
-def estimate_decode_steps_for_duration(
-    duration_seconds: float,
-    *,
-    sample_rate: int = SAMPLE_RATE,
-    frame_hop: int = AUDIO_FRAME_HOP,
-    patch_size: int = PATCH_SIZE,
-    vae_patch_size: int = VAE_PATCH_SIZE,
-) -> int:
-    if duration_seconds <= 0.0:
-        return 0
-    samples_per_decode_step = int(frame_hop) * int(patch_size) * int(vae_patch_size)
-    required_samples = float(duration_seconds) * float(sample_rate)
-    return max(1, int(math.ceil(required_samples / float(samples_per_decode_step))))
-
-
-def estimate_decode_step_window_for_duration(duration_seconds: float) -> tuple[int, int]:
-    target_steps = estimate_decode_steps_for_duration(duration_seconds)
-    min_steps = max(1, target_steps - 3)
-    max_steps = max(min_steps, target_steps + 3)
-    return min_steps, max_steps
-
-
-def pad_prompt_waveform(
-    waveform: Any,
-    *,
-    patch_size: int = PATCH_SIZE,
-    sample_rate: int = SAMPLE_RATE,
-    frame_hop: int = AUDIO_FRAME_HOP,
-) -> torch.Tensor:
-    tensor = coerce_prompt_waveform(waveform)
-    del frame_hop
-    pad_align = int((float(sample_rate) / 12.5) * int(patch_size))
-    new_len = ((int(tensor.shape[-1]) + pad_align - 1) // pad_align) * pad_align
-    if new_len == int(tensor.shape[-1]):
-        return tensor
-    padded = torch.zeros((1, new_len), dtype=tensor.dtype, device=tensor.device)
-    padded[:, : tensor.shape[-1]] = tensor
-    return padded
-
-
-def coerce_prompt_waveform(value: Any) -> torch.Tensor:
-    if value is None:
-        raise ValueError("prompt waveform cannot be None")
-    if isinstance(value, torch.Tensor):
-        tensor = value.detach()
-        if tensor.ndim == 1:
-            return tensor.unsqueeze(0).to(torch.float32)
-        if tensor.ndim == 2:
-            if tensor.shape[0] != 1:
-                return tensor.reshape(1, -1).to(torch.float32)
-            return tensor.to(torch.float32)
-        raise ValueError(f"Unsupported Ming prompt waveform rank: {tuple(tensor.shape)}")
-    if isinstance(value, (list, tuple)):
-        parts = [coerce_prompt_waveform(item) for item in value if item is not None]
-        if not parts:
-            raise ValueError("prompt waveform list was empty")
-        return torch.cat(parts, dim=-1)
-    return coerce_prompt_waveform(torch.as_tensor(value))
-
-
-def coerce_speaker_embeddings(value: Any, *, use_zero_spk_emb: bool = False) -> list[torch.Tensor] | None:
-    if value is None:
-        return [torch.zeros((192,), dtype=torch.float32)] if use_zero_spk_emb else None
-    if isinstance(value, torch.Tensor):
-        tensor = value.detach()
-        if tensor.ndim == 1:
-            tensor = tensor.unsqueeze(0)
-        if tensor.ndim != 2:
-            raise ValueError(f"Unsupported Ming speaker embedding shape: {tuple(tensor.shape)}")
-        items = [row.reshape(-1).to(torch.float32).cpu() for row in tensor]
-    elif isinstance(value, (list, tuple)):
-        if value and all(not isinstance(item, (list, tuple, torch.Tensor)) for item in value):
-            items = [torch.as_tensor(value).detach().reshape(-1).to(torch.float32).cpu()]
-        else:
-            items = []
-            for item in value:
-                if item is None:
-                    continue
-                if not isinstance(item, torch.Tensor):
-                    item = torch.as_tensor(item)
-                items.append(item.detach().reshape(-1).to(torch.float32).cpu())
-    else:
-        return coerce_speaker_embeddings(torch.as_tensor(value), use_zero_spk_emb=use_zero_spk_emb)
-    if not items:
-        return [torch.zeros((192,), dtype=torch.float32)] if use_zero_spk_emb else None
-    for item in items:
-        if int(item.numel()) != 192:
-            raise ValueError(f"Ming speaker embedding must have 192 dims, got {int(item.numel())}")
-    return items
-
-
-def count_prompt_latent_patches(
-    value: Any,
-    *,
-    patch_size: int = PATCH_SIZE,
-    latent_dim: int = LATENT_DIM,
-) -> int:
-    if value is None:
-        return 0
-    if not isinstance(value, torch.Tensor):
-        value = torch.as_tensor(value)
-    latents = value.detach()
-    if latents.ndim == 3 and latents.shape[0] == 1:
-        latents = latents.squeeze(0)
-    if latents.ndim == 3 and latents.shape[-2:] == (patch_size, latent_dim):
-        return int(latents.shape[0])
-    if latents.ndim != 2 or latents.shape[-1] != latent_dim:
-        raise ValueError(f"Unsupported Ming prompt_latents shape: {tuple(latents.shape)}")
-    if latents.shape[0] % patch_size != 0:
-        raise ValueError(
-            f"Ming prompt_latents frame count must be divisible by patch_size={patch_size}, "
-            f"got frames={int(latents.shape[0])}"
-        )
-    return int(latents.shape[0] // patch_size)
-
-
-def count_prompt_waveform_patches(
-    value: Any,
-    *,
-    patch_size: int = PATCH_SIZE,
-    frame_hop: int = AUDIO_FRAME_HOP,
-    vae_patch_size: int = VAE_PATCH_SIZE,
-) -> int:
-    if value is None:
-        return 0
-    waveform = pad_prompt_waveform(value, patch_size=patch_size, frame_hop=frame_hop)
-    frame_count = int(math.ceil(float(waveform.shape[-1]) / float(frame_hop)))
-    latent_frames = int(math.ceil(float(frame_count) / float(vae_patch_size)))
-    if latent_frames % int(patch_size) != 0:
-        raise ValueError(
-            f"Ming prompt waveform produced latent frame count not divisible by patch_size={patch_size}: "
-            f"frames={latent_frames}"
-        )
-    return int(latent_frames // int(patch_size))
diff --git a/vllm_omni/model_executor/models/ming_tts/prompt_builder/builders.py b/vllm_omni/model_executor/models/ming_tts/prompt_builder/builders.py
deleted file mode 100644
index 6e4f9d4220f..00000000000
--- a/vllm_omni/model_executor/models/ming_tts/prompt_builder/builders.py
+++ /dev/null
@@ -1,210 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
-from typing import Any
-
-import torch
-
-from ..config_ming_tts import (
-    KEY_CFG,
-    KEY_MAX_DECODE_STEPS,
-    KEY_MIN_DECODE_STEPS,
-    KEY_PROMPT_LATENTS,
-    KEY_REQUEST_ID,
-    KEY_SIGMA,
-    KEY_SPEAKER_EMBEDDING,
-    KEY_TEMPERATURE,
-    LATENT_DIM,
-    PATCH_SIZE,
-)
-from ._base import (
-    coerce_speaker_embeddings,
-    count_prompt_latent_patches,
-    count_prompt_waveform_patches,
-    create_instruction,
-    estimate_decode_step_window_for_duration,
-    pad_prompt_waveform,
-    parse_duration_seconds,
-)
-
-
-def resolve_effective_runtime_controls(
-    *,
-    text: str,
-    runtime_controls: dict[str, Any] | None = None,
-) -> dict[str, Any]:
-    controls = {} if runtime_controls is None else dict(runtime_controls)
-    has_explicit_min = KEY_MIN_DECODE_STEPS in controls and controls[KEY_MIN_DECODE_STEPS] is not None
-    has_explicit_max = KEY_MAX_DECODE_STEPS in controls and controls[KEY_MAX_DECODE_STEPS] is not None
-    if has_explicit_min or has_explicit_max:
-        return controls
-    duration_seconds = parse_duration_seconds(text)
-    if duration_seconds is None:
-        return controls
-    min_decode_steps, max_decode_steps = estimate_decode_step_window_for_duration(duration_seconds)
-    controls[KEY_MIN_DECODE_STEPS] = min_decode_steps
-    controls[KEY_MAX_DECODE_STEPS] = max_decode_steps
-    return controls
-
-
-def build_dense_prompt_token_ids(
-    tokenizer: Any,
-    *,
-    prompt: str,
-    text: str,
-    instruction: str | None = None,
-    prompt_text: str | None = None,
-    speaker_count: int = 0,
-    prompt_patch_count: int = 0,
-) -> list[int]:
-    speaker_prompt = []
-    for idx in range(int(speaker_count)):
-        speaker_prompt.extend(
-            tokenizer.encode(f"  speaker_{idx + 1}:")
-            + tokenizer.encode("<|vision_start|>")
-            + tokenizer.encode("<|vision_pad|>")
-            + tokenizer.encode("<|vision_end|>\n")
-        )
-    instruction_prompt = (
-        tokenizer.encode(instruction) + tokenizer.encode("<|endoftext|>") if instruction is not None else []
-    )
-    prompt_text_tokens = (
-        tokenizer.encode(prompt_text) if int(prompt_patch_count) > 0 and prompt_text is not None else []
-    )
-    prompt_latent_tokens = [tokenizer.convert_tokens_to_ids("<audioPatch>")] * int(prompt_patch_count)
-    text_input_prefix = (
-        []
-        if all(token in text for token in ("Genre: ", "Mood: ", "Instrument: ", "Theme: ", "Duration: "))
-        else tokenizer.encode(" Text input:\n")
-    )
-    return (
-        tokenizer.encode("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n")
-        + tokenizer.encode("<|im_start|>user\n")
-        + tokenizer.encode(prompt)
-        + speaker_prompt
-        + text_input_prefix
-        + prompt_text_tokens
-        + tokenizer.encode(text)
-        + tokenizer.encode("<|im_end|>\n")
-        + tokenizer.encode("<|im_start|>assistant\n")
-        + instruction_prompt
-        + tokenizer.encode("<audio>")
-        + prompt_latent_tokens
-    )
-
-
-def build_ming_dense_prompt(
-    tokenizer: Any,
-    *,
-    prompt: str,
-    text: str,
-    runtime_controls: dict[str, Any] | None = None,
-    instruction: Any = None,
-    prompt_text: str | None = None,
-    prompt_waveform: Any = None,
-    prompt_latents: Any = None,
-    speaker_embedding: Any = None,
-    use_zero_spk_emb: bool = False,
-    request_id: str | None = None,
-) -> dict[str, Any]:
-    instruction_text = create_instruction(instruction)
-    speaker_embeddings = coerce_speaker_embeddings(speaker_embedding, use_zero_spk_emb=use_zero_spk_emb)
-    effective_runtime_controls = resolve_effective_runtime_controls(text=text, runtime_controls=runtime_controls)
-
-    prompt_waveform_tensor = None
-    prompt_patch_count = 0
-    if prompt_waveform is not None:
-        prompt_waveform_tensor = pad_prompt_waveform(prompt_waveform)
-        prompt_patch_count = count_prompt_waveform_patches(prompt_waveform_tensor)
-    if prompt_waveform_tensor is not None and prompt_latents is not None:
-        raise ValueError(
-            "Ming waveform cloning request provided both raw prompt_waveform and explicit prompt_latents. "
-            "Choose exactly one source of truth."
-        )
-
-    prompt_latent_value = None
-    if prompt_waveform_tensor is not None and prompt_text is None:
-        raise ValueError(
-            "Ming prompt_waveform requires prompt_text for prompt-latent conditioning. "
-            "Use speaker_embedding for reference-audio-only speaker conditioning."
-        )
-    if prompt_latents is not None:
-        prompt_latent_value = torch.as_tensor(prompt_latents)
-        prompt_patch_count = count_prompt_latent_patches(
-            prompt_latent_value, patch_size=PATCH_SIZE, latent_dim=LATENT_DIM
-        )
-
-    prompt_token_ids = build_dense_prompt_token_ids(
-        tokenizer,
-        prompt=prompt,
-        text=text,
-        instruction=instruction_text,
-        prompt_text=prompt_text if prompt_patch_count > 0 else None,
-        speaker_count=0 if speaker_embeddings is None else len(speaker_embeddings),
-        prompt_patch_count=prompt_patch_count,
-    )
-
-    additional_information = {}
-    for key, value in effective_runtime_controls.items():
-        if isinstance(value, torch.Tensor):
-            additional_information[key] = value
-        elif key in (KEY_MIN_DECODE_STEPS, KEY_MAX_DECODE_STEPS):
-            additional_information[key] = torch.tensor(int(value), dtype=torch.int32)
-        else:
-            additional_information[key] = torch.tensor(float(value), dtype=torch.float32)
-    if request_id is not None:
-        additional_information[KEY_REQUEST_ID] = request_id
-    if instruction_text is not None:
-        additional_information["instruction"] = instruction_text
-    if prompt_text is not None:
-        additional_information["prompt_text"] = prompt_text
-    if prompt_waveform_tensor is not None:
-        additional_information["prompt_waveform"] = prompt_waveform_tensor
-        additional_information["prompt_waveform_length"] = torch.tensor(
-            [int(prompt_waveform_tensor.shape[-1])], dtype=torch.int32
-        )
-    if prompt_latent_value is not None:
-        additional_information[KEY_PROMPT_LATENTS] = prompt_latent_value
-    if speaker_embeddings is not None:
-        additional_information[KEY_SPEAKER_EMBEDDING] = (
-            speaker_embeddings[0] if len(speaker_embeddings) == 1 else torch.stack(speaker_embeddings, dim=0)
-        )
-    if use_zero_spk_emb:
-        additional_information["use_zero_spk_emb"] = True
-    return {
-        "prompt": prompt,
-        "text": text,
-        "prompt_token_ids": prompt_token_ids,
-        "additional_information": additional_information,
-    }
-
-
-def build_runtime_controls(
-    *,
-    cfg: float | None = None,
-    sigma: float | None = None,
-    temperature: float | None = None,
-    min_decode_steps: int | None = None,
-    max_decode_steps: int | None = None,
-) -> dict[str, torch.Tensor]:
-    controls = {}
-    if cfg is not None:
-        controls[KEY_CFG] = torch.tensor(float(cfg), dtype=torch.float32)
-    if sigma is not None:
-        controls[KEY_SIGMA] = torch.tensor(float(sigma), dtype=torch.float32)
-    if temperature is not None:
-        controls[KEY_TEMPERATURE] = torch.tensor(float(temperature), dtype=torch.float32)
-    if min_decode_steps is not None:
-        controls[KEY_MIN_DECODE_STEPS] = torch.tensor(int(min_decode_steps), dtype=torch.int32)
-    if max_decode_steps is not None:
-        controls[KEY_MAX_DECODE_STEPS] = torch.tensor(int(max_decode_steps), dtype=torch.int32)
-    return controls
-
-
-__all__ = [
-    "build_dense_prompt_token_ids",
-    "build_ming_dense_prompt",
-    "build_runtime_controls",
-    "resolve_effective_runtime_controls",
-]
diff --git a/vllm_omni/model_executor/stage_configs/ming_tts.yaml b/vllm_omni/model_executor/stage_configs/ming_tts.yaml
deleted file mode 100644
index 01063afcf86..00000000000
--- a/vllm_omni/model_executor/stage_configs/ming_tts.yaml
+++ /dev/null
@@ -1,65 +0,0 @@
-async_chunk: false
-stage_args:
-  - stage_id: 0
-    stage_type: llm
-    runtime:
-      process: true
-      devices: "0"
-    engine_args:
-      dtype: bfloat16
-      max_num_seqs: 1
-      model_stage: llm
-      model_arch: MingTTSForConditionalGeneration
-      hf_config_name: llm_config
-      worker_type: ar
-      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      gpu_memory_utilization: 0.45
-      enforce_eager: true
-      trust_remote_code: false
-      async_scheduling: false
-      enable_prefix_caching: false
-      engine_output_type: latent
-      max_model_len: 8192
-      max_num_batched_tokens: 8192
-    is_comprehension: true
-    final_output: true
-    final_output_type: text
-    default_sampling_params:
-      temperature: 0.0
-      top_p: 1.0
-      top_k: -1
-      max_tokens: 512
-      detokenize: true
-
-  - stage_id: 1
-    stage_type: llm
-    runtime:
-      process: true
-      devices: "0"
-    engine_args:
-      dtype: bfloat16
-      max_num_seqs: 1
-      model_stage: audio_vae
-      model_arch: MingTTSForConditionalGeneration
-      hf_config_name: llm_config
-      worker_type: generation
-      scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
-      gpu_memory_utilization: 0.25
-      enforce_eager: true
-      trust_remote_code: false
-      async_scheduling: false
-      enable_prefix_caching: false
-      engine_output_type: audio
-      max_model_len: 8192
-      max_num_batched_tokens: 8192
-    engine_input_source: [0]
-    custom_process_input_func: vllm_omni.model_executor.stage_input_processors.ming_tts.llm2audio_vae
-    is_comprehension: false
-    final_output: true
-    final_output_type: audio
-    default_sampling_params:
-      temperature: 0.0
-      top_p: 1.0
-      top_k: -1
-      max_tokens: 1
-      detokenize: false
diff --git a/vllm_omni/model_executor/stage_configs/ming_tts_async_chunk.yaml b/vllm_omni/model_executor/stage_configs/ming_tts_async_chunk.yaml
deleted file mode 100644
index b7ffc8212ee..00000000000
--- a/vllm_omni/model_executor/stage_configs/ming_tts_async_chunk.yaml
+++ /dev/null
@@ -1,86 +0,0 @@
-async_chunk: true
-stage_args:
-  - stage_id: 0
-    stage_type: llm
-    runtime:
-      process: true
-      devices: "0"
-    engine_args:
-      dtype: bfloat16
-      max_num_seqs: 1
-      model_stage: llm
-      model_arch: MingTTSForConditionalGeneration
-      hf_config_name: llm_config
-      worker_type: ar
-      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      gpu_memory_utilization: 0.45
-      enforce_eager: true
-      trust_remote_code: false
-      enable_prefix_caching: false
-      engine_output_type: latent
-      max_model_len: 8192
-      max_num_batched_tokens: 8192
-      custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.ming_tts.llm2audio_vae_async_chunk
-    is_comprehension: true
-    final_output: true
-    final_output_type: text
-    default_sampling_params:
-      temperature: 0.0
-      top_p: 1.0
-      top_k: -1
-      max_tokens: 512
-      detokenize: true
-    output_connectors:
-      to_stage_1: connector_of_shared_memory
-
-  - stage_id: 1
-    stage_type: llm
-    runtime:
-      process: true
-      devices: "0"
-    engine_args:
-      dtype: bfloat16
-      max_num_seqs: 1
-      model_stage: audio_vae
-      model_arch: MingTTSForConditionalGeneration
-      hf_config_name: llm_config
-      worker_type: generation
-      scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
-      gpu_memory_utilization: 0.25
-      enforce_eager: true
-      trust_remote_code: false
-      async_scheduling: false
-      enable_prefix_caching: false
-      engine_output_type: audio
-      max_model_len: 8192
-      max_num_batched_tokens: 8192
-    engine_input_source: [0]
-    is_comprehension: false
-    final_output: true
-    final_output_type: audio
-    input_connectors:
-      from_stage_0: connector_of_shared_memory
-    default_sampling_params:
-      temperature: 0.0
-      top_p: 1.0
-      top_k: -1
-      max_tokens: 1
-      detokenize: false
-
-runtime:
-  enabled: true
-  defaults:
-    window_size: -1
-    max_inflight: 1
-
-  connectors:
-    connector_of_shared_memory:
-      name: SharedMemoryConnector
-      extra:
-        latent_chunk_size: 25
-        latent_left_context: 0
-
-  edges:
-    - from: 0
-      to: 1
-      window_size: -1

From 42bacb4f490b3df92d130b447ab8941f0441c6d6 Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Wed, 13 May 2026 19:34:19 +0530
Subject: [PATCH 20/54] test(ming-tts): update e2e coverage for prompt helper
 refactor

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 tests/e2e/offline_inference/test_ming_tts.py | 4 ++--
 tests/e2e/online_serving/test_ming_tts.py    | 6 +++++-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/tests/e2e/offline_inference/test_ming_tts.py b/tests/e2e/offline_inference/test_ming_tts.py
index b6562b19c3e..87a76a8648d 100644
--- a/tests/e2e/offline_inference/test_ming_tts.py
+++ b/tests/e2e/offline_inference/test_ming_tts.py
@@ -21,7 +21,7 @@
     SAMPLE_RATE,
     TEXT_EOS_TOKEN_ID,
 )
-from vllm_omni.model_executor.models.ming_tts.prompt_utils import build_ming_dense_prompt
+from vllm_omni.model_executor.models.ming_tts.prompt_utils import DEFAULT_PROMPT, build_ming_dense_prompt
 
 MODEL = "inclusionAI/Ming-omni-tts-0.5B"
 DEPLOY_CONFIG = get_deploy_config_path("ming_tts.yaml")
@@ -67,7 +67,7 @@ def _build_prompt(
 ) -> dict:
     return build_ming_dense_prompt(
         tokenizer,
-        prompt="Please generate speech based on the following description.\n",
+        prompt=DEFAULT_PROMPT,
         text=text,
         instruction=instruction,
         runtime_controls={KEY_MAX_DECODE_STEPS: 200},
diff --git a/tests/e2e/online_serving/test_ming_tts.py b/tests/e2e/online_serving/test_ming_tts.py
index 4204d4c5462..3f1bddbd51e 100644
--- a/tests/e2e/online_serving/test_ming_tts.py
+++ b/tests/e2e/online_serving/test_ming_tts.py
@@ -86,4 +86,8 @@ def test_ming_tts_audio_speech_streaming(omni_server, openai_client) -> None:
         "stream": True,
         "response_format": "wav",
     }
-    openai_client.send_audio_speech_request(request_config)
+    responses = openai_client.send_audio_speech_request(request_config)
+    assert len(responses) == 1
+    assert responses[0].audio_bytes is not None, "Expected streamed WAV bytes from /v1/audio/speech"
+    sample_rate = _wav_sample_rate(responses[0].audio_bytes)
+    assert sample_rate == SAMPLE_RATE, f"Expected Ming output sample rate {SAMPLE_RATE}, got {sample_rate}"

From ee047ce167a513e38e3261580bdb192028cb38e3 Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Wed, 13 May 2026 20:14:57 +0530
Subject: [PATCH 21/54] style: reorder imports in ming_tts/qwen3 and fix noqa
 in test_serving_audio_generate

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 .../model_executor/stage_input_processors/qwen3_tts.py    | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/vllm_omni/model_executor/stage_input_processors/qwen3_tts.py b/vllm_omni/model_executor/stage_input_processors/qwen3_tts.py
index 299c0678b43..a8472b084c5 100644
--- a/vllm_omni/model_executor/stage_input_processors/qwen3_tts.py
+++ b/vllm_omni/model_executor/stage_input_processors/qwen3_tts.py
@@ -5,10 +5,6 @@
 import torch
 from vllm.logger import init_logger
 
-from vllm_omni.model_executor.stage_input_processors._chunk_transfer import (
-    get_request_payload_store,
-    get_transfer_extra_config,
-)
 from vllm_omni.data_entry_keys import (
     CodesStruct,
     MetaStruct,
@@ -16,6 +12,10 @@
     OmniPayloadStruct,
     to_dict,
 )
+from vllm_omni.model_executor.stage_input_processors._chunk_transfer import (
+    get_request_payload_store,
+    get_transfer_extra_config,
+)
 from vllm_omni.model_executor.stage_input_processors.chunk_size_utils import (
     compute_dynamic_initial_chunk_size,
     max_ic_for_chunk_size,

From 55f90253da47413efd33e823fbef0c353a97490f Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Wed, 13 May 2026 20:24:58 +0530
Subject: [PATCH 22/54] Add Ming dense prompt utilities

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 .../models/ming_tts/prompt_utils.py           | 648 ++++++++++++++++++
 1 file changed, 648 insertions(+)
 create mode 100644 vllm_omni/model_executor/models/ming_tts/prompt_utils.py

diff --git a/vllm_omni/model_executor/models/ming_tts/prompt_utils.py b/vllm_omni/model_executor/models/ming_tts/prompt_utils.py
new file mode 100644
index 00000000000..0075b248047
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_tts/prompt_utils.py
@@ -0,0 +1,648 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
+import json
+import math
+import re
+from io import BytesIO
+from pathlib import Path
+from typing import Any
+
+import torch
+from safetensors import safe_open
+
+from vllm_omni.engine.stage_init_utils import _resolve_model_to_local_path
+from vllm_omni.model_executor.models.ming_flash_omni.prompt_utils import (
+    DEFAULT_PROMPT,
+)
+from vllm_omni.model_executor.models.ming_flash_omni.prompt_utils import (
+    create_instruction as _create_ming_instruction,
+)
+
+from .audio_tokenizer.modeling_audio_vae import AudioVAE
+from .config_ming_tts import (
+    AUDIO_FRAME_HOP,
+    KEY_CFG,
+    KEY_MAX_DECODE_STEPS,
+    KEY_MIN_DECODE_STEPS,
+    KEY_PROMPT_LATENTS,
+    KEY_REQUEST_ID,
+    KEY_SIGMA,
+    KEY_SPEAKER_EMBEDDING,
+    KEY_TEMPERATURE,
+    LATENT_DIM,
+    PATCH_SIZE,
+    SAMPLE_RATE,
+    VAE_PATCH_SIZE,
+    VISION_START_TOKEN_ID,
+    MingTTSConfig,
+)
+
+_DURATION_SECONDS_RE = re.compile(r"Duration:\s*([0-9]+(?:\.[0-9]+)?)\s*s\b", re.IGNORECASE)
+
+
+def create_instruction(value: Any) -> str | None:
+    if value is None:
+        return None
+    if isinstance(value, str):
+        return value
+    if not isinstance(value, dict):
+        raise ValueError(f"Ming instruction must be str or dict, got {type(value).__name__}")
+    return _create_ming_instruction(value)
+
+
+def parse_duration_seconds(text: str | None) -> float | None:
+    if not isinstance(text, str):
+        return None
+    match = _DURATION_SECONDS_RE.search(text)
+    if match is None:
+        return None
+    try:
+        value = float(match.group(1))
+    except ValueError:
+        return None
+    if value <= 0.0:
+        return None
+    return value
+
+
+def estimate_decode_steps_for_duration(
+    duration_seconds: float,
+    *,
+    sample_rate: int = SAMPLE_RATE,
+    frame_hop: int = AUDIO_FRAME_HOP,
+    patch_size: int = PATCH_SIZE,
+    vae_patch_size: int = VAE_PATCH_SIZE,
+) -> int:
+    if duration_seconds <= 0.0:
+        return 0
+    samples_per_decode_step = int(frame_hop) * int(patch_size) * int(vae_patch_size)
+    required_samples = float(duration_seconds) * float(sample_rate)
+    return max(1, int(math.ceil(required_samples / float(samples_per_decode_step))))
+
+
+def estimate_decode_step_window_for_duration(duration_seconds: float) -> tuple[int, int]:
+    target_steps = estimate_decode_steps_for_duration(duration_seconds)
+    min_steps = max(1, target_steps - 3)
+    max_steps = max(min_steps, target_steps + 3)
+    return min_steps, max_steps
+
+
+def pad_prompt_waveform(
+    waveform: Any,
+    *,
+    patch_size: int = PATCH_SIZE,
+    sample_rate: int = SAMPLE_RATE,
+    frame_hop: int = AUDIO_FRAME_HOP,
+) -> torch.Tensor:
+    tensor = coerce_prompt_waveform(waveform)
+    del frame_hop
+    pad_align = int((float(sample_rate) / 12.5) * int(patch_size))
+    new_len = ((int(tensor.shape[-1]) + pad_align - 1) // pad_align) * pad_align
+    if new_len == int(tensor.shape[-1]):
+        return tensor
+    padded = torch.zeros((1, new_len), dtype=tensor.dtype, device=tensor.device)
+    padded[:, : tensor.shape[-1]] = tensor
+    return padded
+
+
+def coerce_prompt_waveform(value: Any) -> torch.Tensor:
+    if value is None:
+        raise ValueError("prompt waveform cannot be None")
+    if isinstance(value, torch.Tensor):
+        tensor = value.detach()
+        if tensor.ndim == 1:
+            return tensor.unsqueeze(0).to(torch.float32)
+        if tensor.ndim == 2:
+            if tensor.shape[0] != 1:
+                return tensor.reshape(1, -1).to(torch.float32)
+            return tensor.to(torch.float32)
+        raise ValueError(f"Unsupported Ming prompt waveform rank: {tuple(tensor.shape)}")
+    if isinstance(value, (list, tuple)):
+        parts = [coerce_prompt_waveform(item) for item in value if item is not None]
+        if not parts:
+            raise ValueError("prompt waveform list was empty")
+        return torch.cat(parts, dim=-1)
+    return coerce_prompt_waveform(torch.as_tensor(value))
+
+
+def coerce_speaker_embeddings(value: Any, *, use_zero_spk_emb: bool = False) -> list[torch.Tensor] | None:
+    if value is None:
+        return [torch.zeros((192,), dtype=torch.float32)] if use_zero_spk_emb else None
+    if isinstance(value, torch.Tensor):
+        tensor = value.detach()
+        if tensor.ndim == 1:
+            tensor = tensor.unsqueeze(0)
+        if tensor.ndim != 2:
+            raise ValueError(f"Unsupported Ming speaker embedding shape: {tuple(tensor.shape)}")
+        items = [row.reshape(-1).to(torch.float32).cpu() for row in tensor]
+    elif isinstance(value, (list, tuple)):
+        if value and all(not isinstance(item, (list, tuple, torch.Tensor)) for item in value):
+            items = [torch.as_tensor(value).detach().reshape(-1).to(torch.float32).cpu()]
+        else:
+            items = []
+            for item in value:
+                if item is None:
+                    continue
+                if not isinstance(item, torch.Tensor):
+                    item = torch.as_tensor(item)
+                items.append(item.detach().reshape(-1).to(torch.float32).cpu())
+    else:
+        return coerce_speaker_embeddings(torch.as_tensor(value), use_zero_spk_emb=use_zero_spk_emb)
+    if not items:
+        return [torch.zeros((192,), dtype=torch.float32)] if use_zero_spk_emb else None
+    for item in items:
+        if int(item.numel()) != 192:
+            raise ValueError(f"Ming speaker embedding must have 192 dims, got {int(item.numel())}")
+    return items
+
+
+def count_prompt_latent_patches(
+    value: Any,
+    *,
+    patch_size: int = PATCH_SIZE,
+    latent_dim: int = LATENT_DIM,
+) -> int:
+    if value is None:
+        return 0
+    if not isinstance(value, torch.Tensor):
+        value = torch.as_tensor(value)
+    latents = value.detach()
+    if latents.ndim == 3 and latents.shape[0] == 1:
+        latents = latents.squeeze(0)
+    if latents.ndim == 3 and latents.shape[-2:] == (patch_size, latent_dim):
+        return int(latents.shape[0])
+    if latents.ndim != 2 or latents.shape[-1] != latent_dim:
+        raise ValueError(f"Unsupported Ming prompt_latents shape: {tuple(latents.shape)}")
+    if latents.shape[0] % patch_size != 0:
+        raise ValueError(
+            f"Ming prompt_latents frame count must be divisible by patch_size={patch_size}, "
+            f"got frames={int(latents.shape[0])}"
+        )
+    return int(latents.shape[0] // patch_size)
+
+
+def count_prompt_waveform_patches(
+    value: Any,
+    *,
+    patch_size: int = PATCH_SIZE,
+    frame_hop: int = AUDIO_FRAME_HOP,
+    vae_patch_size: int = VAE_PATCH_SIZE,
+) -> int:
+    if value is None:
+        return 0
+    waveform = pad_prompt_waveform(value, patch_size=patch_size, frame_hop=frame_hop)
+    frame_count = int(math.ceil(float(waveform.shape[-1]) / float(frame_hop)))
+    latent_frames = int(math.ceil(float(frame_count) / float(vae_patch_size)))
+    if latent_frames % int(patch_size) != 0:
+        raise ValueError(
+            f"Ming prompt waveform produced latent frame count not divisible by patch_size={patch_size}: "
+            f"frames={latent_frames}"
+        )
+    return int(latent_frames // int(patch_size))
+
+
+def resolve_effective_runtime_controls(
+    *,
+    text: str,
+    runtime_controls: dict[str, Any] | None = None,
+) -> dict[str, Any]:
+    controls = {} if runtime_controls is None else dict(runtime_controls)
+    has_explicit_min = KEY_MIN_DECODE_STEPS in controls and controls[KEY_MIN_DECODE_STEPS] is not None
+    has_explicit_max = KEY_MAX_DECODE_STEPS in controls and controls[KEY_MAX_DECODE_STEPS] is not None
+    if has_explicit_min or has_explicit_max:
+        return controls
+    duration_seconds = parse_duration_seconds(text)
+    if duration_seconds is None:
+        return controls
+    min_decode_steps, max_decode_steps = estimate_decode_step_window_for_duration(duration_seconds)
+    controls[KEY_MIN_DECODE_STEPS] = min_decode_steps
+    controls[KEY_MAX_DECODE_STEPS] = max_decode_steps
+    return controls
+
+
+def build_dense_prompt_token_ids(
+    tokenizer: Any,
+    *,
+    prompt: str,
+    text: str,
+    instruction: str | None = None,
+    prompt_text: str | None = None,
+    speaker_count: int = 0,
+    prompt_patch_count: int = 0,
+) -> list[int]:
+    speaker_prompt = []
+    for idx in range(int(speaker_count)):
+        speaker_prompt.extend(
+            tokenizer.encode(f"  speaker_{idx + 1}:")
+            + tokenizer.encode("<|vision_start|>")
+            + tokenizer.encode("<|vision_pad|>")
+            + tokenizer.encode("<|vision_end|>\n")
+        )
+    instruction_prompt = (
+        tokenizer.encode(instruction) + tokenizer.encode("<|endoftext|>") if instruction is not None else []
+    )
+    prompt_text_tokens = (
+        tokenizer.encode(prompt_text) if int(prompt_patch_count) > 0 and prompt_text is not None else []
+    )
+    prompt_latent_tokens = [tokenizer.convert_tokens_to_ids("<audioPatch>")] * int(prompt_patch_count)
+    text_input_prefix = (
+        []
+        if all(token in text for token in ("Genre: ", "Mood: ", "Instrument: ", "Theme: ", "Duration: "))
+        else tokenizer.encode(" Text input:\n")
+    )
+    return (
+        tokenizer.encode("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n")
+        + tokenizer.encode("<|im_start|>user\n")
+        + tokenizer.encode(prompt)
+        + speaker_prompt
+        + text_input_prefix
+        + prompt_text_tokens
+        + tokenizer.encode(text)
+        + tokenizer.encode("<|im_end|>\n")
+        + tokenizer.encode("<|im_start|>assistant\n")
+        + instruction_prompt
+        + tokenizer.encode("<audio>")
+        + prompt_latent_tokens
+    )
+
+
+def build_ming_dense_prompt(
+    tokenizer: Any,
+    *,
+    prompt: str,
+    text: str,
+    runtime_controls: dict[str, Any] | None = None,
+    instruction: Any = None,
+    prompt_text: str | None = None,
+    prompt_waveform: Any = None,
+    prompt_latents: Any = None,
+    speaker_embedding: Any = None,
+    use_zero_spk_emb: bool = False,
+    request_id: str | None = None,
+) -> dict[str, Any]:
+    instruction_text = create_instruction(instruction)
+    speaker_embeddings = coerce_speaker_embeddings(speaker_embedding, use_zero_spk_emb=use_zero_spk_emb)
+    effective_runtime_controls = resolve_effective_runtime_controls(text=text, runtime_controls=runtime_controls)
+
+    prompt_waveform_tensor = None
+    prompt_patch_count = 0
+    if prompt_waveform is not None:
+        prompt_waveform_tensor = pad_prompt_waveform(prompt_waveform)
+        prompt_patch_count = count_prompt_waveform_patches(prompt_waveform_tensor)
+    if prompt_waveform_tensor is not None and prompt_latents is not None:
+        raise ValueError(
+            "Ming waveform cloning request provided both raw prompt_waveform and explicit prompt_latents. "
+            "Choose exactly one source of truth."
+        )
+
+    prompt_latent_value = None
+    if prompt_waveform_tensor is not None and prompt_text is None:
+        raise ValueError(
+            "Ming prompt_waveform requires prompt_text for prompt-latent conditioning. "
+            "Use speaker_embedding for reference-audio-only speaker conditioning."
+        )
+    if prompt_latents is not None:
+        prompt_latent_value = torch.as_tensor(prompt_latents)
+        prompt_patch_count = count_prompt_latent_patches(
+            prompt_latent_value, patch_size=PATCH_SIZE, latent_dim=LATENT_DIM
+        )
+
+    prompt_token_ids = build_dense_prompt_token_ids(
+        tokenizer,
+        prompt=prompt,
+        text=text,
+        instruction=instruction_text,
+        prompt_text=prompt_text if prompt_patch_count > 0 else None,
+        speaker_count=0 if speaker_embeddings is None else len(speaker_embeddings),
+        prompt_patch_count=prompt_patch_count,
+    )
+
+    additional_information = {}
+    for key, value in effective_runtime_controls.items():
+        if isinstance(value, torch.Tensor):
+            additional_information[key] = value
+        elif key in (KEY_MIN_DECODE_STEPS, KEY_MAX_DECODE_STEPS):
+            additional_information[key] = torch.tensor(int(value), dtype=torch.int32)
+        else:
+            additional_information[key] = torch.tensor(float(value), dtype=torch.float32)
+    if request_id is not None:
+        additional_information[KEY_REQUEST_ID] = request_id
+    if instruction_text is not None:
+        additional_information["instruction"] = instruction_text
+    if prompt_text is not None:
+        additional_information["prompt_text"] = prompt_text
+    if prompt_waveform_tensor is not None:
+        additional_information["prompt_waveform"] = prompt_waveform_tensor
+        additional_information["prompt_waveform_length"] = torch.tensor(
+            [int(prompt_waveform_tensor.shape[-1])], dtype=torch.int32
+        )
+    if prompt_latent_value is not None:
+        additional_information[KEY_PROMPT_LATENTS] = prompt_latent_value
+    if speaker_embeddings is not None:
+        additional_information[KEY_SPEAKER_EMBEDDING] = (
+            speaker_embeddings[0] if len(speaker_embeddings) == 1 else torch.stack(speaker_embeddings, dim=0)
+        )
+    if use_zero_spk_emb:
+        additional_information["use_zero_spk_emb"] = True
+    return {
+        "prompt": prompt,
+        "text": text,
+        "prompt_token_ids": prompt_token_ids,
+        "additional_information": additional_information,
+    }
+
+
+def build_runtime_controls(
+    *,
+    cfg: float | None = None,
+    sigma: float | None = None,
+    temperature: float | None = None,
+    min_decode_steps: int | None = None,
+    max_decode_steps: int | None = None,
+) -> dict[str, torch.Tensor]:
+    controls = {}
+    if cfg is not None:
+        controls[KEY_CFG] = torch.tensor(float(cfg), dtype=torch.float32)
+    if sigma is not None:
+        controls[KEY_SIGMA] = torch.tensor(float(sigma), dtype=torch.float32)
+    if temperature is not None:
+        controls[KEY_TEMPERATURE] = torch.tensor(float(temperature), dtype=torch.float32)
+    if min_decode_steps is not None:
+        controls[KEY_MIN_DECODE_STEPS] = torch.tensor(int(min_decode_steps), dtype=torch.int32)
+    if max_decode_steps is not None:
+        controls[KEY_MAX_DECODE_STEPS] = torch.tensor(int(max_decode_steps), dtype=torch.int32)
+    return controls
+
+
+def _resolve_prompt_latents(wrapper: Any, info_dict: dict[str, Any]) -> dict[str, torch.Tensor] | None:
+    raw_latents = info_dict.get(KEY_PROMPT_LATENTS, info_dict.get("prompt_latents"))
+    raw_waveform = info_dict.get("prompt_waveform", info_dict.get("prompt_waveforms"))
+    if raw_latents is not None and raw_waveform is not None:
+        raise ValueError(
+            "Ming waveform cloning request provided both raw prompt_waveform and explicit prompt_latents. "
+            "Choose exactly one source of truth."
+        )
+
+    direct_latents = _coerce_prompt_latents(
+        raw_latents,
+        patch_size=wrapper.ming_config.patch_size,
+        latent_dim=wrapper.ming_config.latent_dim,
+    )
+    if direct_latents is not None:
+        return direct_latents
+    if raw_waveform is None:
+        return None
+
+    encode_fn = getattr(wrapper, "_encode_prompt_waveform_to_latents", None)
+    if callable(encode_fn):
+        latents = encode_fn(raw_waveform, info_dict.get("prompt_waveform_length"))
+    else:
+        latents = _encode_prompt_waveform_to_latents(
+            wrapper,
+            raw_waveform,
+            info_dict.get("prompt_waveform_length"),
+        )
+    return _coerce_prompt_latents(
+        latents,
+        patch_size=wrapper.ming_config.patch_size,
+        latent_dim=wrapper.ming_config.latent_dim,
+    )
+
+
+def _load_prompt_encoder(wrapper: Any) -> AudioVAE:
+    if wrapper._prompt_encoder is not None:
+        return wrapper._prompt_encoder
+    if wrapper.ming_config.audio_tokenizer_config is None:
+        raise RuntimeError("Ming Stage-0 requires audio_tokenizer_config to encode prompt audio.")
+
+    encoder = AudioVAE(wrapper.ming_config.audio_tokenizer_config).eval()
+    state_dict = encoder.state_dict()
+    loaded = 0
+    loaded_encoder_params = set()
+    with torch.no_grad():
+        for shard_path in _iter_model_safetensors(
+            _resolve_model_to_local_path(str(wrapper.vllm_config.model_config.model))
+        ):
+            with safe_open(str(shard_path), framework="pt", device="cpu") as handle:
+                for key in handle.keys():
+                    if not key.startswith("audio.encoder."):
+                        continue
+                    name = key[len("audio.") :]
+                    if name not in state_dict:
+                        continue
+                    target = state_dict[name]
+                    target.copy_(handle.get_tensor(key).to(device=target.device, dtype=target.dtype))
+                    loaded += 1
+                    loaded_encoder_params.add(name)
+    if loaded == 0:
+        raise RuntimeError("Ming prompt encoder received no audio.encoder.* weights from checkpoint.")
+
+    expected_encoder_params = {f"encoder.{name}" for name, _ in encoder.encoder.named_parameters()}
+    missing = expected_encoder_params - loaded_encoder_params
+    if missing:
+        raise RuntimeError(f"Ming prompt encoder: {len(missing)} params not loaded. First few: {sorted(missing)[:5]}")
+
+    dev = next(wrapper.parameters()).device
+    try:
+        del encoder.decoder
+        encoder.decoder = None
+        if dev.type != "cpu":
+            encoder.encoder.to(dev, dtype=getattr(wrapper.model, "fm_dtype", torch.bfloat16))
+        else:
+            encoder.encoder.to(dev)
+    except Exception as exc:
+        raise RuntimeError(f"Failed to move Ming prompt encoder to {dev}: {exc}") from exc
+    wrapper._prompt_encoder = encoder
+    return encoder
+
+
+@torch.inference_mode()
+def _encode_prompt_waveform_to_latents(wrapper: Any, waveform: Any, waveform_length: Any = None) -> torch.Tensor:
+    encoder = _load_prompt_encoder(wrapper)
+    waveform = _normalize_prompt_waveform(waveform, target_sr=wrapper.ming_config.sample_rate)
+    waveform = pad_prompt_waveform(
+        waveform,
+        patch_size=wrapper.ming_config.patch_size,
+        sample_rate=wrapper.ming_config.sample_rate,
+        frame_hop=wrapper.ming_config.audio_frame_hop,
+    )
+    dev = next(encoder.encoder.parameters()).device
+    waveform = waveform.to(device=dev, dtype=next(encoder.encoder.parameters()).dtype)
+    if waveform_length is None:
+        waveform_length = torch.full((waveform.shape[0],), waveform.shape[-1], dtype=torch.int32, device=dev)
+    elif not isinstance(waveform_length, torch.Tensor):
+        waveform_length = torch.as_tensor(waveform_length, dtype=torch.int32, device=dev)
+    else:
+        waveform_length = waveform_length.to(device=dev, dtype=torch.int32)
+
+    latents, _ = encoder.encode_latent(waveform, waveform_length)
+    if latents.ndim == 3 and latents.shape[0] == 1:
+        latents = latents.squeeze(0)
+    count_prompt_latent_patches(
+        latents,
+        patch_size=wrapper.ming_config.patch_size,
+        latent_dim=wrapper.ming_config.latent_dim,
+    )
+    return latents.detach().to(dtype=torch.float32).contiguous()
+
+
+def _iter_model_safetensors(local_model_path: str) -> list[Path]:
+    model_root = Path(local_model_path)
+    index_path = model_root / "model.safetensors.index.json"
+    if index_path.exists():
+        with index_path.open("r", encoding="utf-8") as handle:
+            index_data = json.load(handle)
+        filenames = sorted(set(index_data.get("weight_map", {}).values()))
+        if not filenames:
+            raise RuntimeError(f"No checkpoint shards listed in {index_path}")
+        return [model_root / filename for filename in filenames]
+
+    single_file = model_root / "model.safetensors"
+    if single_file.exists():
+        return [single_file]
+
+    files = sorted(model_root.glob("*.safetensors"))
+    if not files:
+        raise RuntimeError(f"No .safetensors checkpoint found under {local_model_path}")
+    return files
+
+
+def _normalize_prompt_waveform(value: Any, *, target_sr: int) -> torch.Tensor:
+    if isinstance(value, bytes):
+        import torchaudio
+
+        waveform, sr = torchaudio.load(BytesIO(value))
+        waveform = waveform[:1].to(torch.float32)
+        if int(sr) != int(target_sr):
+            from torchaudio.functional import resample as resample_audio
+
+            waveform = resample_audio(waveform, int(sr), int(target_sr))
+        return waveform
+
+    if isinstance(value, tuple) and len(value) == 2 and isinstance(value[1], int):
+        waveform = coerce_prompt_waveform(value[0])
+        if int(value[1]) != int(target_sr):
+            from torchaudio.functional import resample as resample_audio
+
+            waveform = resample_audio(waveform, int(value[1]), int(target_sr))
+        return waveform
+
+    if isinstance(value, dict):
+        samples = value.get("samples", value.get("array", value.get("waveform")))
+        sr = value.get("sample_rate", value.get("sr", target_sr))
+        return _normalize_prompt_waveform((samples, int(sr)), target_sr=target_sr)
+
+    return coerce_prompt_waveform(value)
+
+
+def _coerce_prompt_latents(
+    value: Any,
+    *,
+    patch_size: int,
+    latent_dim: int,
+) -> dict[str, torch.Tensor] | None:
+    if value is None:
+        return None
+    if not isinstance(value, torch.Tensor):
+        value = torch.as_tensor(value)
+
+    latents = value.detach()
+    if latents.ndim == 3 and latents.shape[0] == 1:
+        latents = latents.squeeze(0)
+
+    if latents.ndim == 3 and latents.shape[-2:] == (patch_size, latent_dim):
+        patches = latents
+        # [Patch, Time, Dimension] -> [Frame, Dimension] for history seeding.
+        frames = patches.reshape(-1, latent_dim)
+        return {"patches": patches, "frames": frames}
+
+    if latents.ndim != 2 or latents.shape[-1] != latent_dim:
+        raise ValueError(f"Unsupported prompt latent shape: {tuple(latents.shape)}")
+    if latents.shape[0] % patch_size != 0:
+        raise ValueError(
+            f"Prompt latent frame count must be divisible by patch_size={patch_size}, "
+            f"got frames={int(latents.shape[0])}"
+        )
+    # [Frame, Dimension] -> [Patch, Time, Dimension] for Aggregator prompt slots.
+    patches = latents.reshape(-1, patch_size, latent_dim) if latents.shape[0] > 0 else None
+    return {"patches": patches, "frames": latents}
+
+
+def _initial_history(
+    frames: torch.Tensor | None,
+    *,
+    history_size: int,
+    latent_dim: int,
+    device: torch.device,
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    history = torch.zeros((history_size, latent_dim), device=device, dtype=dtype)
+    if frames is None or frames.numel() == 0:
+        return history
+    frames = frames.to(device=device, dtype=dtype)
+    take = min(history_size, int(frames.shape[0]))
+    history[-take:] = frames[-take:]
+    return history
+
+
+def _take_scalar(value: Any, idx: int) -> float | None:
+    if not isinstance(value, torch.Tensor) or value.numel() == 0:
+        return None
+    return float(value.reshape(-1)[idx].item())
+
+
+def _find_audio_placeholder_positions(input_ids: torch.Tensor, cfg: MingTTSConfig) -> torch.Tensor:
+    dummy_pos = (input_ids == cfg.audio_dummy_token_id).nonzero(as_tuple=True)[0]
+    if dummy_pos.numel() == 0:
+        return dummy_pos
+
+    audio_start_pos = (input_ids == cfg.audio_start_token_id).nonzero(as_tuple=True)[0]
+    audio_end_pos = (input_ids == cfg.audio_end_token_id).nonzero(as_tuple=True)[0]
+    if audio_start_pos.numel() == 0:
+        return dummy_pos
+
+    start = int(audio_start_pos[0].item())
+    end = int(audio_end_pos[0].item()) if audio_end_pos.numel() > 0 else int(input_ids.shape[0])
+    keep = (dummy_pos > start) & (dummy_pos < end)
+    filtered = dummy_pos[keep]
+    return filtered if filtered.numel() > 0 else dummy_pos
+
+
+def _find_speaker_placeholder_positions(input_ids: torch.Tensor, hf_config: Any) -> list[int]:
+    vision_start_token_id = getattr(hf_config, "vision_start_token_id", VISION_START_TOKEN_ID)
+    vision_start_pos = (input_ids == int(vision_start_token_id)).nonzero(as_tuple=True)[0]
+    if vision_start_pos.numel() == 0:
+        return []
+
+    slots = []
+    for pos in vision_start_pos:
+        slot = int(pos.item()) + 1
+        if slot < int(input_ids.shape[0]):
+            slots.append(slot)
+    return slots
+
+
+__all__ = [
+    "DEFAULT_PROMPT",
+    "build_dense_prompt_token_ids",
+    "build_ming_dense_prompt",
+    "build_runtime_controls",
+    "coerce_prompt_waveform",
+    "coerce_speaker_embeddings",
+    "count_prompt_latent_patches",
+    "count_prompt_waveform_patches",
+    "create_instruction",
+    "estimate_decode_step_window_for_duration",
+    "estimate_decode_steps_for_duration",
+    "pad_prompt_waveform",
+    "parse_duration_seconds",
+    "resolve_effective_runtime_controls",
+    "_coerce_prompt_latents",
+    "_find_audio_placeholder_positions",
+    "_find_speaker_placeholder_positions",
+    "_initial_history",
+    "_resolve_prompt_latents",
+    "_take_scalar",
+]

From f5dd5bb83a18739d97d6a196d277f82c85445525 Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Wed, 13 May 2026 20:32:13 +0530
Subject: [PATCH 23/54] Fix Ming dense config initialization order

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 vllm_omni/model_executor/models/ming_tts/config_ming_tts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_omni/model_executor/models/ming_tts/config_ming_tts.py b/vllm_omni/model_executor/models/ming_tts/config_ming_tts.py
index 2c24b438890..f0a08768a91 100644
--- a/vllm_omni/model_executor/models/ming_tts/config_ming_tts.py
+++ b/vllm_omni/model_executor/models/ming_tts/config_ming_tts.py
@@ -90,11 +90,11 @@ def __init__(
         architectures: list[str] | None = None,
         **kwargs: Any,
     ) -> None:
-        super().__init__(architectures=architectures, **kwargs)
         self.llm_config = _coerce_qwen2_config(llm_config or {})
         self.ditar_config = dict(ditar_config or {})
         self.aggregator_config = dict(aggregator_config or {})
         self.audio_tokenizer_config = _coerce_ming_dense_audio_vae_config(audio_tokenizer_config)
+        super().__init__(architectures=architectures, **kwargs)
 
     def get_text_config(self, decoder: bool = False, **kwargs: Any) -> Qwen2Config:
         del decoder, kwargs

From 9609ecededc4d333960c884f78cc95fcad3a9e89 Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Wed, 13 May 2026 20:34:33 +0530
Subject: [PATCH 24/54] Fix Ming e2e test fixture scope and WAV validation

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 tests/e2e/offline_inference/test_ming_tts.py |  6 +-
 tests/e2e/online_serving/test_ming_tts.py    | 62 ++++++++++++++++----
 2 files changed, 53 insertions(+), 15 deletions(-)

diff --git a/tests/e2e/offline_inference/test_ming_tts.py b/tests/e2e/offline_inference/test_ming_tts.py
index 87a76a8648d..b979f234d8a 100644
--- a/tests/e2e/offline_inference/test_ming_tts.py
+++ b/tests/e2e/offline_inference/test_ming_tts.py
@@ -35,7 +35,7 @@ def ming_tokenizer():
     return AutoTokenizer.from_pretrained(MODEL, trust_remote_code=False)
 
 
-@pytest.fixture(scope="module")
+@pytest.fixture
 def ming_engine():
     with OmniRunner(
         MODEL,
@@ -46,7 +46,7 @@ def ming_engine():
         yield runner.omni
 
 
-@pytest.fixture(scope="module")
+@pytest.fixture
 def async_omni_engine():
     engine = AsyncOmni(
         model=MODEL,
@@ -272,7 +272,7 @@ async def _run() -> None:
                 else:
                     audio_chunk = audio.float().detach().cpu()
             elif isinstance(audio, list):
-                audio_chunk = torch.as_tensor(audio[chunk_idx], dtype=torch.float32).reshape(-1).cpu()
+                audio_chunk = _flatten_audio(audio)
             else:
                 audio_chunk = torch.as_tensor(audio, dtype=torch.float32).reshape(-1).cpu()
             accumulated_samples += int(audio_chunk.numel())
diff --git a/tests/e2e/online_serving/test_ming_tts.py b/tests/e2e/online_serving/test_ming_tts.py
index 3f1bddbd51e..c670addaff3 100644
--- a/tests/e2e/online_serving/test_ming_tts.py
+++ b/tests/e2e/online_serving/test_ming_tts.py
@@ -38,6 +38,48 @@ def _wav_sample_rate(audio_bytes: bytes) -> int:
         return int(wav_file.getframerate())
 
 
+def _assert_wav_audio(audio_bytes: bytes) -> None:
+    assert len(audio_bytes) > 44, f"Expected WAV payload, got {len(audio_bytes)} bytes"
+    assert audio_bytes[:4] == b"RIFF", "Expected RIFF WAV header"
+    assert audio_bytes[8:12] == b"WAVE", "Expected WAVE WAV header"
+    sample_rate = _wav_sample_rate(audio_bytes)
+    assert sample_rate == SAMPLE_RATE, f"Expected Ming output sample rate {SAMPLE_RATE}, got {sample_rate}"
+
+
+def _read_non_streaming_audio(openai_client, request_config: dict) -> bytes:
+    kwargs = {
+        "model": request_config["model"],
+        "input": request_config["input"],
+        "response_format": request_config["response_format"],
+        "timeout": request_config.get("timeout", 300.0),
+    }
+    if request_config.get("voice") is not None:
+        kwargs["voice"] = request_config["voice"]
+    response = openai_client.client.audio.speech.create(**kwargs)
+    if hasattr(response, "read") and callable(response.read):
+        return response.read()
+    if hasattr(response, "content"):
+        return response.content
+    raise TypeError(f"Unsupported audio speech response type: {type(response)}")
+
+
+def _read_streaming_audio(openai_client, request_config: dict) -> bytes:
+    data = bytearray()
+    kwargs = {
+        "model": request_config["model"],
+        "input": request_config["input"],
+        "response_format": request_config["response_format"],
+        "timeout": request_config.get("timeout", 300.0),
+    }
+    if request_config.get("voice") is not None:
+        kwargs["voice"] = request_config["voice"]
+    with openai_client.client.audio.speech.with_streaming_response.create(**kwargs) as response:
+        for chunk in response.iter_bytes():
+            if chunk:
+                data.extend(chunk)
+    return bytes(data)
+
+
 @pytest.mark.advanced_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "L4"}, num_cards=1)
@@ -49,6 +91,7 @@ def test_ming_tts_audio_speech_non_streaming(omni_server, openai_client) -> None
         "input": "我会一直在这里陪着你，直到你慢慢地沉入那个最温柔的梦里。",
         "stream": False,
         "response_format": "wav",
+        "timeout": 300.0,
     }
     request_inputs = [
         "我会一直在这里陪着你，直到你慢慢地沉入那个最温柔的梦里。",
@@ -57,9 +100,8 @@ def test_ming_tts_audio_speech_non_streaming(omni_server, openai_client) -> None
 
     def _send_one(text):
         per_request_config = {**request_config, "input": text}
-        responses = openai_client.send_audio_speech_request(per_request_config)
-        assert len(responses) == 1
-        return text, responses[0]
+        audio_bytes = _read_non_streaming_audio(openai_client, per_request_config)
+        return text, audio_bytes
 
     with concurrent.futures.ThreadPoolExecutor(max_workers=len(request_inputs)) as executor:
         futures = [executor.submit(_send_one, text) for text in request_inputs]
@@ -67,10 +109,8 @@ def _send_one(text):
 
     assert {text for text, _ in results} == set(request_inputs)
     assert len(results) == len(request_inputs)
-    for _, response in results:
-        assert response.audio_bytes is not None, "Expected WAV bytes from /v1/audio/speech"
-        sample_rate = _wav_sample_rate(response.audio_bytes)
-        assert sample_rate == SAMPLE_RATE, f"Expected Ming output sample rate {SAMPLE_RATE}, got {sample_rate}"
+    for _, audio_bytes in results:
+        _assert_wav_audio(audio_bytes)
 
 
 @pytest.mark.advanced_model
@@ -85,9 +125,7 @@ def test_ming_tts_audio_speech_streaming(omni_server, openai_client) -> None:
         "voice": "灵小甄",
         "stream": True,
         "response_format": "wav",
+        "timeout": 300.0,
     }
-    responses = openai_client.send_audio_speech_request(request_config)
-    assert len(responses) == 1
-    assert responses[0].audio_bytes is not None, "Expected streamed WAV bytes from /v1/audio/speech"
-    sample_rate = _wav_sample_rate(responses[0].audio_bytes)
-    assert sample_rate == SAMPLE_RATE, f"Expected Ming output sample rate {SAMPLE_RATE}, got {sample_rate}"
+    audio_bytes = _read_streaming_audio(openai_client, request_config)
+    _assert_wav_audio(audio_bytes)

From bf95f769ac835a4ff1c99c32a9de38c237077520 Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Wed, 13 May 2026 20:58:21 +0530
Subject: [PATCH 25/54] Disable prefix caching for Ming dense TTS

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 vllm_omni/deploy/ming_tts.yaml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/vllm_omni/deploy/ming_tts.yaml b/vllm_omni/deploy/ming_tts.yaml
index 327f61a1785..1a2dc99eca1 100644
--- a/vllm_omni/deploy/ming_tts.yaml
+++ b/vllm_omni/deploy/ming_tts.yaml
@@ -8,6 +8,7 @@ pipeline: ming_tts
 async_chunk: true
 trust_remote_code: false
 dtype: bfloat16
+enable_prefix_caching: false
 
 connectors:
   connector_of_shared_memory:
@@ -21,6 +22,7 @@ stages:
     max_num_seqs: 1
     gpu_memory_utilization: 0.45
     enforce_eager: true
+    enable_prefix_caching: false
     async_scheduling: false
     max_num_batched_tokens: 8192
     max_model_len: 8192
@@ -38,6 +40,7 @@ stages:
     max_num_seqs: 1
     gpu_memory_utilization: 0.25
     enforce_eager: true
+    enable_prefix_caching: false
     async_scheduling: false
     max_num_batched_tokens: 8192
     max_model_len: 8192

From 51701719830a0c530e5356d2095ac6c63e39d11f Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Wed, 13 May 2026 21:33:57 +0530
Subject: [PATCH 26/54] test(ming_tts): simplify online serving e2e tests

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 tests/e2e/online_serving/test_ming_tts.py | 131 ++++++++--------------
 1 file changed, 46 insertions(+), 85 deletions(-)

diff --git a/tests/e2e/online_serving/test_ming_tts.py b/tests/e2e/online_serving/test_ming_tts.py
index c670addaff3..29503325901 100644
--- a/tests/e2e/online_serving/test_ming_tts.py
+++ b/tests/e2e/online_serving/test_ming_tts.py
@@ -1,131 +1,92 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+E2E online serving tests for Ming-omni-tts (dense) model.
+Tests text-to-audio via /v1/audio/speech endpoint.
+"""
 
-"""E2E online-serving tests for Ming-omni-tts."""
-
-import concurrent.futures
-import io
 import os
-import wave
 
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
-os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0"
 
 import pytest
 
 from tests.helpers.mark import hardware_test
 from tests.helpers.runtime import OmniServerParams
 from tests.helpers.stage_config import get_deploy_config_path
-from vllm_omni.model_executor.models.ming_tts.config_ming_tts import SAMPLE_RATE
+
+pytestmark = [pytest.mark.advanced_model, pytest.mark.omni]
 
 MODEL = "inclusionAI/Ming-omni-tts-0.5B"
 DEPLOY_CONFIG = get_deploy_config_path("ming_tts.yaml")
 
-SERVER_PARAMS = [
+no_async_chunk_params = [
     pytest.param(
         OmniServerParams(
             model=MODEL,
             stage_config_path=DEPLOY_CONFIG,
-            server_args=["--enforce-eager", "--disable-log-stats"],
+            server_args=["--enforce-eager", "--no-async-chunk"],
         ),
-        id="async_chunk",
+        id="no_async_chunk",
     )
 ]
 
-
-def _wav_sample_rate(audio_bytes: bytes) -> int:
-    with wave.open(io.BytesIO(audio_bytes), "rb") as wav_file:
-        return int(wav_file.getframerate())
-
-
-def _assert_wav_audio(audio_bytes: bytes) -> None:
-    assert len(audio_bytes) > 44, f"Expected WAV payload, got {len(audio_bytes)} bytes"
-    assert audio_bytes[:4] == b"RIFF", "Expected RIFF WAV header"
-    assert audio_bytes[8:12] == b"WAVE", "Expected WAVE WAV header"
-    sample_rate = _wav_sample_rate(audio_bytes)
-    assert sample_rate == SAMPLE_RATE, f"Expected Ming output sample rate {SAMPLE_RATE}, got {sample_rate}"
-
-
-def _read_non_streaming_audio(openai_client, request_config: dict) -> bytes:
-    kwargs = {
-        "model": request_config["model"],
-        "input": request_config["input"],
-        "response_format": request_config["response_format"],
-        "timeout": request_config.get("timeout", 300.0),
-    }
-    if request_config.get("voice") is not None:
-        kwargs["voice"] = request_config["voice"]
-    response = openai_client.client.audio.speech.create(**kwargs)
-    if hasattr(response, "read") and callable(response.read):
-        return response.read()
-    if hasattr(response, "content"):
-        return response.content
-    raise TypeError(f"Unsupported audio speech response type: {type(response)}")
+async_chunk_params = [
+    pytest.param(
+        OmniServerParams(
+            model=MODEL,
+            stage_config_path=DEPLOY_CONFIG,
+            server_args=["--enforce-eager"],
+        ),
+        id="async_chunk",
+    )
+]
 
 
-def _read_streaming_audio(openai_client, request_config: dict) -> bytes:
-    data = bytearray()
-    kwargs = {
-        "model": request_config["model"],
-        "input": request_config["input"],
-        "response_format": request_config["response_format"],
-        "timeout": request_config.get("timeout", 300.0),
+def get_prompt(prompt_type="zh"):
+    prompts = {
+        "zh": "我会一直在这里陪着你，直到你慢慢地沉入那个最温柔的梦里。",
+        "zh_short": "这款产品的名字，叫变态坑爹牛肉丸。",
     }
-    if request_config.get("voice") is not None:
-        kwargs["voice"] = request_config["voice"]
-    with openai_client.client.audio.speech.with_streaming_response.create(**kwargs) as response:
-        for chunk in response.iter_bytes():
-            if chunk:
-                data.extend(chunk)
-    return bytes(data)
+    return prompts.get(prompt_type, prompts["zh"])
 
 
-@pytest.mark.advanced_model
-@pytest.mark.omni
 @hardware_test(res={"cuda": "L4"}, num_cards=1)
-@pytest.mark.parametrize("omni_server", SERVER_PARAMS, indirect=True)
-def test_ming_tts_audio_speech_non_streaming(omni_server, openai_client) -> None:
-    """Test non-streaming Ming generation through /v1/audio/speech."""
+@pytest.mark.parametrize("omni_server", no_async_chunk_params, indirect=True)
+def test_text_to_audio_non_streaming_001(omni_server, openai_client) -> None:
+    """
+    Deploy Setting: ming_tts.yaml with --no-async-chunk
+    Input Modal: text
+    Output Modal: audio
+    Input Setting: stream=False
+    Datasets: two concurrent requests
+    """
     request_config = {
         "model": omni_server.model,
-        "input": "我会一直在这里陪着你，直到你慢慢地沉入那个最温柔的梦里。",
+        "input": get_prompt("zh"),
         "stream": False,
         "response_format": "wav",
         "timeout": 300.0,
     }
-    request_inputs = [
-        "我会一直在这里陪着你，直到你慢慢地沉入那个最温柔的梦里。",
-        "这款产品的名字，叫变态坑爹牛肉丸。",
-    ]
-
-    def _send_one(text):
-        per_request_config = {**request_config, "input": text}
-        audio_bytes = _read_non_streaming_audio(openai_client, per_request_config)
-        return text, audio_bytes
-
-    with concurrent.futures.ThreadPoolExecutor(max_workers=len(request_inputs)) as executor:
-        futures = [executor.submit(_send_one, text) for text in request_inputs]
-        results = [future.result() for future in concurrent.futures.as_completed(futures)]
-
-    assert {text for text, _ in results} == set(request_inputs)
-    assert len(results) == len(request_inputs)
-    for _, audio_bytes in results:
-        _assert_wav_audio(audio_bytes)
+    openai_client.send_audio_speech_request(request_config, request_num=2)
 
 
-@pytest.mark.advanced_model
-@pytest.mark.omni
 @hardware_test(res={"cuda": "L4"}, num_cards=1)
-@pytest.mark.parametrize("omni_server", SERVER_PARAMS, indirect=True)
-def test_ming_tts_audio_speech_streaming(omni_server, openai_client) -> None:
-    """Test streaming Ming generation through /v1/audio/speech."""
+@pytest.mark.parametrize("omni_server", async_chunk_params, indirect=True)
+def test_text_to_audio_streaming_001(omni_server, openai_client) -> None:
+    """
+    Deploy Setting: ming_tts.yaml (async_chunk=true)
+    Input Modal: text + voice
+    Output Modal: audio (streamed)
+    Input Setting: stream=True
+    Datasets: single request
+    """
     request_config = {
         "model": omni_server.model,
-        "input": "这款产品的名字，叫变态坑爹牛肉丸。",
+        "input": get_prompt("zh_short"),
         "voice": "灵小甄",
         "stream": True,
         "response_format": "wav",
         "timeout": 300.0,
     }
-    audio_bytes = _read_streaming_audio(openai_client, request_config)
-    _assert_wav_audio(audio_bytes)
+    openai_client.send_audio_speech_request(request_config)

From 0b7eeca7c51f208fd581fa973e334f9d96a2b249 Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Wed, 13 May 2026 22:06:09 +0530
Subject: [PATCH 27/54] fix(ming_tts): align llm2audio_vae signature with
 custom_process_input_func convention

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 .../stage_input_processors/ming_tts.py         | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/vllm_omni/model_executor/stage_input_processors/ming_tts.py b/vllm_omni/model_executor/stage_input_processors/ming_tts.py
index cfbf0e401aa..00aa364c570 100644
--- a/vllm_omni/model_executor/stage_input_processors/ming_tts.py
+++ b/vllm_omni/model_executor/stage_input_processors/ming_tts.py
@@ -255,23 +255,17 @@ def llm2audio_vae_async_chunk(
 
 
 def llm2audio_vae(
-    stage_list: list[Any],
-    engine_input_source: list[int],
+    source_outputs: list[Any],
     prompt: OmniTokensPrompt | TextPrompt | None = None,
     requires_multimodal_data: bool = False,
+    streaming_context: Any = None,
 ) -> list[OmniTokensPrompt]:
-    del prompt, requires_multimodal_data
-    if not engine_input_source:
-        raise ValueError("engine_input_source cannot be empty")
-
-    source_stage_id = engine_input_source[0]
-    if source_stage_id >= len(stage_list):
-        raise IndexError(f"Invalid stage_id: {source_stage_id}")
-    if stage_list[source_stage_id].engine_outputs is None:
-        raise RuntimeError(f"Stage {source_stage_id} has no outputs yet")
+    del prompt, requires_multimodal_data, streaming_context
+    if not source_outputs:
+        raise ValueError("source_outputs cannot be empty")
 
     outputs = []
-    for stage_output in stage_list[source_stage_id].engine_outputs:
+    for stage_output in source_outputs:
         finished = bool(getattr(stage_output, "finished", True))
         if not finished:
             continue

From dc3240a800451e897d43163e851922f5d3f7bb85 Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Thu, 14 May 2026 01:01:11 +0530
Subject: [PATCH 28/54] fix(ming_tts): fall back to soundfile when torchcodec
 unavailable

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 .../model_executor/models/ming_tts/speaker_extractor.py    | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/vllm_omni/model_executor/models/ming_tts/speaker_extractor.py b/vllm_omni/model_executor/models/ming_tts/speaker_extractor.py
index 97d5510da0e..5301180aa27 100644
--- a/vllm_omni/model_executor/models/ming_tts/speaker_extractor.py
+++ b/vllm_omni/model_executor/models/ming_tts/speaker_extractor.py
@@ -42,7 +42,12 @@ def extract_from_waveform(self, waveform, sample_rate):
         return embedding.squeeze(0).to(dtype=torch.float32)
 
     def extract_from_file(self, audio_path):
-        waveform, sample_rate = torchaudio.load(audio_path)
+        try:
+            waveform, sample_rate = torchaudio.load(audio_path)
+        except RuntimeError as e:
+            if not any(tok in str(e) for tok in ("torchcodec", "libavutil", "libtorchcodec")):
+                raise
+            waveform, sample_rate = torchaudio.load(audio_path, backend="soundfile")
         return self.extract_from_waveform(waveform, sample_rate)
 
     def extract_many(self, audio_paths):

From b72dea9d6b43bd5c8bc7a5b92ff8a8e027712f29 Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Thu, 14 May 2026 01:08:35 +0530
Subject: [PATCH 29/54] Fix Ming speaker audio fallback

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 .../models/ming_tts/speaker_extractor.py               | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/vllm_omni/model_executor/models/ming_tts/speaker_extractor.py b/vllm_omni/model_executor/models/ming_tts/speaker_extractor.py
index 5301180aa27..2219c46bf38 100644
--- a/vllm_omni/model_executor/models/ming_tts/speaker_extractor.py
+++ b/vllm_omni/model_executor/models/ming_tts/speaker_extractor.py
@@ -18,6 +18,14 @@ def resolve_model_to_local_path(model):
     return snapshot_download(model)
 
 
+def _load_with_soundfile(audio_path):
+    import soundfile as sf
+
+    data, sample_rate = sf.read(audio_path, dtype="float32", always_2d=True)
+    waveform = torch.from_numpy(data).T.contiguous()
+    return waveform, sample_rate
+
+
 class MingSpeakerEmbeddingExtractor:
     def __init__(self, model, target_sr=16000):
         local_model_path = resolve_model_to_local_path(model)
@@ -47,7 +55,7 @@ def extract_from_file(self, audio_path):
         except RuntimeError as e:
             if not any(tok in str(e) for tok in ("torchcodec", "libavutil", "libtorchcodec")):
                 raise
-            waveform, sample_rate = torchaudio.load(audio_path, backend="soundfile")
+            waveform, sample_rate = _load_with_soundfile(audio_path)
         return self.extract_from_waveform(waveform, sample_rate)
 
     def extract_many(self, audio_paths):

From 6edc6ff554a0a949ce6a1a03ee9dd847192f2435 Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Thu, 14 May 2026 01:50:36 +0530
Subject: [PATCH 30/54] Align Ming podcast prompt formatting

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 examples/offline_inference/ming_tts/cases.yaml | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/examples/offline_inference/ming_tts/cases.yaml b/examples/offline_inference/ming_tts/cases.yaml
index 2568a9bad2e..db9fbca30c9 100644
--- a/examples/offline_inference/ming_tts/cases.yaml
+++ b/examples/offline_inference/ming_tts/cases.yaml
@@ -72,16 +72,9 @@ zero_shot:
 
 podcast:
   prompt: "Please generate speech based on the following description.\n"
-  text: |
-    speaker_1:你可以说一下，就大概说一下，可能虽然我也不知道，我看过那部电影没有。
-    speaker_2:就是那个叫什么，变相一节课的嘛。
-    speaker_1:嗯。
-    speaker_2:一部搞笑的电影。
-    speaker_1:一部搞笑的。
+  text: " speaker_1:你可以说一下，就大概说一下，可能虽然我也不知道，我看过那部电影没有。\n speaker_2:就是那个叫什么，变相一节课的嘛。\n speaker_1:嗯。\n speaker_2:一部搞笑的电影。\n speaker_1:一部搞笑的。\n"
   instruction: null
-  prompt_text: |
-    speaker_1:并且我们还要进行每个月还要考核 笔试的话还要进行笔试，做个，当服务员还要去笔试了
-    speaker_2:对啊，这真的很奇怪，就是 单纯的因，单纯自己工资不高，只是因为可能人家那个店比较出名一点，就对你苛刻要求
+  prompt_text: " speaker_1:并且我们还要进行每个月还要考核 笔试的话还要进行笔试，做个，当服务员还要去笔试了\n speaker_2:对啊，这真的很奇怪，就是 单纯的因，单纯自己工资不高，只是因为可能人家那个店比较出名一点，就对你苛刻要求\n"
   requires_ref_audio_count: 2
   auto_extract_speaker_embeddings: true
   max_decode_steps: 200

From 736f7b884c3a56ead0ceb7539df7fb8121bfb070 Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Thu, 14 May 2026 22:14:35 +0530
Subject: [PATCH 31/54] ming-tts: address decode state and ISTFT reuse
 concerns

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 .../models/ming_flash_omni/audio_vae.py       |  97 +-------
 .../models/ming_flash_omni/prompt_utils.py    |   8 +-
 .../models/ming_tts/audio_tokenizer/istft.py  | 210 +-----------------
 .../models/ming_tts/ming_tts_llm.py           |  87 +-------
 .../models/ming_tts/patch_emission.py         |  21 +-
 .../models/ming_tts/prompt_utils.py           |  14 +-
 .../models/ming_utils/__init__.py             |   2 +
 .../models/ming_utils/audio_dsp.py            | 210 ++++++++++++++++++
 8 files changed, 256 insertions(+), 393 deletions(-)
 create mode 100644 vllm_omni/model_executor/models/ming_utils/__init__.py
 create mode 100644 vllm_omni/model_executor/models/ming_utils/audio_dsp.py

diff --git a/vllm_omni/model_executor/models/ming_flash_omni/audio_vae.py b/vllm_omni/model_executor/models/ming_flash_omni/audio_vae.py
index 3137a878ec2..81496744cde 100644
--- a/vllm_omni/model_executor/models/ming_flash_omni/audio_vae.py
+++ b/vllm_omni/model_executor/models/ming_flash_omni/audio_vae.py
@@ -13,6 +13,8 @@
 from transformers.utils import is_flash_attn_2_available
 from vllm.logger import init_logger
 
+from vllm_omni.model_executor.models.ming_utils.audio_dsp import ISTFTHead
+
 logger = init_logger(__name__)
 
 
@@ -36,101 +38,6 @@ def __init__(
         super().__init__(**kwargs)
 
 
-class ISTFT(nn.Module):
-    def __init__(self, n_fft: int, hop_length: int, win_length: int, padding: str = "same"):
-        super().__init__()
-        if padding not in ["center", "same"]:
-            raise ValueError("Padding must be 'center' or 'same'.")
-        self.padding = padding
-        self.n_fft = n_fft
-        self.hop_length = hop_length
-        self.win_length = win_length
-        window = torch.hann_window(win_length)
-        self.register_buffer("window", window)
-        self.buffer_len = self.win_length - self.hop_length
-
-    def _buffer_process(self, x, buffer, pad, last_chunk=False, streaming=False):
-        if streaming:
-            if buffer is None:
-                x = x[:, pad:]
-            if buffer is not None:
-                x[:, : self.buffer_len] += buffer
-            buffer = x[:, -self.buffer_len :]
-            if not last_chunk:
-                x = x[:, : -self.buffer_len]
-            else:
-                x = x[:, :-pad]
-        else:
-            x = x[:, pad:-pad]
-        return x, buffer
-
-    def forward(self, spec, audio_buffer=None, window_buffer=None, streaming=False, last_chunk=False):
-        if self.padding == "center":
-            return torch.istft(spec, self.n_fft, self.hop_length, self.win_length, self.window, center=True)
-        elif self.padding == "same":
-            pad = (self.win_length - self.hop_length) // 2
-        else:
-            raise ValueError("Padding must be 'center' or 'same'.")
-
-        B, N, T = spec.shape
-        ifft = torch.fft.irfft(spec, self.n_fft, dim=1, norm="backward")
-        ifft = ifft * self.window[None, :, None]
-
-        output_size = (T - 1) * self.hop_length + self.win_length
-        y = torch.nn.functional.fold(
-            ifft,
-            output_size=(1, output_size),
-            kernel_size=(1, self.win_length),
-            stride=(1, self.hop_length),
-        )[:, 0, 0, :]
-
-        y, audio_buffer = self._buffer_process(y, audio_buffer, pad, last_chunk=last_chunk, streaming=streaming)
-
-        window_sq = self.window.square().expand(1, T, -1).transpose(1, 2)
-        window_envelope = (
-            torch.nn.functional.fold(
-                window_sq,
-                output_size=(1, output_size),
-                kernel_size=(1, self.win_length),
-                stride=(1, self.hop_length),
-            )
-            .squeeze(0)
-            .squeeze(0)
-        )
-
-        window_envelope, window_buffer = self._buffer_process(
-            window_envelope, window_buffer, pad, last_chunk=last_chunk, streaming=streaming
-        )
-        window_envelope = window_envelope.squeeze()
-
-        assert (window_envelope > 1e-11).all()
-        y = y / window_envelope
-
-        return y, audio_buffer, window_buffer
-
-
-class ISTFTHead(nn.Module):
-    def __init__(self, dim: int, n_fft: int, hop_length: int, padding: str = "same"):
-        super().__init__()
-        out_dim = n_fft + 2
-        self.out = nn.Linear(dim, out_dim)
-        self.istft = ISTFT(n_fft=n_fft, hop_length=hop_length, win_length=n_fft, padding=padding)
-
-    def forward(self, x, audio_buffer=None, window_buffer=None, streaming=False, last_chunk=False):
-        x_pred = self.out(x)
-        x_pred = x_pred.transpose(1, 2)
-        mag, p = x_pred.chunk(2, dim=1)
-        mag = torch.exp(mag)
-        mag = torch.clip(mag, max=1e2)
-        x = torch.cos(p)
-        y = torch.sin(p)
-        S = mag * (x + 1j * y)
-        audio, audio_buffer, window_buffer = self.istft(
-            S, audio_buffer=audio_buffer, window_buffer=window_buffer, streaming=streaming, last_chunk=last_chunk
-        )
-        return audio.unsqueeze(1), x_pred, audio_buffer, window_buffer
-
-
 class StreamingLinearUpsample(nn.Module):
     def __init__(self, scale_factor=4):
         super().__init__()
diff --git a/vllm_omni/model_executor/models/ming_flash_omni/prompt_utils.py b/vllm_omni/model_executor/models/ming_flash_omni/prompt_utils.py
index 4271114bc2d..572713e5b81 100644
--- a/vllm_omni/model_executor/models/ming_flash_omni/prompt_utils.py
+++ b/vllm_omni/model_executor/models/ming_flash_omni/prompt_utils.py
@@ -36,12 +36,18 @@
 }
 
 
-def create_instruction(user_input: dict[str, Any]) -> str:
+def create_instruction(user_input: Any) -> str | None:
     """Return a JSON caption string for ``audio_sequence[0]``.
 
     Only keys already present on the base template are merged in; unknown
     keys are silently ignored to keep the output schema stable.
     """
+    if user_input is None:
+        return None
+    if isinstance(user_input, str):
+        return user_input
+    if not isinstance(user_input, dict):
+        raise ValueError(f"Ming instruction must be str, dict, or None; got {type(user_input).__name__}")
     caption = copy.deepcopy(BASE_CAPTION_TEMPLATE)
     item = caption["audio_sequence"][0]
     for key, value in user_input.items():
diff --git a/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/istft.py b/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/istft.py
index 6824de74140..982762338c2 100644
--- a/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/istft.py
+++ b/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/istft.py
@@ -1,209 +1,3 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# Adopted from https://github.com/inclusionAI/Ming-omni-tts/blob/main/audio_tokenizer/istft.py
+from vllm_omni.model_executor.models.ming_utils.audio_dsp import ISTFT, FourierHead, ISTFTHead
 
-import torch
-import torch.nn as nn
-
-
-class ISTFT(nn.Module):
-    """
-    Custom implementation of ISTFT since torch.istft doesn't allow custom padding (other than `center=True`) with
-    windowing. This is because the NOLA (Nonzero Overlap Add) check fails at the edges.
-    See issue: https://github.com/pytorch/pytorch/issues/62323
-    Specifically, in the context of neural vocoding we are interested in "same" padding analogous to CNNs.
-    The NOLA constraint is met as we trim padded samples anyway.
-
-    Args:
-        n_fft (int): Size of Fourier transform.
-        hop_length (int): The distance between neighboring sliding window frames.
-        win_length (int): The size of window frame and STFT filter.
-        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
-    """
-
-    def __init__(self, n_fft: int, hop_length: int, win_length: int, padding: str = "same"):
-        super().__init__()
-        if padding not in ["center", "same"]:
-            raise ValueError("Padding must be 'center' or 'same'.")
-        self.padding = padding
-        self.n_fft = n_fft
-        self.hop_length = hop_length
-        self.win_length = win_length
-        window = torch.hann_window(win_length)
-        self.register_buffer("window", window)
-
-        self.audio_buffer = None
-        self.window_buffer = None
-        self.buffer_len = self.win_length - self.hop_length
-
-    def __buffer_process(
-        self,
-        x: torch.Tensor,
-        buffer: torch.Tensor | None,
-        pad: int,
-        last_chunk: bool = False,
-        streaming: bool = False,
-    ) -> tuple[torch.Tensor, torch.Tensor | None]:
-        if streaming:
-            if buffer is None:
-                # first chunk
-                x = x[:, pad:]
-            if buffer is not None:
-                # next chunk
-                x[:, : self.buffer_len] += buffer
-            buffer = x[:, -self.buffer_len :]
-            if not last_chunk:
-                x = x[:, : -self.buffer_len]
-            else:
-                x = x[:, :-pad]
-        else:
-            x = x[:, pad:-pad]
-
-        return x, buffer
-
-    def forward(
-        self,
-        spec: torch.Tensor,
-        audio_buffer: torch.Tensor | None = None,
-        window_buffer: torch.Tensor | None = None,
-        streaming: bool = False,
-        last_chunk: bool = False,
-    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor | None, torch.Tensor | None]:
-        """
-        Compute the Inverse Short Time Fourier Transform (ISTFT) of a complex spectrogram.
-
-        Args:
-            spec (Tensor): Input complex spectrogram of shape (B, N, T), where B is the batch size,
-                            N is the number of frequency bins, and T is the number of time frames.
-            audio_buffer (Tensor): [Streaming Input/State] The audio overlap buffer from the previous chunk.
-                            Shape: (B, win_length - hop_length)
-            window_buffer (Tensor): [Streaming Input/State] The window overlap buffer from the previous chunk.
-            streaming: If `True`, the function operates in streaming mode, processing `spec` as a single chunk.
-            last_chunk: When `streaming=True` and `last_chunk=True`, the function can perform final "flush" operations
-
-        Returns:
-            Reconstructed signal, plus streaming buffers when `padding="same"`.
-        """
-        if self.padding == "center":
-            # Fallback to pytorch native implementation
-            return torch.istft(spec, self.n_fft, self.hop_length, self.win_length, self.window, center=True)
-        elif self.padding == "same":
-            pad = (self.win_length - self.hop_length) // 2
-        else:
-            raise ValueError("Padding must be 'center' or 'same'.")
-
-        if spec.dim() != 3:
-            raise ValueError(f"Expected spec rank-3 [Batch, Freq, Time], got {tuple(spec.shape)}")
-        B, N, T = spec.shape
-
-        # Inverse FFT
-        ifft = torch.fft.irfft(spec, self.n_fft, dim=1, norm="backward")
-        ifft = ifft * self.window[None, :, None]
-
-        # Overlap and Add
-        output_size = (T - 1) * self.hop_length + self.win_length
-        y = torch.nn.functional.fold(
-            ifft,
-            output_size=(1, output_size),
-            kernel_size=(1, self.win_length),
-            stride=(1, self.hop_length),
-        )[:, 0, 0, :]
-
-        y, audio_buffer = self.__buffer_process(y, audio_buffer, pad, last_chunk=last_chunk, streaming=streaming)
-
-        # Window envelope
-        window_sq = self.window.square().expand(1, T, -1).transpose(1, 2)
-        window_envelope = (
-            torch.nn.functional.fold(
-                window_sq,
-                output_size=(1, output_size),
-                kernel_size=(1, self.win_length),
-                stride=(1, self.hop_length),
-            )
-            .squeeze(0)
-            .squeeze(0)
-        )
-
-        window_envelope, window_buffer = self.__buffer_process(
-            window_envelope, window_buffer, pad, last_chunk=last_chunk, streaming=streaming
-        )
-        window_envelope = window_envelope.squeeze()
-
-        # Normalize
-        if not (window_envelope > 1e-11).all():
-            raise RuntimeError("ISTFT window envelope underflowed; invalid overlap-add state.")
-        y = y / window_envelope
-
-        return y, audio_buffer, window_buffer
-
-
-class FourierHead(nn.Module):
-    """Base class for inverse fourier modules."""
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """
-        Args:
-            x (Tensor): Input tensor of shape (B, L, H), where B is the batch size,
-                        L is the sequence length, and H denotes the model dimension.
-
-        Returns:
-            Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal.
-        """
-        raise NotImplementedError("Subclasses must implement the forward method.")
-
-
-class ISTFTHead(FourierHead):
-    """
-    ISTFT Head module for predicting STFT complex coefficients.
-
-    Args:
-        dim (int): Hidden dimension of the model.
-        n_fft (int): Size of Fourier transform.
-        hop_length (int): The distance between neighboring sliding window frames, which should align with
-                          the resolution of the input features.
-        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
-    """
-
-    def __init__(self, dim: int, n_fft: int, hop_length: int, padding: str = "same"):
-        super().__init__()
-        out_dim = n_fft + 2
-        self.out = torch.nn.Linear(dim, out_dim)
-        self.istft = ISTFT(n_fft=n_fft, hop_length=hop_length, win_length=n_fft, padding=padding)
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        audio_buffer: torch.Tensor | None = None,
-        window_buffer: torch.Tensor | None = None,
-        streaming: bool = False,
-        last_chunk: bool = False,
-    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None, torch.Tensor | None]:
-        """
-        Forward pass of the ISTFTHead module.
-
-        Args:
-            x (Tensor): Input tensor of shape (B, L, H), where B is the batch size,
-                        L is the sequence length, and H denotes the model dimension.
-
-        Returns:
-            Audio, predicted spectrogram coefficients, and streaming buffers.
-        """
-        x_pred = self.out(x)
-        # x_pred = x
-        x_pred = x_pred.transpose(1, 2)
-        mag, p = x_pred.chunk(2, dim=1)
-        mag = torch.exp(mag)
-        mag = torch.clip(mag, max=1e2)  # safeguard to prevent excessively large magnitudes
-        # wrapping happens here. These two lines produce real and imaginary value
-        x = torch.cos(p)
-        y = torch.sin(p)
-        # recalculating phase here does not produce anything new
-        # only costs time
-        # phase = torch.atan2(y, x)
-        # S = mag * torch.exp(phase * 1j)
-        # better directly produce the complex value
-        S = mag * (x + 1j * y)
-        audio, audio_buffer, window_buffer = self.istft(
-            S, audio_buffer=audio_buffer, window_buffer=window_buffer, streaming=streaming, last_chunk=last_chunk
-        )
-        return audio.unsqueeze(1), x_pred, audio_buffer, window_buffer
+__all__ = ["FourierHead", "ISTFT", "ISTFTHead"]
diff --git a/vllm_omni/model_executor/models/ming_tts/ming_tts_llm.py b/vllm_omni/model_executor/models/ming_tts/ming_tts_llm.py
index 007dac25b06..6d170aca046 100644
--- a/vllm_omni/model_executor/models/ming_tts/ming_tts_llm.py
+++ b/vllm_omni/model_executor/models/ming_tts/ming_tts_llm.py
@@ -49,6 +49,7 @@
     _resolve_runtime_float,
     _resolve_runtime_int,
     _resolve_stop_probs_batch,
+    _validate_ming_decode_window,
 )
 
 logger = init_logger(__name__)
@@ -86,11 +87,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.stop_head.to(dtype=self.fm_dtype)
         self.spk_head.to(dtype=self.fm_dtype)
         self._pending_postprocess_updates: dict[str, dict[str, Any]] = {}
-        self._last_sample_decode_steps = None
-        self._last_sample_stop_probs = None
-        self._last_sample_max_decode_steps = None
-        self._last_sample_min_decode_steps = None
-        self._pending_sample_stop_inputs = None
         self._last_text_mode = False
 
     def embed_input_ids(
@@ -124,6 +120,11 @@ def forward(
         if model_intermediate_buffer is None:
             model_intermediate_buffer = kwargs.get("runtime_additional_information")
         request_infos = _normalize_request_infos(model_intermediate_buffer)
+        _validate_ming_decode_window(
+            request_infos,
+            min_stop_step=int(self.ming_config.stop_head_min_steps),
+            default_max_decode_steps=self.ming_config.max_decode_steps,
+        )
         backbone_out = self.model(
             input_ids=input_ids,
             positions=positions,
@@ -139,10 +140,6 @@ def forward(
             raise RuntimeError("Mixed Ming text/audio modes in one Stage-0 batch are unsupported.")
         if text_mode:
             self._last_text_mode = True
-            self._last_sample_decode_steps = None
-            self._last_sample_stop_probs = None
-            self._last_sample_max_decode_steps = None
-            self._last_sample_min_decode_steps = None
             return OmniOutput(
                 text_hidden_states=hidden_states,
                 multimodal_outputs={KEY_TEXT_MODE: True},
@@ -162,10 +159,6 @@ def forward(
             stop_reason_code_tokens
         ) = None
         pending_updates: dict[str, dict[str, Any]] = {}
-        sampled_decode_steps: list[int] = []
-        sampled_stop_probs: list[torch.Tensor] = []
-        sampled_max_decode_steps: list[int] = []
-        sampled_min_decode_steps: list[int] = []
         cursor = 0
         any_decode = False
         for req_idx, token_count in enumerate(token_counts):
@@ -217,10 +210,6 @@ def forward(
             max_decode_step_tokens[output_index : output_index + 1] = req_max_decode_steps
             min_decode_step_tokens[output_index : output_index + 1] = req_min_decode_steps
             has_patch[output_index : output_index + 1] = True
-            sampled_decode_steps.append(decode_step)
-            sampled_stop_probs.append(stop_probs.reshape(-1)[0])
-            sampled_max_decode_steps.append(req_max_decode_steps)
-            sampled_min_decode_steps.append(req_min_decode_steps)
             stop_reason, _, _, _, _ = _resolve_ming_stop_decision(
                 step=decode_step,
                 stop_prob=float(stop_probs.reshape(-1)[0].item()),
@@ -245,31 +234,9 @@ def forward(
 
         self._pending_postprocess_updates = pending_updates
         if not any_decode:
-            self._last_sample_decode_steps = None
-            self._last_sample_stop_probs = None
-            self._last_sample_max_decode_steps = None
-            self._last_sample_min_decode_steps = None
             return OmniOutput(
                 text_hidden_states=hidden_states, multimodal_outputs=None, intermediate_tensors=intermediate_tensors
             )
-        self._last_sample_decode_steps = (
-            torch.tensor(sampled_decode_steps, dtype=torch.int32, device=hidden_states.device)
-            if sampled_decode_steps
-            else None
-        )
-        self._last_sample_stop_probs = (
-            torch.stack(sampled_stop_probs).to(device=hidden_states.device) if sampled_stop_probs else None
-        )
-        self._last_sample_max_decode_steps = (
-            torch.tensor(sampled_max_decode_steps, dtype=torch.int32, device=hidden_states.device)
-            if sampled_max_decode_steps
-            else None
-        )
-        self._last_sample_min_decode_steps = (
-            torch.tensor(sampled_min_decode_steps, dtype=torch.int32, device=hidden_states.device)
-            if sampled_min_decode_steps
-            else None
-        )
         return OmniOutput(
             text_hidden_states=hidden_states,
             multimodal_outputs={
@@ -303,22 +270,17 @@ def compute_logits(
             min_decode_steps_tensor = mm.get("ming_min_decode_steps")
             hidden_states = hidden_states.text_hidden_states
         if text_mode:
-            self._pending_sample_stop_inputs = None
             return (
                 None
                 if hidden_states is None or hidden_states.numel() == 0
                 else self.model.compute_logits(hidden_states)
             )
-        max_decode_steps_tensor = (
-            self._last_sample_max_decode_steps if max_decode_steps_tensor is None else max_decode_steps_tensor
-        )
-        min_decode_steps_tensor = (
-            self._last_sample_min_decode_steps if min_decode_steps_tensor is None else min_decode_steps_tensor
-        )
-        decode_steps = self._last_sample_decode_steps if decode_steps is None else decode_steps
-        stop_probs_tensor = self._last_sample_stop_probs if stop_probs_tensor is None else stop_probs_tensor
+        if max_decode_steps_tensor is None or decode_steps is None or stop_probs_tensor is None:
+            raise RuntimeError(
+                "compute_logits received plain Tensor, not OmniOutput - "
+                "multimodal decode state unavailable. Pipeline-parallel split unsupported for MingTTS."
+            )
         if hidden_states is None or hidden_states.numel() == 0:
-            self._pending_sample_stop_inputs = None
             return None
         if hidden_states.dim() != 2:
             raise RuntimeError(
@@ -354,12 +316,6 @@ def compute_logits(
                 text_eos_token_id=int(self.ming_config.text_eos_token_id),
             )
             logits[i, int(next_token_id)] = 0.0
-        self._pending_sample_stop_inputs = {
-            "steps": steps,
-            "stop_probs": stop_prob_values,
-            "max_decode_steps": max_decode_steps,
-            "min_decode_steps": min_decode_steps,
-        }
         return logits
 
     def sample(self, logits, sampling_metadata):
@@ -368,27 +324,8 @@ def sample(self, logits, sampling_metadata):
         if self._last_text_mode:
             return self.model.sample(logits, sampling_metadata)
         del sampling_metadata
-        stop_inputs = self._pending_sample_stop_inputs
-        self._pending_sample_stop_inputs = None
-        if stop_inputs is None:
-            return SamplerOutput(
-                sampled_token_ids=logits.argmax(dim=-1, keepdim=True).to(dtype=torch.int32), logprobs_tensors=None
-            )
-        sampled_ids = []
-        for i in range(logits.shape[0]):
-            _, _, _, _, next_token_id = _resolve_ming_stop_decision(
-                step=int(stop_inputs["steps"][i]),
-                stop_prob=float(stop_inputs["stop_probs"][i]),
-                stop_threshold=float(self.ming_config.stop_head_threshold),
-                min_stop_step=int(self.ming_config.stop_head_min_steps),
-                min_decode_steps=int(stop_inputs["min_decode_steps"][i]),
-                max_decode_steps=int(stop_inputs["max_decode_steps"][i]),
-                audio_dummy_token_id=int(self.ming_config.audio_dummy_token_id),
-                text_eos_token_id=int(self.ming_config.text_eos_token_id),
-            )
-            sampled_ids.append(next_token_id)
         return SamplerOutput(
-            sampled_token_ids=torch.tensor(sampled_ids, dtype=torch.int32, device=logits.device).reshape(-1, 1),
+            sampled_token_ids=logits.argmax(dim=-1, keepdim=True).to(dtype=torch.int32),
             logprobs_tensors=None,
         )
 
diff --git a/vllm_omni/model_executor/models/ming_tts/patch_emission.py b/vllm_omni/model_executor/models/ming_tts/patch_emission.py
index 938e789b67a..baf53189924 100644
--- a/vllm_omni/model_executor/models/ming_tts/patch_emission.py
+++ b/vllm_omni/model_executor/models/ming_tts/patch_emission.py
@@ -7,7 +7,7 @@
 import torch
 from vllm.forward_context import get_forward_context, is_forward_context_available
 
-from .config_ming_tts import MingTTSConfig
+from .config_ming_tts import KEY_MAX_DECODE_STEPS, KEY_MIN_DECODE_STEPS, KEY_REQUEST_ID, MingTTSConfig
 
 MING_STOP_REASON_CONTINUE = "continue"
 MING_STOP_REASON_STOP_HEAD = "stop_head"
@@ -115,6 +115,25 @@ def _resolve_optional_runtime_int(req_info: dict[str, Any], key: str, default_va
     return value
 
 
+def _validate_ming_decode_window(
+    request_infos: list[dict[str, Any]],
+    *,
+    min_stop_step: int,
+    default_max_decode_steps: int,
+) -> None:
+    for i, info in enumerate(request_infos):
+        max_steps = _resolve_runtime_int(info, KEY_MAX_DECODE_STEPS, default_max_decode_steps)
+        min_steps = _resolve_optional_runtime_int(info, KEY_MIN_DECODE_STEPS, 0)
+        min_required = max(min_stop_step + 1, min_steps)
+        if max_steps < min_required:
+            req_id = info.get(KEY_REQUEST_ID, f"idx={i}")
+            raise ValueError(
+                f"Ming request {req_id!r}: max_decode_steps={max_steps} < "
+                f"min_required_decode_steps={min_required} "
+                f"(min_stop_step={min_stop_step}, min_decode_steps={min_steps})"
+            )
+
+
 def _resolve_max_decode_steps_batch(
     value: torch.Tensor | None,
     *,
diff --git a/vllm_omni/model_executor/models/ming_tts/prompt_utils.py b/vllm_omni/model_executor/models/ming_tts/prompt_utils.py
index 0075b248047..e647fd68f74 100644
--- a/vllm_omni/model_executor/models/ming_tts/prompt_utils.py
+++ b/vllm_omni/model_executor/models/ming_tts/prompt_utils.py
@@ -15,9 +15,7 @@
 from vllm_omni.engine.stage_init_utils import _resolve_model_to_local_path
 from vllm_omni.model_executor.models.ming_flash_omni.prompt_utils import (
     DEFAULT_PROMPT,
-)
-from vllm_omni.model_executor.models.ming_flash_omni.prompt_utils import (
-    create_instruction as _create_ming_instruction,
+    create_instruction,
 )
 
 from .audio_tokenizer.modeling_audio_vae import AudioVAE
@@ -42,16 +40,6 @@
 _DURATION_SECONDS_RE = re.compile(r"Duration:\s*([0-9]+(?:\.[0-9]+)?)\s*s\b", re.IGNORECASE)
 
 
-def create_instruction(value: Any) -> str | None:
-    if value is None:
-        return None
-    if isinstance(value, str):
-        return value
-    if not isinstance(value, dict):
-        raise ValueError(f"Ming instruction must be str or dict, got {type(value).__name__}")
-    return _create_ming_instruction(value)
-
-
 def parse_duration_seconds(text: str | None) -> float | None:
     if not isinstance(text, str):
         return None
diff --git a/vllm_omni/model_executor/models/ming_utils/__init__.py b/vllm_omni/model_executor/models/ming_utils/__init__.py
new file mode 100644
index 00000000000..208f01a7cb5
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_utils/__init__.py
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
diff --git a/vllm_omni/model_executor/models/ming_utils/audio_dsp.py b/vllm_omni/model_executor/models/ming_utils/audio_dsp.py
new file mode 100644
index 00000000000..e79ada4339a
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_utils/audio_dsp.py
@@ -0,0 +1,210 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adapted from https://github.com/inclusionAI/Ming-omni-tts/blob/main/audio_tokenizer/istft.py
+
+import torch
+import torch.nn as nn
+
+
+class ISTFT(nn.Module):
+    """
+    Custom implementation of ISTFT since torch.istft doesn't allow custom padding (other than `center=True`) with
+    windowing. This is because the NOLA (Nonzero Overlap Add) check fails at the edges.
+    See issue: https://github.com/pytorch/pytorch/issues/62323
+    Specifically, in the context of neural vocoding we are interested in "same" padding analogous to CNNs.
+    The NOLA constraint is met as we trim padded samples anyway.
+
+    Args:
+        n_fft (int): Size of Fourier transform.
+        hop_length (int): The distance between neighboring sliding window frames.
+        win_length (int): The size of window frame and STFT filter.
+        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
+    """
+
+    def __init__(self, n_fft: int, hop_length: int, win_length: int, padding: str = "same"):
+        super().__init__()
+        if padding not in ["center", "same"]:
+            raise ValueError("Padding must be 'center' or 'same'.")
+        self.padding = padding
+        self.n_fft = n_fft
+        self.hop_length = hop_length
+        self.win_length = win_length
+        window = torch.hann_window(win_length)
+        self.register_buffer("window", window)
+
+        self.buffer_len = self.win_length - self.hop_length  # samples shared by two consecutive frames
+
+    def _buffer_process(
+        self,
+        x: torch.Tensor,
+        buffer: torch.Tensor | None,
+        pad: int,
+        last_chunk: bool = False,
+        streaming: bool = False,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        """Trim `pad` edge samples from `x`; when streaming, carry the `buffer_len`-sample tail between chunks."""
+        if streaming:
+            if buffer is None:
+                # first chunk: no carried state yet, so only drop the leading padding
+                x = x[:, pad:]
+            if buffer is not None:
+                # later chunk: add the tail held back from the previous chunk onto the head (in place)
+                x[:, : self.buffer_len] += buffer
+            buffer = x[:, -self.buffer_len :]  # new carried tail; a view into `x`, not a copy
+            if not last_chunk:
+                x = x[:, : -self.buffer_len]  # hold the tail back until the next chunk is added to it
+            else:
+                x = x[:, :-pad]  # final chunk: keep the tail, drop only the trailing padding
+        else:
+            x = x[:, pad:-pad]  # one-shot mode: trim the padding on both sides
+
+        return x, buffer
+
+    def forward(
+        self,
+        spec: torch.Tensor,
+        audio_buffer: torch.Tensor | None = None,
+        window_buffer: torch.Tensor | None = None,
+        streaming: bool = False,
+        last_chunk: bool = False,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor | None, torch.Tensor | None]:
+        """
+        Compute the Inverse Short Time Fourier Transform (ISTFT) of a complex spectrogram.
+
+        Args:
+            spec (Tensor): Input complex spectrogram of shape (B, N, T): batch size, frequency bins, time frames.
+            audio_buffer (Tensor): [Streaming Input/State] Previous chunk's audio overlap, (B, win_length - hop_length).
+            window_buffer (Tensor): [Streaming Input/State] The window overlap buffer from the previous chunk.
+            streaming: If `True`, the function operates in streaming mode, processing `spec` as a single chunk.
+            last_chunk: With `streaming=True`, marks the final chunk so the held-back tail is emitted as well.
+
+        Returns:
+            With `padding="same"`: tuple `(signal, audio_buffer, window_buffer)`.
+            With `padding="center"`: only the signal from `torch.istft`; the streaming arguments are not used.
+        """
+        if self.padding == "center":
+            # Fallback to pytorch native implementation
+            return torch.istft(spec, self.n_fft, self.hop_length, self.win_length, self.window, center=True)
+        elif self.padding == "same":
+            pad = (self.win_length - self.hop_length) // 2
+        else:  # only reachable if `self.padding` was changed after construction
+            raise ValueError("Padding must be 'center' or 'same'.")
+
+        if spec.dim() != 3:
+            raise ValueError(f"Expected spec rank-3 [Batch, Freq, Time], got {tuple(spec.shape)}")
+        _, _, T = spec.shape
+
+        # Inverse FFT along the frequency axis, then weight every frame by the window
+        ifft = torch.fft.irfft(spec, self.n_fft, dim=1, norm="backward")
+        ifft = ifft * self.window[None, :, None]
+
+        # Overlap and Add: fold places frame t at offset t * hop_length and sums the overlaps
+        output_size = (T - 1) * self.hop_length + self.win_length
+        y = torch.nn.functional.fold(
+            ifft,
+            output_size=(1, output_size),
+            kernel_size=(1, self.win_length),
+            stride=(1, self.hop_length),
+        )[:, 0, 0, :]
+
+        y, audio_buffer = self._buffer_process(y, audio_buffer, pad, last_chunk=last_chunk, streaming=streaming)
+
+        # Window envelope: overlap-add of the squared window, trimmed/buffered exactly like the audio
+        window_sq = self.window.square().expand(1, T, -1).transpose(1, 2)
+        window_envelope = (
+            torch.nn.functional.fold(
+                window_sq,
+                output_size=(1, output_size),
+                kernel_size=(1, self.win_length),
+                stride=(1, self.hop_length),
+            )
+            .squeeze(0)
+            .squeeze(0)
+        )
+
+        window_envelope, window_buffer = self._buffer_process(
+            window_envelope, window_buffer, pad, last_chunk=last_chunk, streaming=streaming
+        )
+        window_envelope = window_envelope.squeeze()
+
+        # Normalize by the envelope; refuse to divide by a (near-)zero value
+        if not (window_envelope > 1e-11).all():
+            raise RuntimeError("ISTFT window envelope underflowed; invalid overlap-add state.")
+        y = y / window_envelope
+
+        return y, audio_buffer, window_buffer
+
+
+class FourierHead(nn.Module):
+    """Common parent of heads that turn model features back into a waveform; `forward` is abstract."""
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            x (Tensor): Features of shape (B, L, H): batch size, sequence length, model dimension.
+
+        Returns:
+            Tensor: Time-domain audio of shape (B, T), where T is the number of output samples.
+        """
+        message = "Subclasses must implement the forward method."
+        raise NotImplementedError(message)
+
+
+class ISTFTHead(FourierHead):
+    """
+    ISTFT Head module for predicting STFT complex coefficients.
+
+    Args:
+        dim (int): Hidden dimension of the model.
+        n_fft (int): Size of Fourier transform.
+        hop_length (int): The distance between neighboring sliding window frames, which should align with
+                          the resolution of the input features.
+        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
+    """
+
+    def __init__(self, dim: int, n_fft: int, hop_length: int, padding: str = "same"):
+        super().__init__()
+        out_dim = n_fft + 2
+        self.out = torch.nn.Linear(dim, out_dim)
+        self.istft = ISTFT(n_fft=n_fft, hop_length=hop_length, win_length=n_fft, padding=padding)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        audio_buffer: torch.Tensor | None = None,
+        window_buffer: torch.Tensor | None = None,
+        streaming: bool = False,
+        last_chunk: bool = False,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None, torch.Tensor | None]:
+        """
+        Project features to magnitude/phase coefficients and synthesize audio with the wrapped `ISTFT`.
+
+        Args:
+            x (Tensor): Input tensor of shape (B, L, H): batch size, sequence length, model dimension.
+            audio_buffer, window_buffer, streaming, last_chunk: Streaming state and flags passed on to `ISTFT`.
+
+        Returns:
+            Tuple `(audio, x_pred, audio_buffer, window_buffer)`; audio gets a channel axis inserted at dim 1.
+        """
+        x_pred = self.out(x)
+        # x_pred = x
+        x_pred = x_pred.transpose(1, 2)
+        mag, p = x_pred.chunk(2, dim=1)
+        mag = torch.exp(mag)
+        mag = torch.clip(mag, max=1e2)  # safeguard to prevent excessively large magnitudes
+        # wrapping happens here. These two lines produce real and imaginary value
+        x = torch.cos(p)
+        y = torch.sin(p)
+        # build the complex value directly; recomputing the phase via atan2 would only cost time
+        S = mag * (x + 1j * y)
+        result = self.istft(
+            S, audio_buffer=audio_buffer, window_buffer=window_buffer, streaming=streaming, last_chunk=last_chunk
+        )
+        if isinstance(result, tuple):
+            audio, audio_buffer, window_buffer = result
+        else:
+            audio = result  # padding="center": ISTFT returns the bare waveform, buffers pass through unchanged
+        return audio.unsqueeze(1), x_pred, audio_buffer, window_buffer
+
+
+__all__ = ["FourierHead", "ISTFT", "ISTFTHead"]

From f1a81790685b607ac8cf0b53ef600753f24578ff Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Thu, 14 May 2026 22:36:12 +0530
Subject: [PATCH 32/54] fix(ming_tts_llm): allow compute_logits with plain
 tensor during profile run

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 vllm_omni/model_executor/models/ming_tts/ming_tts_llm.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/vllm_omni/model_executor/models/ming_tts/ming_tts_llm.py b/vllm_omni/model_executor/models/ming_tts/ming_tts_llm.py
index 6d170aca046..ff3bf3d8354 100644
--- a/vllm_omni/model_executor/models/ming_tts/ming_tts_llm.py
+++ b/vllm_omni/model_executor/models/ming_tts/ming_tts_llm.py
@@ -275,11 +275,6 @@ def compute_logits(
                 if hidden_states is None or hidden_states.numel() == 0
                 else self.model.compute_logits(hidden_states)
             )
-        if max_decode_steps_tensor is None or decode_steps is None or stop_probs_tensor is None:
-            raise RuntimeError(
-                "compute_logits received plain Tensor, not OmniOutput - "
-                "multimodal decode state unavailable. Pipeline-parallel split unsupported for MingTTS."
-            )
         if hidden_states is None or hidden_states.numel() == 0:
             return None
         if hidden_states.dim() != 2:

From e0dfa4286d649a812489959f5c9e75d52e4769c7 Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Thu, 14 May 2026 23:27:29 +0530
Subject: [PATCH 33/54] ming-tts: align sampled multimodal stop state for
 logits

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 .../models/ming_tts/ming_tts.py               |  1 +
 vllm_omni/worker/gpu_ar_model_runner.py       | 60 +++++++++++++++++--
 2 files changed, 57 insertions(+), 4 deletions(-)

diff --git a/vllm_omni/model_executor/models/ming_tts/ming_tts.py b/vllm_omni/model_executor/models/ming_tts/ming_tts.py
index d86c50fe6c2..a50fc2af39b 100644
--- a/vllm_omni/model_executor/models/ming_tts/ming_tts.py
+++ b/vllm_omni/model_executor/models/ming_tts/ming_tts.py
@@ -68,6 +68,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.has_postprocess = False
         self.requires_raw_input_tokens = False
         self.model_stage = vllm_config.model_config.model_stage
+        self.requires_sampled_multimodal_outputs = self.model_stage == "llm"
         self._prompt_encoder = None
 
         if self.model_stage == "llm":
diff --git a/vllm_omni/worker/gpu_ar_model_runner.py b/vllm_omni/worker/gpu_ar_model_runner.py
index f87529106d1..d03c65f0dc4 100644
--- a/vllm_omni/worker/gpu_ar_model_runner.py
+++ b/vllm_omni/worker/gpu_ar_model_runner.py
@@ -40,6 +40,7 @@
 
 from vllm_omni.data_entry_keys import flatten_payload
 from vllm_omni.distributed.omni_connectors.kv_transfer_manager import OmniKVTransferManager
+from vllm_omni.model_executor.models.output_templates import OmniOutput
 from vllm_omni.outputs import OmniModelRunnerOutput
 from vllm_omni.utils.mm_outputs import build_mm_cpu, to_payload_element
 from vllm_omni.worker.gpu_model_runner import OmniGPUModelRunner
@@ -175,6 +176,45 @@ def _resolve_pooler_payload_req_ids(self, req_ids_output_copy: list[str]) -> tup
             downstream_req_ids = req_ids_output_copy
         return engine_output_type, downstream_req_ids
 
+    def _slice_multimodal_outputs_for_logits(
+        self,
+        multimodal_outputs: Any,
+        logits_indices: torch.Tensor,
+        num_tokens: int,
+    ) -> Any:
+        """Recursively keep only the `logits_indices` entries of tensors/lists whose length is `num_tokens`."""
+        if not multimodal_outputs:
+            return multimodal_outputs
+
+        def _slice(value):
+            if isinstance(value, dict):
+                return {key: _slice(item) for key, item in value.items()}
+            if isinstance(value, torch.Tensor):
+                per_token = value.ndim > 0 and int(value.shape[0]) == int(num_tokens)
+                return value[logits_indices] if per_token else value
+            if isinstance(value, list) and len(value) == int(num_tokens):
+                # Python lists are indexed on the host, so the indices are copied off the device here
+                return [value[int(index)] for index in logits_indices.detach().cpu().tolist()]
+            return value  # anything else (scalars, strings, mismatched lengths) passes through untouched
+
+        return _slice(multimodal_outputs)
+
+    def _build_compute_logits_input(
+        self,
+        sample_hidden_states: torch.Tensor,
+        multimodal_outputs: Any,
+        logits_indices: torch.Tensor,
+        num_tokens: int,
+    ) -> torch.Tensor | OmniOutput:
+        """Wrap the sampled hidden states in an `OmniOutput` when the model asks for sliced multimodal outputs."""
+        needs_mm = getattr(self.model, "requires_sampled_multimodal_outputs", False)
+        if not (needs_mm and multimodal_outputs):
+            # plain-tensor path: the model did not opt in, or there is nothing to attach
+            return sample_hidden_states
+        # slice the multimodal outputs with the same indices before handing both to the model
+        sliced_outputs = self._slice_multimodal_outputs_for_logits(multimodal_outputs, logits_indices, num_tokens)
+        return OmniOutput(text_hidden_states=sample_hidden_states, multimodal_outputs=sliced_outputs)
+
     def capture_model(self) -> int:
         result = super().capture_model()
         self._capture_talker_mtp_graphs()
@@ -599,18 +639,30 @@ def execute_model(
                     )
 
                 sample_hidden_states = hidden_states[logits_indices]
+                compute_logits_input = self._build_compute_logits_input(
+                    sample_hidden_states,
+                    multimodal_outputs,
+                    logits_indices,
+                    int(hidden_states.shape[0]),
+                )
                 # Try with sampling_metadata first; fall back to without for models that don't support it
                 try:
                     logits = self.model.compute_logits(
-                        sample_hidden_states, sampling_metadata=self.input_batch.sampling_metadata
+                        compute_logits_input, sampling_metadata=self.input_batch.sampling_metadata
                     )
                 except TypeError:
-                    logits = self.model.compute_logits(sample_hidden_states)
+                    logits = self.model.compute_logits(compute_logits_input)
             else:
                 # Rare case.
                 assert not self.is_pooling_model
 
                 sample_hidden_states = hidden_states[logits_indices]
+                compute_logits_input = self._build_compute_logits_input(
+                    sample_hidden_states,
+                    multimodal_outputs,
+                    logits_indices,
+                    int(hidden_states.shape[0]),
+                )
                 if not get_pp_group().is_last_rank:
                     all_gather_tensors = {
                         "residual": not is_residual_scattered_for_sp(self.vllm_config, num_tokens_padded)
@@ -625,10 +677,10 @@ def execute_model(
                     # Try with sampling_metadata first; fall back to without for models that don't support it
                     try:
                         logits = self.model.compute_logits(
-                            sample_hidden_states, sampling_metadata=self.input_batch.sampling_metadata
+                            compute_logits_input, sampling_metadata=self.input_batch.sampling_metadata
                         )
                     except TypeError:
-                        logits = self.model.compute_logits(sample_hidden_states)
+                        logits = self.model.compute_logits(compute_logits_input)
 
                 model_output_broadcast_data: dict[str, Any] = {}
                 if logits is not None:

From 94c91a93d8d8cb50a400fe5ef606ac27cd76a648 Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Thu, 14 May 2026 23:37:26 +0530
Subject: [PATCH 34/54] fix(ming-tts): skip text-mode requests in decode window
 validation

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 vllm_omni/model_executor/models/ming_tts/patch_emission.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm_omni/model_executor/models/ming_tts/patch_emission.py b/vllm_omni/model_executor/models/ming_tts/patch_emission.py
index baf53189924..7ed1e8f0b3b 100644
--- a/vllm_omni/model_executor/models/ming_tts/patch_emission.py
+++ b/vllm_omni/model_executor/models/ming_tts/patch_emission.py
@@ -7,7 +7,7 @@
 import torch
 from vllm.forward_context import get_forward_context, is_forward_context_available
 
-from .config_ming_tts import KEY_MAX_DECODE_STEPS, KEY_MIN_DECODE_STEPS, KEY_REQUEST_ID, MingTTSConfig
+from .config_ming_tts import KEY_MAX_DECODE_STEPS, KEY_MIN_DECODE_STEPS, KEY_REQUEST_ID, KEY_TEXT_MODE, MingTTSConfig
 
 MING_STOP_REASON_CONTINUE = "continue"
 MING_STOP_REASON_STOP_HEAD = "stop_head"
@@ -122,6 +122,8 @@ def _validate_ming_decode_window(
     default_max_decode_steps: int,
 ) -> None:
     for i, info in enumerate(request_infos):
+        if info.get(KEY_TEXT_MODE):
+            continue
         max_steps = _resolve_runtime_int(info, KEY_MAX_DECODE_STEPS, default_max_decode_steps)
         min_steps = _resolve_optional_runtime_int(info, KEY_MIN_DECODE_STEPS, 0)
         min_required = max(min_stop_step + 1, min_steps)

From 9ac9689654efef598621d7432d34c8a94c61f4f0 Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Tue, 26 May 2026 21:46:45 +0530
Subject: [PATCH 35/54] refactor(ming-tts): prune dense runtime dead code

Move shared DiT and VAE streaming primitives into ming_utils, remove training-only Ming dense paths and unused helpers, and keep the Ming e2e surface on build_ming_dense_prompt.

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 .../models/ming_flash_omni/audio_vae.py       |  58 +----
 .../models/ming_flash_omni/talker_module.py   | 231 +-----------------
 .../models/ming_tts/aggregator.py             |  14 +-
 .../ming_tts/audio_tokenizer/audio_encoder.py | 135 ----------
 .../audio_tokenizer/modeling_audio_vae.py     |   8 +-
 .../ming_tts/audio_tokenizer/vae_modules.py   |  88 +------
 .../models/ming_tts/config_ming_tts.py        |  46 +---
 .../models/ming_tts/constants.py              |   4 -
 .../models/ming_tts/flowloss_head.py          |   3 -
 .../model_executor/models/ming_tts/fm/cfm.py  |  26 +-
 .../model_executor/models/ming_tts/fm/dit.py  |  18 +-
 .../models/ming_tts/fm/modules.py             | 147 -----------
 .../model_executor/models/ming_tts/ingress.py |   4 -
 .../models/ming_tts/prompt_utils.py           |  26 --
 .../models/ming_tts/validation.py             |  30 +--
 .../models/ming_utils/audio_dsp.py            |   1 -
 .../models/ming_utils/audio_vae.py            |  63 +++++
 .../model_executor/models/ming_utils/dit.py   | 220 +++++++++++++++++
 .../stage_input_processors/_chunk_transfer.py |  13 -
 19 files changed, 311 insertions(+), 824 deletions(-)
 delete mode 100644 vllm_omni/model_executor/models/ming_tts/audio_tokenizer/audio_encoder.py
 delete mode 100644 vllm_omni/model_executor/models/ming_tts/fm/modules.py
 create mode 100644 vllm_omni/model_executor/models/ming_utils/audio_vae.py
 create mode 100644 vllm_omni/model_executor/models/ming_utils/dit.py

diff --git a/vllm_omni/model_executor/models/ming_flash_omni/audio_vae.py b/vllm_omni/model_executor/models/ming_flash_omni/audio_vae.py
index 81496744cde..7ef3b5050ae 100644
--- a/vllm_omni/model_executor/models/ming_flash_omni/audio_vae.py
+++ b/vllm_omni/model_executor/models/ming_flash_omni/audio_vae.py
@@ -14,6 +14,7 @@
 from vllm.logger import init_logger
 
 from vllm_omni.model_executor.models.ming_utils.audio_dsp import ISTFTHead
+from vllm_omni.model_executor.models.ming_utils.audio_vae import StreamingLinearUpsample
 
 logger = init_logger(__name__)
 
@@ -38,63 +39,6 @@ def __init__(
         super().__init__(**kwargs)
 
 
-class StreamingLinearUpsample(nn.Module):
-    def __init__(self, scale_factor=4):
-        super().__init__()
-        self.scale_factor = scale_factor
-        self.upsampler = nn.Upsample(scale_factor=scale_factor, mode="linear", align_corners=False)
-
-    def forward(self, x, state=None, is_last=False):
-        if state is None:
-            state = {"prev_chunk": None, "history_last": None, "is_first": True}
-
-        if x is None and not is_last:
-            return None, state
-
-        if state["is_first"] and is_last:
-            out = self.upsampler(x.transpose(1, 2)).transpose(1, 2)
-            return out, None
-
-        output_chunks = []
-
-        if state["is_first"]:
-            state["prev_chunk"] = x
-            state["is_first"] = False
-            if not is_last:
-                return None, state
-
-        if state["prev_chunk"] is not None:
-            p = state["prev_chunk"].transpose(1, 2)
-
-            if state["history_last"] is None:
-                lookahead = x[:, :1, :].transpose(1, 2)
-                inp = torch.cat([p, lookahead], dim=2)
-                up = self.upsampler(inp)
-                out_prev = up[:, :, : p.size(2) * self.scale_factor]
-            else:
-                lookahead = x[:, :1, :].transpose(1, 2)
-                inp = torch.cat([state["history_last"], p, lookahead], dim=2)
-                up = self.upsampler(inp)
-                start = self.scale_factor
-                end = start + p.size(2) * self.scale_factor
-                out_prev = up[:, :, start:end]
-
-            output_chunks.append(out_prev.transpose(1, 2))
-            state["history_last"] = p[:, :, -1:]
-            state["prev_chunk"] = x
-
-        if is_last:
-            p = state["prev_chunk"].transpose(1, 2)
-            inp = torch.cat([state["history_last"], p], dim=2)
-            up = self.upsampler(inp)
-            out_last = up[:, :, self.scale_factor :]
-            output_chunks.append(out_last.transpose(1, 2))
-            state = None
-
-        final_out = torch.cat(output_chunks, dim=1) if output_chunks else None
-        return final_out, state
-
-
 class Decoder(nn.Module):
     def __init__(self, decoder_args, output_dim=320, latent_dim=64, patch_size=-1):
         super().__init__()
diff --git a/vllm_omni/model_executor/models/ming_flash_omni/talker_module.py b/vllm_omni/model_executor/models/ming_flash_omni/talker_module.py
index 360a4f25acf..8323573a2d4 100644
--- a/vllm_omni/model_executor/models/ming_flash_omni/talker_module.py
+++ b/vllm_omni/model_executor/models/ming_flash_omni/talker_module.py
@@ -27,222 +27,18 @@
 
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
 from transformers import PreTrainedTokenizerBase, Qwen2Config, Qwen2Model, StaticCache
 from vllm.logger import init_logger
-from x_transformers.x_transformers import RotaryEmbedding, apply_rotary_pos_emb
+from x_transformers.x_transformers import RotaryEmbedding
 
 from vllm_omni.model_executor.layers.timestep_embedding import DiTTimestepEmbedding
+from vllm_omni.model_executor.models.ming_utils.dit import CondEmbedder, DiTBlock, FinalLayer, get_epss_timesteps
 
 from .audio_vae import AudioVAE
 
 logger = init_logger(__name__)
 
 
-########################################################################
-# DiT Modules
-# Ported from:
-# https://github.com/inclusionAI/Ming/blob/e58533db227031990c5a6864dcf5f08fb53ed0d2/talker_module/modules.py
-# Ported from:
-# https://github.com/inclusionAI/Ming/blob/e58533db227031990c5a6864dcf5f08fb53ed0d2/talker_module/dit.py
-########################################################################
-
-
-class RMSNorm(nn.Module):
-    def __init__(self, dim: int, eps: float = 1e-6):
-        super().__init__()
-        self.eps = eps
-        self.weight = nn.Parameter(torch.ones(dim))
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        if self.weight.dtype in [torch.float16, torch.bfloat16]:
-            x = x.to(self.weight.dtype)
-        x = F.rms_norm(x, normalized_shape=(x.shape[-1],), weight=self.weight, eps=self.eps)
-        return x
-
-
-class FeedForward(nn.Module):
-    def __init__(
-        self, dim: int, dim_out: int | None = None, mult: float = 4, dropout: float = 0.0, approximate: str = "none"
-    ):
-        super().__init__()
-        inner_dim = int(dim * mult)
-        dim_out = dim_out if dim_out is not None else dim
-
-        activation = nn.GELU(approximate=approximate)
-        project_in = nn.Sequential(nn.Linear(dim, inner_dim), activation)
-        self.ff = nn.Sequential(project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out))
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        return self.ff(x)
-
-
-class Attention(nn.Module):
-    def __init__(
-        self,
-        dim: int,
-        heads: int = 8,
-        dim_head: int = 64,
-        dropout: float = 0.0,
-        qk_norm: str | None = None,
-        pe_attn_head: int | None = None,
-        attn_mask_enabled: bool = True,
-    ):
-        super().__init__()
-        self.dim = dim
-        self.heads = heads
-        self.inner_dim = dim_head * heads
-        self.dropout = dropout
-
-        self.to_q = nn.Linear(dim, self.inner_dim)
-        self.to_k = nn.Linear(dim, self.inner_dim)
-        self.to_v = nn.Linear(dim, self.inner_dim)
-        if qk_norm is None:
-            self.q_norm = None
-            self.k_norm = None
-        elif qk_norm == "rms_norm":
-            self.q_norm = RMSNorm(dim_head)
-            self.k_norm = RMSNorm(dim_head)
-        else:
-            raise ValueError(f"Unimplemented qk_norm: {qk_norm}")
-
-        self.to_out = nn.ModuleList([])
-        self.to_out.append(nn.Linear(self.inner_dim, dim))
-        self.to_out.append(nn.Dropout(dropout))
-
-        self.pe_attn_head = pe_attn_head
-        self.attn_mask_enabled = attn_mask_enabled
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        mask: torch.Tensor | None = None,
-        rope: tuple[torch.Tensor, torch.Tensor | None] | None = None,
-    ) -> torch.Tensor:
-        batch_size = x.shape[0]
-
-        query = self.to_q(x)
-        key = self.to_k(x)
-        value = self.to_v(x)
-
-        inner_dim = key.shape[-1]
-        head_dim = inner_dim // self.heads
-        query = query.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
-        key = key.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
-        value = value.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
-
-        if self.q_norm is not None:
-            query = self.q_norm(query)
-        if self.k_norm is not None:
-            key = self.k_norm(key)
-
-        if rope is not None:
-            freqs, xpos_scale = rope
-            q_xpos_scale, k_xpos_scale = (xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)
-
-            if self.pe_attn_head is not None:
-                on = self.pe_attn_head
-                query[:, :on, :, :] = apply_rotary_pos_emb(query[:, :on, :, :], freqs, q_xpos_scale)
-                key[:, :on, :, :] = apply_rotary_pos_emb(key[:, :on, :, :], freqs, k_xpos_scale)
-            else:
-                query = apply_rotary_pos_emb(query, freqs, q_xpos_scale)
-                key = apply_rotary_pos_emb(key, freqs, k_xpos_scale)
-
-        if self.attn_mask_enabled and mask is not None:
-            valid_sample_indices = mask.any(dim=1)
-            final_output = torch.zeros_like(query).to(query.device)
-
-            attn_mask = mask[valid_sample_indices]
-            query = query[valid_sample_indices]
-            key = key[valid_sample_indices]
-            value = value[valid_sample_indices]
-            attn_mask = attn_mask.unsqueeze(1).unsqueeze(1)
-            attn_mask = attn_mask.expand(valid_sample_indices.sum().item(), self.heads, query.shape[-2], key.shape[-2])
-        else:
-            attn_mask = None
-
-        x = F.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask, dropout_p=0.0, is_causal=False)
-        if self.attn_mask_enabled and mask is not None:
-            final_output[valid_sample_indices] = x
-            x = final_output
-
-        x = x.transpose(1, 2).reshape(batch_size, -1, self.heads * head_dim)
-        x = x.to(query.dtype)
-
-        x = self.to_out[0](x)
-        x = self.to_out[1](x)
-
-        if mask is not None:
-            mask = mask.unsqueeze(-1)
-            x = x.masked_fill(~mask, 0.0)
-
-        return x
-
-
-class DiTBlock(nn.Module):
-    """A DiT block with pre-norm and residual connections."""
-
-    def __init__(
-        self,
-        hidden_size: int,
-        num_heads: int,
-        mlp_ratio: float = 4.0,
-        dropout: float = 0.1,
-        qk_norm: str | None = None,
-        pe_attn_head: int | None = None,
-        attn_mask_enabled: bool = True,
-        **kwargs,
-    ):
-        super().__init__()
-        self.norm1 = RMSNorm(hidden_size)
-        self.attn = Attention(
-            dim=hidden_size,
-            heads=num_heads,
-            dim_head=hidden_size // num_heads,
-            dropout=dropout,
-            qk_norm=qk_norm,
-            pe_attn_head=pe_attn_head,
-            attn_mask_enabled=attn_mask_enabled,
-        )
-        self.norm2 = RMSNorm(hidden_size)
-        self.mlp = FeedForward(dim=hidden_size, mult=mlp_ratio, dropout=dropout, approximate="tanh")
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        mask: torch.Tensor | None,
-        rope: tuple[torch.Tensor, torch.Tensor | None] | None,
-    ) -> torch.Tensor:
-        x = x + self.attn(self.norm1(x), mask=mask, rope=rope)
-        x = x + self.mlp(self.norm2(x))
-        return x
-
-
-class FinalLayer(nn.Module):
-    """The final layer of DiT."""
-
-    def __init__(self, hidden_size: int, out_channels: int):
-        super().__init__()
-        self.norm_final = RMSNorm(hidden_size)
-        self.linear = nn.Linear(hidden_size, out_channels, bias=True)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = self.norm_final(x)
-        x = self.linear(x)
-        return x
-
-
-class CondEmbedder(nn.Module):
-    """Embeds LLM hidden states with optional CFG dropout."""
-
-    def __init__(self, input_feature_size: int, hidden_size: int):
-        super().__init__()
-        self.cond_embedder = nn.Linear(input_feature_size, hidden_size)
-
-    def forward(self, llm_cond: torch.Tensor) -> torch.Tensor:
-        return self.cond_embedder(llm_cond)
-
-
 class DiT(nn.Module):
     """Diffusion model with a Transformer backbone for audio latent generation."""
 
@@ -323,29 +119,6 @@ def forward_with_cfg(
         return model_out[:, -x.shape[1] :, :]
 
 
-#########################################################################################
-# CFM
-# Ported from:
-# https://github.com/inclusionAI/Ming/blob/e58533db227031990c5a6864dcf5f08fb53ed0d2/talker_module/cfm.py
-#########################################################################################
-
-
-def get_epss_timesteps(n, device, dtype):
-    dt = 1 / 32
-    predefined_timesteps = {
-        5: [0, 2, 4, 8, 16, 32],
-        6: [0, 2, 4, 6, 8, 16, 32],
-        7: [0, 2, 4, 6, 8, 16, 24, 32],
-        10: [0, 2, 4, 6, 8, 12, 16, 20, 24, 28, 32],
-        12: [0, 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32],
-        16: [0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 32],
-    }
-    t = predefined_timesteps.get(n, [])
-    if not t:
-        return torch.linspace(0, 1, n + 1, device=device, dtype=dtype)
-    return dt * torch.tensor(t, device=device, dtype=dtype)
-
-
 class CFM(nn.Module):
     """Conditional Flow Matching module for audio latent generation."""
 
diff --git a/vllm_omni/model_executor/models/ming_tts/aggregator.py b/vllm_omni/model_executor/models/ming_tts/aggregator.py
index 3af25b516aa..de617590403 100644
--- a/vllm_omni/model_executor/models/ming_tts/aggregator.py
+++ b/vllm_omni/model_executor/models/ming_tts/aggregator.py
@@ -6,8 +6,7 @@
 import torch.nn as nn
 from x_transformers.x_transformers import RotaryEmbedding
 
-from .config_ming_tts import MingTTSConfig
-from .fm.modules import DiTBlock, FinalLayer
+from vllm_omni.model_executor.models.ming_utils.dit import DiTBlock, FinalLayer
 
 
 class Aggregator(nn.Module):
@@ -64,13 +63,4 @@ def forward(self, x, mask=None):
         return x[:, :1, :]
 
 
-def build_ming_aggregator(cfg: MingTTSConfig) -> Aggregator:
-    """Build the Ming Stage-1 latent patch projector from the parsed config."""
-    return Aggregator(
-        in_channels=cfg.latent_dim,
-        llm_input_dim=cfg.llm_hidden_size,
-        **cfg.aggregator_config,
-    )
-
-
-__all__ = ["Aggregator", "build_ming_aggregator"]
+__all__ = ["Aggregator"]
diff --git a/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/audio_encoder.py b/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/audio_encoder.py
deleted file mode 100644
index d5d11121791..00000000000
--- a/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/audio_encoder.py
+++ /dev/null
@@ -1,135 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# Adopted from https://github.com/inclusionAI/Ming-omni-tts/blob/main/audio_tokenizer/audio_encoder.py
-
-from collections.abc import Iterable
-
-import torch
-import torch.nn.functional as F
-from torch import Tensor, nn
-from torchtune.modules import RotaryPositionalEmbeddings
-from transformers.cache_utils import DynamicCache
-
-try:
-    from flash_attn import flash_attn_func
-
-    _FLASH_ATTN_AVAILABLE = True
-except (ImportError, RuntimeError, OSError):
-    _FLASH_ATTN_AVAILABLE = False
-    flash_attn_func = None  # guarded by semantic_module_kwargs check above
-
-
-class LayerNorm(nn.LayerNorm):
-    def forward(self, x: Tensor) -> Tensor:
-        return super().forward(x.float()).type(x.dtype)
-
-
-class Linear(nn.Linear):
-    def forward(self, x: Tensor) -> Tensor:
-        return F.linear(
-            x,
-            self.weight.to(x.dtype),
-            None if self.bias is None else self.bias.to(x.dtype),
-        )
-
-
-class MultiHeadAttention(nn.Module):
-    def __init__(self, n_state: int, n_head: int, layer_idx: int):
-        super().__init__()
-        self.n_head = n_head
-        self.query = Linear(n_state, n_state)
-        self.key = Linear(n_state, n_state, bias=False)
-        self.value = Linear(n_state, n_state)
-        self.out = Linear(n_state, n_state)
-        self.layer_idx = layer_idx
-        self.rotary_embed = RotaryPositionalEmbeddings(dim=n_state // n_head)
-
-    def forward(self, x: Tensor, past_key_values=None):
-        q = self.query(x)
-        k = self.key(x)
-        v = self.value(x)
-
-        wv, qk, past_key_values = self.qkv_attention(q, k, v, past_key_values=past_key_values)
-        return self.out(wv), qk, past_key_values
-
-    def qkv_attention(self, q: Tensor, k: Tensor, v: Tensor, past_key_values=None):
-        if not _FLASH_ATTN_AVAILABLE:
-            raise ImportError("flash_attn is required for Ming semantic audio encoder attention.")
-        q = q.view(*q.shape[:2], self.n_head, -1)  # [B, T, nhead, dm]
-        k = k.view(*k.shape[:2], self.n_head, -1)  # [B, T, nhead, dm]
-        v = v.view(*v.shape[:2], self.n_head, -1)  # [B, T, nhead, dm]
-
-        if past_key_values is not None:
-            past_seen_tokens = past_key_values.get_seq_length(self.layer_idx) if past_key_values is not None else 0
-            cache_position = torch.arange(past_seen_tokens, past_seen_tokens + q.size(1), device=q.device)
-            cache_position = cache_position.unsqueeze(0)
-        else:
-            cache_position = None
-
-        q = self.rotary_embed(q, input_pos=cache_position)
-        k = self.rotary_embed(k, input_pos=cache_position)
-
-        q = q.permute(0, 2, 1, 3)
-        k = k.permute(0, 2, 1, 3)
-        v = v.permute(0, 2, 1, 3)
-
-        if past_key_values is not None:
-            k, v = past_key_values.update(k, v, self.layer_idx, {"cache_position": cache_position})
-
-        a = flash_attn_func(q.permute(0, 2, 1, 3), k.permute(0, 2, 1, 3), v.permute(0, 2, 1, 3), causal=True)
-        out = a.flatten(start_dim=2)
-        qk = None
-
-        return out, qk, past_key_values
-
-
-class ResidualAttentionBlock(nn.Module):
-    def __init__(self, n_state: int, n_head: int, layer_idx: int):
-        super().__init__()
-
-        self.attn = MultiHeadAttention(n_state, n_head, layer_idx)
-        self.attn_ln = LayerNorm(n_state)
-        n_mlp = n_state * 4
-        self.mlp = nn.Sequential(Linear(n_state, n_mlp), nn.GELU(), Linear(n_mlp, n_state))
-        self.mlp_ln = LayerNorm(n_state)
-        self.layer_idx = layer_idx
-
-    def forward(self, x: Tensor, past_key_values=None):
-        attn_out, _, past_key_values = self.attn(self.attn_ln(x), past_key_values=past_key_values)
-        x = x + attn_out
-        x = x + self.mlp(self.mlp_ln(x))
-        return x, past_key_values
-
-
-class WhisperAudioEncoder(nn.Module):
-    def __init__(self, n_state: int, n_head: int, n_layer: int):
-        super().__init__()
-
-        self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList(
-            [ResidualAttentionBlock(n_state, n_head, layer_idx=i) for i in range(n_layer)]
-        )
-        self.ln_post = LayerNorm(n_state)
-
-    def forward(self, whisper_feats: Tensor, use_cache=False, past_key_values=None, **kwargs):
-        if past_key_values is None and use_cache:
-            past_key_values = DynamicCache()
-
-        x = whisper_feats
-
-        for block in self.blocks:
-            x, past_key_values = block(x, past_key_values=past_key_values)
-
-        x = self.ln_post(x)
-
-        return x, past_key_values
-
-    @classmethod
-    def from_pretrained(cls, dims):
-        audio_encoder = cls(
-            dims["n_state"],
-            dims["n_head"],
-            dims["n_layer"],
-        )
-
-        audio_encoder.audio_emb_dim = dims["n_state"]
-        return audio_encoder
diff --git a/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/modeling_audio_vae.py b/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/modeling_audio_vae.py
index f72c12184fd..e4741adcee0 100644
--- a/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/modeling_audio_vae.py
+++ b/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/modeling_audio_vae.py
@@ -88,19 +88,13 @@ def __init__(self, config: AudioVAEconfig):
             patch_size=config.patch_size,
         )
 
-        # Semantic module is null for this checkpoint.
         if config.semantic_module_kwargs is not None:
-            from .audio_encoder import WhisperAudioEncoder
-
-            semantic_model = WhisperAudioEncoder.from_pretrained(dims=config.semantic_module_kwargs["whisper_encoder"])
-        else:
-            semantic_model = None
+            raise ValueError("Ming dense 0.5B expects semantic_module_kwargs to be null.")
 
         self.decoder = Decoder(
             decoder_args=dec_kwargs["backbone"],  # IMPORTANT: decoder uses dec_kwargs.backbone
             output_dim=dec_kwargs["output_dim"],  # Ming checkpoint uses 882
             latent_dim=dec_kwargs["latent_dim"],
-            semantic_model=semantic_model,
             patch_size=config.patch_size,
         )
 
diff --git a/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/vae_modules.py b/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/vae_modules.py
index 3920f4be7d4..28a4ce44aea 100644
--- a/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/vae_modules.py
+++ b/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/vae_modules.py
@@ -6,67 +6,9 @@
 import torch.nn.functional as F
 from transformers import Qwen2Config, Qwen2Model
 
-from .istft import ISTFTHead
-
-
-class StreamingLinearUpsample(nn.Module):
-    def __init__(self, scale_factor=4):
-        super().__init__()
-        self.scale_factor = scale_factor
-        self.upsampler = nn.Upsample(scale_factor=scale_factor, mode="linear", align_corners=False)
-
-    def forward(self, x, state=None, is_last=False):
-        if x is None and is_last and (state is None or state.get("prev_chunk") is None):
-            raise ValueError("Received end-of-stream without any latent chunk to upsample.")
-        # 初始化状态
-        if state is None:
-            state = {"prev_chunk": None, "history_last": None, "is_first": True}
-
-        if x is None and not is_last:
-            return None, state
-
-        if state["is_first"] and is_last:
-            out = self.upsampler(x.transpose(1, 2)).transpose(1, 2)
-            return out, None  # 结束后清除状态
-
-        output_chunks = []
-
-        if state["is_first"]:
-            state["prev_chunk"] = x
-            state["is_first"] = False
-            if not is_last:
-                return None, state
-
-        if state["prev_chunk"] is not None:
-            p = state["prev_chunk"].transpose(1, 2)
-
-            if state["history_last"] is None:
-                lookahead = x[:, :1, :].transpose(1, 2)
-                inp = torch.cat([p, lookahead], dim=2)
-                up = self.upsampler(inp)
-                out_prev = up[:, :, : p.size(2) * self.scale_factor]
-            else:
-                lookahead = x[:, :1, :].transpose(1, 2)
-                inp = torch.cat([state["history_last"], p, lookahead], dim=2)
-                up = self.upsampler(inp)
-                start = self.scale_factor
-                end = start + p.size(2) * self.scale_factor
-                out_prev = up[:, :, start:end]
+from vllm_omni.model_executor.models.ming_utils.audio_vae import StreamingLinearUpsample
 
-            output_chunks.append(out_prev.transpose(1, 2))
-            state["history_last"] = p[:, :, -1:]
-            state["prev_chunk"] = x
-
-        if is_last:
-            p = state["prev_chunk"].transpose(1, 2)
-            inp = torch.cat([state["history_last"], p], dim=2)
-            up = self.upsampler(inp)
-            out_last = up[:, :, self.scale_factor :]
-            output_chunks.append(out_last.transpose(1, 2))
-            state = None  # 结束
-
-        final_out = torch.cat(output_chunks, dim=1) if output_chunks else None
-        return final_out, state
+from .istft import ISTFTHead
 
 
 class Encoder(nn.Module):
@@ -130,7 +72,7 @@ def forward(self, waveform):
 
 
 class Decoder(nn.Module):
-    def __init__(self, decoder_args, output_dim=320, latent_dim=64, semantic_model=None, patch_size=-1):
+    def __init__(self, decoder_args, output_dim=320, latent_dim=64, patch_size=-1):
         super().__init__()
         config = Qwen2Config.from_dict(config_dict=decoder_args)
         self.decoder = Qwen2Model(config)
@@ -138,14 +80,6 @@ def __init__(self, decoder_args, output_dim=320, latent_dim=64, semantic_model=N
         self.latent_dim = latent_dim
         self.fc1 = nn.Linear(latent_dim, config.hidden_size)
 
-        if semantic_model is not None:
-            self.gelu = nn.GELU()
-            self.fc2 = nn.Linear(config.hidden_size, semantic_model.audio_emb_dim)
-            self.semantic_model = semantic_model
-            self.fc3 = nn.Linear(semantic_model.audio_emb_dim, config.hidden_size)
-        else:
-            self.semantic_model = None
-
         self.hop_length = output_dim
         self.head = ISTFTHead(
             dim=config.hidden_size, n_fft=self.hop_length * 4, hop_length=self.hop_length, padding="same"
@@ -154,21 +88,9 @@ def __init__(self, decoder_args, output_dim=320, latent_dim=64, semantic_model=N
         if self.patch_size != -1:
             self.upsampling = StreamingLinearUpsample(scale_factor=patch_size)
 
-    def forward(self, x, only_semantic_emb=False, past_key_values=None, use_cache=False):
+    def forward(self, x):
         x = self.fc1(x)
 
-        if self.semantic_model is not None:
-            x = self.fc2(self.gelu(x))
-            x, past_key_values = self.semantic_model(
-                whisper_feats=x, past_key_values=past_key_values, use_cache=use_cache
-            )
-            unified_emb = x
-            if only_semantic_emb:
-                return unified_emb, past_key_values
-            x = self.fc3(x)
-        else:
-            unified_emb = None
-
         if self.patch_size != -1:
             x = self.upsampling(x.transpose(1, 2)).transpose(1, 2)
 
@@ -177,7 +99,7 @@ def forward(self, x, only_semantic_emb=False, past_key_values=None, use_cache=Fa
 
         x, _ = self.head(x)
 
-        return x, unified_emb
+        return x, None
 
     def low_level_reconstruct(self, x, past_key_values=None, use_cache=False, stream_state=None, last_chunk=False):
         # Guard against None on first chunk (connector initialises per-request)
diff --git a/vllm_omni/model_executor/models/ming_tts/config_ming_tts.py b/vllm_omni/model_executor/models/ming_tts/config_ming_tts.py
index f0a08768a91..5b2036a63dd 100644
--- a/vllm_omni/model_executor/models/ming_tts/config_ming_tts.py
+++ b/vllm_omni/model_executor/models/ming_tts/config_ming_tts.py
@@ -18,7 +18,6 @@
     DEFAULT_CFG,
     DEFAULT_SIGMA,
     DEFAULT_TEMPERATURE,
-    EOS_TOKEN_ID,
     HISTORY_PATCH_SIZE,
     KEY_CFG,
     KEY_CHUNK_ID,
@@ -41,7 +40,6 @@
     LLM_HIDDEN_SIZE,
     LLM_VOCAB_SIZE,
     MAX_DECODE_STEPS,
-    PAD_TOKEN_ID,
     PATCH_SIZE,
     SAMPLE_RATE,
     STOP_HEAD_MIN_STEPS,
@@ -63,21 +61,6 @@ def _coerce_qwen2_config(value: Any) -> Qwen2Config:
     raise TypeError(f"Unsupported llm_config type for Ming dense config: {type(value)!r}")
 
 
-def _coerce_ming_dense_audio_vae_config(value: Any) -> AudioVAEconfig | None:
-    if value is None:
-        return None
-    if isinstance(value, AudioVAEconfig):
-        value = value.to_dict()
-    elif isinstance(value, PretrainedConfig):
-        value = value.to_dict()
-    elif isinstance(value, dict):
-        value = dict(value)
-    else:
-        raise TypeError(f"Unsupported audio_tokenizer_config type for Ming dense config: {type(value)!r}")
-
-    return AudioVAEconfig(**value)
-
-
 class MingDenseConfig(PretrainedConfig):
     model_type = "dense"
 
@@ -93,7 +76,7 @@ def __init__(
         self.llm_config = _coerce_qwen2_config(llm_config or {})
         self.ditar_config = dict(ditar_config or {})
         self.aggregator_config = dict(aggregator_config or {})
-        self.audio_tokenizer_config = _coerce_ming_dense_audio_vae_config(audio_tokenizer_config)
+        self.audio_tokenizer_config = _coerce_audio_vae_config(audio_tokenizer_config)
         super().__init__(architectures=architectures, **kwargs)
 
     def get_text_config(self, decoder: bool = False, **kwargs: Any) -> Qwen2Config:
@@ -132,8 +115,6 @@ class MingTTSConfig:
     latent_left_context: int = LATENT_LEFT_CONTEXT
 
     text_eos_token_id: int = TEXT_EOS_TOKEN_ID
-    eos_token_id: int = EOS_TOKEN_ID
-    pad_token_id: int = PAD_TOKEN_ID
     audio_dummy_token_id: int = AUDIO_DUMMY_TOKEN_ID
     audio_start_token_id: int = AUDIO_START_TOKEN_ID
     audio_end_token_id: int = AUDIO_END_TOKEN_ID
@@ -156,9 +137,7 @@ def from_hf_config(cls, hf_config: PretrainedConfig) -> MingTTSConfig:
         atc_patch_size = _nested_get(atc, "patch_size", default=VAE_PATCH_SIZE)
         atc_sample_rate = _nested_get(atc, "sample_rate", default=SAMPLE_RATE)
 
-        enc_input_dim = _nested_get(atc, "enc_kwargs", "input_dim", default=AUDIO_FRAME_HOP)
         enc_hop_size = _nested_get(atc, "enc_kwargs", "hop_size", default=AUDIO_FRAME_HOP)
-        dec_output_dim = _nested_get(atc, "dec_kwargs", "output_dim", default=AUDIO_FRAME_HOP)
 
         cfg = cls(
             llm_hidden_size=llm_dict.get("hidden_size", LLM_HIDDEN_SIZE),
@@ -174,32 +153,11 @@ def from_hf_config(cls, hf_config: PretrainedConfig) -> MingTTSConfig:
             sample_rate=atc_sample_rate,
             audio_frame_hop=enc_hop_size if enc_hop_size is not None else AUDIO_FRAME_HOP,
         )
-        cfg._enc_input_dim = enc_input_dim
-        cfg._enc_hop_size = enc_hop_size
-        cfg._dec_output_dim = dec_output_dim
         return cfg
 
     def validate(self) -> None:
         validate_ming_tts_config(self)
 
-    def make_qwen2_config(self) -> Qwen2Config:
-        """Reconstruct Qwen2Config for Stage-1 LLM backbone init."""
-        if not self.llm_config:
-            raise ValueError("llm_config is empty; from_hf_config() failed to parse nested llm_config.")
-        return Qwen2Config.from_dict(self.llm_config)
-
-    @property
-    def latent_patch_shape(self) -> tuple[int, int]:
-        return (self.patch_size, self.latent_dim)
-
-    @property
-    def chunk_frames(self) -> int:
-        return self.latent_chunk_size * self.patch_size
-
-    @property
-    def approx_chunk_seconds(self) -> float:
-        return (self.chunk_frames * self.audio_frame_hop) / float(self.sample_rate)
-
 
 __all__ = [
     "AGGREGATOR_HIDDEN_SIZE",
@@ -211,7 +169,6 @@ def approx_chunk_seconds(self) -> float:
     "DEFAULT_CFG",
     "DEFAULT_SIGMA",
     "DEFAULT_TEMPERATURE",
-    "EOS_TOKEN_ID",
     "HISTORY_PATCH_SIZE",
     "KEY_CFG",
     "KEY_CHUNK_ID",
@@ -236,7 +193,6 @@ def approx_chunk_seconds(self) -> float:
     "MAX_DECODE_STEPS",
     "MingDenseConfig",
     "MingTTSConfig",
-    "PAD_TOKEN_ID",
     "PATCH_SIZE",
     "SAMPLE_RATE",
     "STOP_HEAD_MIN_STEPS",
diff --git a/vllm_omni/model_executor/models/ming_tts/constants.py b/vllm_omni/model_executor/models/ming_tts/constants.py
index b7e0b9bb78a..2f59e209401 100644
--- a/vllm_omni/model_executor/models/ming_tts/constants.py
+++ b/vllm_omni/model_executor/models/ming_tts/constants.py
@@ -13,10 +13,6 @@
 VISION_START_TOKEN_ID = 151652  # <|vision_start|>
 
 TEXT_EOS_TOKEN_ID = 151669  # <text_eos>
-PAD_TOKEN_ID = 151643  # <|endoftext|>
-
-# Backward-compat alias for older code paths
-EOS_TOKEN_ID = TEXT_EOS_TOKEN_ID
 
 
 # ---------------------------------------------------------------------------
diff --git a/vllm_omni/model_executor/models/ming_tts/flowloss_head.py b/vllm_omni/model_executor/models/ming_tts/flowloss_head.py
index f2c5cc0753b..30a08b6e11f 100644
--- a/vllm_omni/model_executor/models/ming_tts/flowloss_head.py
+++ b/vllm_omni/model_executor/models/ming_tts/flowloss_head.py
@@ -17,9 +17,6 @@ def __init__(self, z_channels, llm_cond_dim, **kwargs):
         self.z_channels = z_channels
         self.cfm = CFM(model=DiT(in_channels=z_channels, llm_cond_dim=llm_cond_dim, **kwargs))
 
-    def forward(self, cond, target, latent_history, mask, patch_size):
-        return self.cfm(cond=cond, target=target, latent_history=latent_history, mask=mask, patch_size=patch_size)
-
     def sample(self, z, latent_history, cfg=2.0, patch_size=1, sigma=0.25, temperature=0):
         if z.ndim != 3:
             raise ValueError(f"Expected z rank-3 [Batch, Time, Dimension], got {tuple(z.shape)}")
diff --git a/vllm_omni/model_executor/models/ming_tts/fm/cfm.py b/vllm_omni/model_executor/models/ming_tts/fm/cfm.py
index 601f5f72a6e..d53a19d798b 100644
--- a/vllm_omni/model_executor/models/ming_tts/fm/cfm.py
+++ b/vllm_omni/model_executor/models/ming_tts/fm/cfm.py
@@ -6,6 +6,8 @@
 import torch
 from torch import nn
 
+from vllm_omni.model_executor.models.ming_utils.dit import get_epss_timesteps
+
 
 class Solver:
     def __init__(self, func, y0, sigma=0.25, temperature=1.5) -> None:
@@ -45,22 +47,6 @@ def _linear_interp(self, t0, t1, y0, y1, t):
         return y0 + slope * (y1 - y0)
 
 
-def get_epss_timesteps(n, device, dtype):
-    dt = 1 / 32
-    predefined_timesteps = {
-        5: [0, 2, 4, 8, 16, 32],
-        6: [0, 2, 4, 6, 8, 16, 32],
-        7: [0, 2, 4, 6, 8, 16, 24, 32],
-        10: [0, 2, 4, 6, 8, 12, 16, 20, 24, 28, 32],
-        12: [0, 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32],
-        16: [0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 32],
-    }
-    t = predefined_timesteps.get(n, [])
-    if not t:
-        return torch.linspace(0, 1, n + 1, device=device, dtype=dtype)
-    return dt * torch.tensor(t, device=device, dtype=dtype)
-
-
 class CFM(nn.Module):
     def __init__(
         self,
@@ -89,8 +75,6 @@ def sample(
     ):
         if steps <= 0:
             raise ValueError(f"steps must be positive, got {steps}")
-        if patch_size <= 0:
-            raise ValueError(f"patch_size must be positive, got {patch_size}")
         if noise.ndim != 3:
             raise ValueError(f"Expected noise rank-3 [Batch, Dimension, Time], got {tuple(noise.shape)}")
         if c.ndim != 3:
@@ -132,12 +116,10 @@ def fn(t, x):
             return pred + (pred - null_pred) * cfg_scale
 
         y0 = noise.transpose(1, 2)
-        t_start = 0
-
-        if t_start == 0 and use_epss:  # use Empirically Pruned Step Sampling for low NFE
+        if use_epss:
             t = get_epss_timesteps(steps, device=self.device, dtype=noise.dtype)
         else:
-            t = torch.linspace(t_start, 1, steps + 1, device=self.device, dtype=noise.dtype)
+            t = torch.linspace(0, 1, steps + 1, device=self.device, dtype=noise.dtype)
         if sway_sampling_coef is not None:
             t = t + sway_sampling_coef * (torch.cos(torch.pi / 2 * t) - 1 + t)
 
diff --git a/vllm_omni/model_executor/models/ming_tts/fm/dit.py b/vllm_omni/model_executor/models/ming_tts/fm/dit.py
index a45a2db81bc..4cfe9867396 100644
--- a/vllm_omni/model_executor/models/ming_tts/fm/dit.py
+++ b/vllm_omni/model_executor/models/ming_tts/fm/dit.py
@@ -8,7 +8,7 @@
 import torch.nn as nn
 from x_transformers.x_transformers import RotaryEmbedding
 
-from .modules import DiTBlock, FinalLayer
+from vllm_omni.model_executor.models.ming_utils.dit import CondEmbedder, DiTBlock, FinalLayer
 
 
 class SinusPositionEmbedding(nn.Module):
@@ -43,20 +43,6 @@ def forward(self, timestep):
         return time
 
 
-class CondEmbedder(nn.Module):
-    def __init__(self, input_feature_size, hidden_size, dropout_prob):
-        super().__init__()
-        del dropout_prob
-        self.cond_embedder = nn.Linear(input_feature_size, hidden_size)
-
-    def forward(self, llm_cond):
-        if llm_cond.ndim != 3:
-            raise ValueError(f"Expected conditioning rank-3 [Batch, Time, Dimension], got {tuple(llm_cond.shape)}")
-        llm_cond = self.cond_embedder(llm_cond)
-
-        return llm_cond
-
-
 class DiT(nn.Module):
     def __init__(
         self,
@@ -135,8 +121,6 @@ def forward(self, x, t, c, latent_history, mask=None):
         return x
 
     def forward_with_cfg(self, x, t, c, cfg_scale, latent_history, patch_size):
-        if patch_size <= 0:
-            raise ValueError(f"patch_size must be positive, got {patch_size}")
         if not cfg_scale == 1:
             x = torch.cat([x, x], dim=0)
             latent_history = torch.cat([latent_history, latent_history], dim=0)
diff --git a/vllm_omni/model_executor/models/ming_tts/fm/modules.py b/vllm_omni/model_executor/models/ming_tts/fm/modules.py
deleted file mode 100644
index 1163f8f1837..00000000000
--- a/vllm_omni/model_executor/models/ming_tts/fm/modules.py
+++ /dev/null
@@ -1,147 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# Adopted from https://github.com/inclusionAI/Ming-omni-tts/blob/main/fm/modules.py
-import torch
-import torch.nn.functional as F
-from torch import nn
-from x_transformers.x_transformers import apply_rotary_pos_emb
-
-
-class RMSNorm(nn.Module):
-    def __init__(self, dim: int, eps: float):
-        super().__init__()
-        self.eps = eps
-        self.weight = nn.Parameter(torch.ones(dim))
-        self.native_rms_norm = float(torch.__version__[:3]) >= 2.4
-
-    def forward(self, x):
-        if self.native_rms_norm:
-            if self.weight.dtype in [torch.float16, torch.bfloat16]:
-                x = x.to(self.weight.dtype)
-            x = F.rms_norm(x, normalized_shape=(x.shape[-1],), weight=self.weight, eps=self.eps)
-        else:
-            variance = x.to(torch.float32).pow(2).mean(-1, keepdim=True)
-            x = x * torch.rsqrt(variance + self.eps)
-            if self.weight.dtype in [torch.float16, torch.bfloat16]:
-                x = x.to(self.weight.dtype)
-            x = x * self.weight
-
-        return x
-
-
-class FeedForward(nn.Module):
-    def __init__(self, dim, dim_out=None, mult=4, dropout=0.0, approximate: str = "none"):
-        super().__init__()
-        inner_dim = int(dim * mult)
-        dim_out = dim_out if dim_out is not None else dim
-
-        activation = nn.GELU(approximate=approximate)
-        project_in = nn.Sequential(nn.Linear(dim, inner_dim), activation)
-        self.ff = nn.Sequential(project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out))
-
-    def forward(self, x):
-        return self.ff(x)
-
-
-class Attention(nn.Module):
-    def __init__(
-        self,
-        dim: int,
-        heads: int = 8,
-        dim_head: int = 64,
-        dropout: float = 0.0,
-    ):
-        super().__init__()
-
-        if not hasattr(F, "scaled_dot_product_attention"):
-            raise ImportError("SDPA requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
-
-        self.dim = dim
-        self.heads = heads
-        self.inner_dim = dim_head * heads
-        self.dropout = dropout
-        self.to_q = nn.Linear(dim, self.inner_dim)
-        self.to_k = nn.Linear(dim, self.inner_dim)
-        self.to_v = nn.Linear(dim, self.inner_dim)
-
-        self.to_out = nn.ModuleList([])
-        self.to_out.append(nn.Linear(self.inner_dim, dim))
-        self.to_out.append(nn.Dropout(dropout))
-
-    def forward(
-        self,
-        x: float,  # noised input x
-        mask=None,
-        rope=None,  # rotary position embedding for x
-    ) -> torch.Tensor:
-        if x.ndim != 3:
-            raise ValueError(f"Expected x rank-3 [Batch, Time, Dimension], got {tuple(x.shape)}")
-        if x.shape[-1] != self.dim:
-            raise ValueError(f"x feature dim mismatch: got {x.shape[-1]}, expected {self.dim}")
-        if mask is not None:
-            if mask.ndim != 2:
-                raise ValueError(f"Expected mask rank-2 [Batch, Time], got {tuple(mask.shape)}")
-            if mask.shape[0] != x.shape[0] or mask.shape[1] != x.shape[1]:
-                raise ValueError(f"Mask shape mismatch: got {tuple(mask.shape)}, expected {tuple(x.shape[:2])}")
-
-        batch_size = x.shape[0]
-
-        query = self.to_q(x)
-        key = self.to_k(x)
-        value = self.to_v(x)
-
-        # attention
-        inner_dim = key.shape[-1]
-        head_dim = inner_dim // self.heads
-        query = query.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
-        key = key.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
-        value = value.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
-
-        # apply rotary position embedding
-        if rope is not None:
-            freqs, xpos_scale = rope
-            q_xpos_scale, k_xpos_scale = (xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)
-            query = apply_rotary_pos_emb(query, freqs, q_xpos_scale)
-            key = apply_rotary_pos_emb(key, freqs, k_xpos_scale)
-
-        x = F.scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False)
-        x = x.transpose(1, 2).reshape(batch_size, -1, self.heads * head_dim)
-        x = x.to(query.dtype)
-        x = self.to_out[0](x)
-        x = self.to_out[1](x)
-
-        if mask is not None:
-            mask = mask.unsqueeze(-1)
-            x = x.masked_fill(~mask, 0.0)
-
-        return x
-
-
-class DiTBlock(nn.Module):
-    def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, dropout=0.1, **kwargs):
-        super().__init__()
-        self.norm1 = RMSNorm(hidden_size, eps=1e-6)
-        self.attn = Attention(dim=hidden_size, heads=num_heads, dim_head=hidden_size // num_heads, dropout=dropout)
-        self.norm2 = RMSNorm(hidden_size, eps=1e-6)
-        self.mlp = FeedForward(dim=hidden_size, mult=mlp_ratio, dropout=dropout, approximate="tanh")
-
-    def forward(self, x, mask, rope):
-        x = x + self.attn(self.norm1(x), mask=mask, rope=rope)
-        x = x + self.mlp(self.norm2(x))
-        return x
-
-
-class FinalLayer(nn.Module):
-    """
-    The final layer of DiT.
-    """
-
-    def __init__(self, hidden_size, out_channels):
-        super().__init__()
-        self.norm_final = RMSNorm(hidden_size, eps=1e-6)
-        self.linear = nn.Linear(hidden_size, out_channels, bias=True)
-
-    def forward(self, x):
-        x = self.norm_final(x)
-        x = self.linear(x)
-        return x
diff --git a/vllm_omni/model_executor/models/ming_tts/ingress.py b/vllm_omni/model_executor/models/ming_tts/ingress.py
index 92ab2860232..03187ee846e 100644
--- a/vllm_omni/model_executor/models/ming_tts/ingress.py
+++ b/vllm_omni/model_executor/models/ming_tts/ingress.py
@@ -171,7 +171,3 @@ def __call__(self, prompt: Any) -> Any:
                 elapsed_ms,
             )
         return finalized_prompt
-
-
-def build_ming_ingress_processor(*, vllm_config: Any, tokenizer: Any) -> MingIngressProcessor:
-    return MingIngressProcessor(vllm_config=vllm_config, tokenizer=tokenizer)
diff --git a/vllm_omni/model_executor/models/ming_tts/prompt_utils.py b/vllm_omni/model_executor/models/ming_tts/prompt_utils.py
index e647fd68f74..6f6863e47b4 100644
--- a/vllm_omni/model_executor/models/ming_tts/prompt_utils.py
+++ b/vllm_omni/model_executor/models/ming_tts/prompt_utils.py
@@ -21,14 +21,11 @@
 from .audio_tokenizer.modeling_audio_vae import AudioVAE
 from .config_ming_tts import (
     AUDIO_FRAME_HOP,
-    KEY_CFG,
     KEY_MAX_DECODE_STEPS,
     KEY_MIN_DECODE_STEPS,
     KEY_PROMPT_LATENTS,
     KEY_REQUEST_ID,
-    KEY_SIGMA,
     KEY_SPEAKER_EMBEDDING,
-    KEY_TEMPERATURE,
     LATENT_DIM,
     PATCH_SIZE,
     SAMPLE_RATE,
@@ -342,28 +339,6 @@ def build_ming_dense_prompt(
     }
 
 
-def build_runtime_controls(
-    *,
-    cfg: float | None = None,
-    sigma: float | None = None,
-    temperature: float | None = None,
-    min_decode_steps: int | None = None,
-    max_decode_steps: int | None = None,
-) -> dict[str, torch.Tensor]:
-    controls = {}
-    if cfg is not None:
-        controls[KEY_CFG] = torch.tensor(float(cfg), dtype=torch.float32)
-    if sigma is not None:
-        controls[KEY_SIGMA] = torch.tensor(float(sigma), dtype=torch.float32)
-    if temperature is not None:
-        controls[KEY_TEMPERATURE] = torch.tensor(float(temperature), dtype=torch.float32)
-    if min_decode_steps is not None:
-        controls[KEY_MIN_DECODE_STEPS] = torch.tensor(int(min_decode_steps), dtype=torch.int32)
-    if max_decode_steps is not None:
-        controls[KEY_MAX_DECODE_STEPS] = torch.tensor(int(max_decode_steps), dtype=torch.int32)
-    return controls
-
-
 def _resolve_prompt_latents(wrapper: Any, info_dict: dict[str, Any]) -> dict[str, torch.Tensor] | None:
     raw_latents = info_dict.get(KEY_PROMPT_LATENTS, info_dict.get("prompt_latents"))
     raw_waveform = info_dict.get("prompt_waveform", info_dict.get("prompt_waveforms"))
@@ -616,7 +591,6 @@ def _find_speaker_placeholder_positions(input_ids: torch.Tensor, hf_config: Any)
     "DEFAULT_PROMPT",
     "build_dense_prompt_token_ids",
     "build_ming_dense_prompt",
-    "build_runtime_controls",
     "coerce_prompt_waveform",
     "coerce_speaker_embeddings",
     "count_prompt_latent_patches",
diff --git a/vllm_omni/model_executor/models/ming_tts/validation.py b/vllm_omni/model_executor/models/ming_tts/validation.py
index 4d7511c77ef..3f674f69e11 100644
--- a/vllm_omni/model_executor/models/ming_tts/validation.py
+++ b/vllm_omni/model_executor/models/ming_tts/validation.py
@@ -38,28 +38,20 @@ def _to_plain_dict(obj: Any) -> dict[str, Any]:
 
 
 def _coerce_audio_vae_config(atc_raw: Any) -> AudioVAEconfig | None:
-    """
-    Normalize audio_tokenizer_config into AudioVAEconfig when possible.
-    Handles:
-      - already AudioVAEconfig
-      - dict
-      - PretrainedConfig-like object
-    """
     if atc_raw is None:
         return None
-    atc_dict = _to_plain_dict(atc_raw)
-    if not atc_dict:
-        return atc_raw
-
-    if hasattr(AudioVAEconfig, "from_dict") and callable(getattr(AudioVAEconfig, "from_dict")):
-        try:
-            return AudioVAEconfig.from_dict(atc_dict)
-        except Exception:
-            pass
-    try:
-        return AudioVAEconfig(**atc_dict)
-    except Exception:
+    if isinstance(atc_raw, AudioVAEconfig):
         return atc_raw
+    if isinstance(atc_raw, PretrainedConfig):
+        atc_dict = atc_raw.to_dict()
+    elif isinstance(atc_raw, dict):
+        atc_dict = dict(atc_raw)
+    elif hasattr(atc_raw, "to_dict") and callable(atc_raw.to_dict):
+        atc_dict = dict(atc_raw.to_dict())
+    else:
+        raise TypeError(f"Unsupported audio_tokenizer_config type for Ming dense config: {type(atc_raw)!r}")
+
+    return AudioVAEconfig(**atc_dict)
 
 
 def _nested_get(obj: Any, *keys: str, default: Any = None) -> Any:
diff --git a/vllm_omni/model_executor/models/ming_utils/audio_dsp.py b/vllm_omni/model_executor/models/ming_utils/audio_dsp.py
index e79ada4339a..be8cd489cff 100644
--- a/vllm_omni/model_executor/models/ming_utils/audio_dsp.py
+++ b/vllm_omni/model_executor/models/ming_utils/audio_dsp.py
@@ -187,7 +187,6 @@ def forward(
             Audio, predicted spectrogram coefficients, and streaming buffers.
         """
         x_pred = self.out(x)
-        # x_pred = x
         x_pred = x_pred.transpose(1, 2)
         mag, p = x_pred.chunk(2, dim=1)
         mag = torch.exp(mag)
diff --git a/vllm_omni/model_executor/models/ming_utils/audio_vae.py b/vllm_omni/model_executor/models/ming_utils/audio_vae.py
new file mode 100644
index 00000000000..3f5e72a6b58
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_utils/audio_vae.py
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+import torch.nn as nn
+
+
+class StreamingLinearUpsample(nn.Module):
+    """Chunked linear upsampler: each chunk is emitted one call late so its seams can use neighbouring frames."""
+    def __init__(self, scale_factor=4):
+        super().__init__()
+        self.scale_factor = scale_factor
+        self.upsampler = nn.Upsample(scale_factor=scale_factor, mode="linear", align_corners=False)
+
+    def forward(self, x, state=None, is_last=False):
+        if x is None and is_last and (state is None or state.get("prev_chunk") is None):
+            raise ValueError("Received end-of-stream without any latent chunk to upsample.")
+        if state is None:
+            state = {"prev_chunk": None, "history_last": None, "is_first": True}
+
+        if x is None and not is_last:
+            return None, state
+
+        if state["is_first"] and is_last:
+            out = self.upsampler(x.transpose(1, 2)).transpose(1, 2)
+            return out, None
+
+        output_chunks = []
+
+        if state["is_first"]:
+            state["prev_chunk"] = x
+            state["is_first"] = False
+            if not is_last:
+                return None, state
+
+        if x is not None and state["prev_chunk"] is not None:  # x is None only for a bare end-of-stream flush
+            p = state["prev_chunk"].transpose(1, 2)
+            lookahead = x[:, :1, :].transpose(1, 2)  # first frame of the new chunk closes the right seam of p
+            if state["history_last"] is None:
+                inp = torch.cat([p, lookahead], dim=2)
+                up = self.upsampler(inp)
+                out_prev = up[:, :, : p.size(2) * self.scale_factor]
+            else:
+                inp = torch.cat([state["history_last"], p, lookahead], dim=2)
+                up = self.upsampler(inp)
+                start = self.scale_factor
+                end = start + p.size(2) * self.scale_factor
+                out_prev = up[:, :, start:end]
+
+            output_chunks.append(out_prev.transpose(1, 2))
+            state["history_last"] = p[:, :, -1:]
+            state["prev_chunk"] = x
+
+        if is_last:
+            p = state["prev_chunk"].transpose(1, 2)
+            hist = state["history_last"]  # still None when only one chunk was buffered before the flush
+            inp = p if hist is None else torch.cat([hist, p], dim=2)
+            up = self.upsampler(inp)
+            out_last = up if hist is None else up[:, :, self.scale_factor :]
+            output_chunks.append(out_last.transpose(1, 2))
+            state = None
+
+        final_out = torch.cat(output_chunks, dim=1) if output_chunks else None
+        return final_out, state
diff --git a/vllm_omni/model_executor/models/ming_utils/dit.py b/vllm_omni/model_executor/models/ming_utils/dit.py
new file mode 100644
index 00000000000..0a3a153548f
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_utils/dit.py
@@ -0,0 +1,220 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+import torch.nn.functional as F
+from torch import nn
+from x_transformers.x_transformers import apply_rotary_pos_emb
+
+
+class RMSNorm(nn.Module):
+    def __init__(self, dim, eps=1e-6):
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(dim))
+        self.native_rms_norm = hasattr(F, "rms_norm")  # slicing __version__[:3] misreads "2.10" as 2.1
+
+    def forward(self, x):
+        if self.native_rms_norm:
+            if self.weight.dtype in [torch.float16, torch.bfloat16]:
+                x = x.to(self.weight.dtype)
+            return F.rms_norm(x, normalized_shape=(x.shape[-1],), weight=self.weight, eps=self.eps)
+
+        variance = x.to(torch.float32).pow(2).mean(-1, keepdim=True)  # mean square computed in float32
+        x = x * torch.rsqrt(variance + self.eps)
+        if self.weight.dtype in [torch.float16, torch.bfloat16]:
+            x = x.to(self.weight.dtype)
+        return x * self.weight
+
+
+class FeedForward(nn.Module):
+    def __init__(self, dim, dim_out=None, mult=4, dropout=0.0, approximate="none"):
+        super().__init__()
+        inner_dim = int(dim * mult)
+        dim_out = dim_out if dim_out is not None else dim
+
+        activation = nn.GELU(approximate=approximate)
+        project_in = nn.Sequential(nn.Linear(dim, inner_dim), activation)
+        self.ff = nn.Sequential(project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out))
+
+    def forward(self, x):
+        return self.ff(x)
+
+
+class Attention(nn.Module):
+    def __init__(
+        self,
+        dim,
+        heads=8,
+        dim_head=64,
+        dropout=0.0,
+        qk_norm=None,
+        pe_attn_head=None,
+        attn_mask_enabled=True,
+    ):
+        super().__init__()
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("SDPA requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+
+        self.dim = dim
+        self.heads = heads
+        self.inner_dim = dim_head * heads
+        self.dropout = dropout
+        self.to_q = nn.Linear(dim, self.inner_dim)
+        self.to_k = nn.Linear(dim, self.inner_dim)
+        self.to_v = nn.Linear(dim, self.inner_dim)
+        if qk_norm is None:
+            self.q_norm = None
+            self.k_norm = None
+        elif qk_norm == "rms_norm":
+            self.q_norm = RMSNorm(dim_head)
+            self.k_norm = RMSNorm(dim_head)
+        else:
+            raise ValueError(f"Unimplemented qk_norm: {qk_norm}")
+
+        self.to_out = nn.ModuleList([])
+        self.to_out.append(nn.Linear(self.inner_dim, dim))
+        self.to_out.append(nn.Dropout(dropout))
+        self.pe_attn_head = pe_attn_head
+        self.attn_mask_enabled = attn_mask_enabled
+
+    def forward(self, x, mask=None, rope=None):
+        if x.ndim != 3:
+            raise ValueError(f"Expected x rank-3 [Batch, Time, Dimension], got {tuple(x.shape)}")
+        if x.shape[-1] != self.dim:
+            raise ValueError(f"x feature dim mismatch: got {x.shape[-1]}, expected {self.dim}")
+        if mask is not None:
+            if mask.ndim != 2:
+                raise ValueError(f"Expected mask rank-2 [Batch, Time], got {tuple(mask.shape)}")
+            if mask.shape[0] != x.shape[0] or mask.shape[1] != x.shape[1]:
+                raise ValueError(f"Mask shape mismatch: got {tuple(mask.shape)}, expected {tuple(x.shape[:2])}")
+
+        batch_size = x.shape[0]
+        query = self.to_q(x)
+        key = self.to_k(x)
+        value = self.to_v(x)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // self.heads
+        # [Batch, Time, Dimension] -> [Batch, Heads, Time, HeadDimension].
+        query = query.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
+        # [Batch, Time, Dimension] -> [Batch, Heads, Time, HeadDimension].
+        key = key.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
+        # [Batch, Time, Dimension] -> [Batch, Heads, Time, HeadDimension].
+        value = value.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
+
+        if self.q_norm is not None:
+            query = self.q_norm(query)
+        if self.k_norm is not None:
+            key = self.k_norm(key)
+
+        if rope is not None:
+            freqs, xpos_scale = rope
+            q_xpos_scale, k_xpos_scale = (xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)
+            if self.pe_attn_head is not None:
+                on = self.pe_attn_head
+                query[:, :on, :, :] = apply_rotary_pos_emb(query[:, :on, :, :], freqs, q_xpos_scale)
+                key[:, :on, :, :] = apply_rotary_pos_emb(key[:, :on, :, :], freqs, k_xpos_scale)
+            else:
+                query = apply_rotary_pos_emb(query, freqs, q_xpos_scale)
+                key = apply_rotary_pos_emb(key, freqs, k_xpos_scale)
+
+        if self.attn_mask_enabled and mask is not None:
+            valid_sample_indices = mask.any(dim=1)
+            final_output = torch.zeros_like(query).to(query.device)
+            attn_mask = mask[valid_sample_indices]
+            query = query[valid_sample_indices]
+            key = key[valid_sample_indices]
+            value = value[valid_sample_indices]
+            attn_mask = attn_mask.unsqueeze(1).unsqueeze(1)
+            attn_mask = attn_mask.expand(valid_sample_indices.sum().item(), self.heads, query.shape[-2], key.shape[-2])
+        else:
+            attn_mask = None
+
+        x = F.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask, dropout_p=0.0, is_causal=False)
+        if self.attn_mask_enabled and mask is not None:
+            final_output[valid_sample_indices] = x
+            x = final_output
+
+        # [Batch, Heads, Time, HeadDimension] -> [Batch, Time, Dimension].
+        x = x.transpose(1, 2).reshape(batch_size, -1, self.heads * head_dim)
+        x = x.to(query.dtype)
+        x = self.to_out[0](x)
+        x = self.to_out[1](x)
+
+        if mask is not None:
+            mask = mask.unsqueeze(-1)
+            x = x.masked_fill(~mask, 0.0)
+
+        return x
+
+
+class DiTBlock(nn.Module):
+    def __init__(
+        self,
+        hidden_size,
+        num_heads,
+        mlp_ratio=4.0,
+        dropout=0.1,
+        qk_norm=None,
+        pe_attn_head=None,
+        attn_mask_enabled=True,
+        **kwargs,
+    ):
+        super().__init__()
+        self.norm1 = RMSNorm(hidden_size)
+        self.attn = Attention(
+            dim=hidden_size,
+            heads=num_heads,
+            dim_head=hidden_size // num_heads,
+            dropout=dropout,
+            qk_norm=qk_norm,
+            pe_attn_head=pe_attn_head,
+            attn_mask_enabled=attn_mask_enabled,
+        )
+        self.norm2 = RMSNorm(hidden_size)
+        self.mlp = FeedForward(dim=hidden_size, mult=mlp_ratio, dropout=dropout, approximate="tanh")
+
+    def forward(self, x, mask, rope):
+        x = x + self.attn(self.norm1(x), mask=mask, rope=rope)
+        x = x + self.mlp(self.norm2(x))
+        return x
+
+
+class FinalLayer(nn.Module):
+    def __init__(self, hidden_size, out_channels):
+        super().__init__()
+        self.norm_final = RMSNorm(hidden_size)
+        self.linear = nn.Linear(hidden_size, out_channels, bias=True)
+
+    def forward(self, x):
+        x = self.norm_final(x)
+        x = self.linear(x)
+        return x
+
+
+class CondEmbedder(nn.Module):
+    def __init__(self, input_feature_size, hidden_size, dropout_prob=None):
+        super().__init__()
+        del dropout_prob
+        self.cond_embedder = nn.Linear(input_feature_size, hidden_size)
+
+    def forward(self, llm_cond):
+        if llm_cond.ndim != 3:
+            raise ValueError(f"Expected conditioning rank-3 [Batch, Time, Dimension], got {tuple(llm_cond.shape)}")
+        return self.cond_embedder(llm_cond)
+
+
+def get_epss_timesteps(n, device, dtype):
+    """Return n + 1 sampling times in [0, 1]: a hand-picked 1/32-grid schedule when one exists, else uniform."""
+    schedules = {
+        5: [0, 2, 4, 8, 16, 32],
+        6: [0, 2, 4, 6, 8, 16, 32],
+        7: [0, 2, 4, 6, 8, 16, 24, 32],
+        10: [0, 2, 4, 6, 8, 12, 16, 20, 24, 28, 32],
+        12: [0, 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32],
+        16: [0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 32],
+    }
+    steps = schedules.get(n)
+    if steps:
+        return (1 / 32) * torch.tensor(steps, device=device, dtype=dtype)
+    return torch.linspace(0, 1, n + 1, device=device, dtype=dtype)
diff --git a/vllm_omni/model_executor/stage_input_processors/_chunk_transfer.py b/vllm_omni/model_executor/stage_input_processors/_chunk_transfer.py
index cfa5369b65e..f92afaf3070 100644
--- a/vllm_omni/model_executor/stage_input_processors/_chunk_transfer.py
+++ b/vllm_omni/model_executor/stage_input_processors/_chunk_transfer.py
@@ -36,16 +36,3 @@ def get_request_payload_store(transfer_manager: Any) -> dict[str, Any]:
         request_payload = {}
         transfer_manager.request_payload = request_payload
     return request_payload
-
-
-def get_initial_codec_chunk_frames(request: Any) -> int | None:
-    additional_information = getattr(request, "additional_information", None)
-    if additional_information is None or not hasattr(additional_information, "entries"):
-        return None
-    if "initial_codec_chunk_frames" not in additional_information.entries:
-        return None
-
-    entry = additional_information.entries["initial_codec_chunk_frames"]
-    if entry.list_data is None or len(entry.list_data) != 1:
-        return None
-    return int(entry.list_data[0])

From 51d03d4cad379b838edeb9a9d9d037594ace88fb Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Tue, 26 May 2026 22:29:47 +0530
Subject: [PATCH 36/54] style: apply pre-commit formatting fixes

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 tests/engine/test_async_omni_engine_input.py                    | 2 ++
 tests/worker/test_omni_gpu_model_runner.py                      | 2 ++
 .../model_executor/models/ming_flash_omni/talker_module.py      | 2 +-
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/tests/engine/test_async_omni_engine_input.py b/tests/engine/test_async_omni_engine_input.py
index 7e3f585e4e1..653df2ee19c 100644
--- a/tests/engine/test_async_omni_engine_input.py
+++ b/tests/engine/test_async_omni_engine_input.py
@@ -144,6 +144,8 @@ def test_build_add_request_message_uses_ingress_processed_prompt_for_additional_
     assert call_kwargs["parent_req"] is None
     assert call_kwargs["request_index"] == 0
     assert call_kwargs["queue"] is None
+
+
 class _FakeStageClient:
     stage_type = "llm"
     final_output = False
diff --git a/tests/worker/test_omni_gpu_model_runner.py b/tests/worker/test_omni_gpu_model_runner.py
index cbe388202c1..0cf05b2f344 100644
--- a/tests/worker/test_omni_gpu_model_runner.py
+++ b/tests/worker/test_omni_gpu_model_runner.py
@@ -392,6 +392,8 @@ def test_update_additional_information_uses_legacy_additional_information():
     info = runner.model_intermediate_buffer["r1"]
     assert info["new_field"] == 1
     assert info["cached_field"] == 3
+
+
 def test_maybe_run_batch_preprocess_calls_model_hook():
     runner = object.__new__(OmniGPUModelRunner)
     runner.model_intermediate_buffer = {"r1": {"text": ["hello"]}}
diff --git a/vllm_omni/model_executor/models/ming_flash_omni/talker_module.py b/vllm_omni/model_executor/models/ming_flash_omni/talker_module.py
index facb33a0f1c..9d07f644106 100644
--- a/vllm_omni/model_executor/models/ming_flash_omni/talker_module.py
+++ b/vllm_omni/model_executor/models/ming_flash_omni/talker_module.py
@@ -30,7 +30,7 @@
 from transformers import PreTrainedTokenizerBase, Qwen2Config, Qwen2Model, StaticCache
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from x_transformers.x_transformers import RotaryEmbedding, apply_rotary_pos_emb
+from x_transformers.x_transformers import RotaryEmbedding
 
 from vllm_omni.model_executor.layers.timestep_embedding import DiTTimestepEmbedding
 from vllm_omni.model_executor.models.ming_utils.dit import CondEmbedder, DiTBlock, FinalLayer, get_epss_timesteps

From a10ed8fbe9a0ef51ba908844e7b5ac1cd950a5ca Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Tue, 26 May 2026 23:13:16 +0530
Subject: [PATCH 37/54] test(ming-tts): update zh evaluation prompt in e2e
 tests

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 tests/e2e/online_serving/test_ming_tts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/e2e/online_serving/test_ming_tts.py b/tests/e2e/online_serving/test_ming_tts.py
index 29503325901..03982cc4a93 100644
--- a/tests/e2e/online_serving/test_ming_tts.py
+++ b/tests/e2e/online_serving/test_ming_tts.py
@@ -45,7 +45,7 @@
 
 def get_prompt(prompt_type="zh"):
     prompts = {
-        "zh": "我会一直在这里陪着你，直到你慢慢地沉入那个最温柔的梦里。",
+        "zh": "今天天气真不错，适合出去散散步。",
         "zh_short": "这款产品的名字，叫变态坑爹牛肉丸。",
     }
     return prompts.get(prompt_type, prompts["zh"])

From dfebed5bbd620d73079f975f9ec75cd937ba71fa Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Wed, 27 May 2026 00:35:44 +0530
Subject: [PATCH 38/54] refactor(ming-tts): centralize stop reason metadata

Import MING_STOP_REASON_KEY from patch_emission across the Ming runtime and derive the stage input processor code map from MING_STOP_REASON_CODES.

Remove the unused preprocess_input wrapper and the dead frame_hop parameter from pad_prompt_waveform while preserving the existing prompt padding behavior.

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 vllm_omni/model_executor/models/ming_tts/ming_tts.py  |  6 +-----
 .../models/ming_tts/ming_tts_audio_vae.py             |  2 +-
 .../model_executor/models/ming_tts/prompt_utils.py    |  5 +----
 .../model_executor/stage_input_processors/ming_tts.py | 11 +++++------
 4 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/vllm_omni/model_executor/models/ming_tts/ming_tts.py b/vllm_omni/model_executor/models/ming_tts/ming_tts.py
index a50fc2af39b..112f036703d 100644
--- a/vllm_omni/model_executor/models/ming_tts/ming_tts.py
+++ b/vllm_omni/model_executor/models/ming_tts/ming_tts.py
@@ -34,6 +34,7 @@
 from .loader import (
     load_weights,
 )
+from .patch_emission import MING_STOP_REASON_KEY
 from .prompt_utils import (
     _coerce_prompt_latents,
     _find_audio_placeholder_positions,
@@ -44,8 +45,6 @@
     coerce_speaker_embeddings,
 )
 
-MING_STOP_REASON_KEY = "ming_stop_reason"
-
 
 class _ModelSampleAdapter(nn.Module):
     def __init__(self, model: nn.Module):
@@ -120,9 +119,6 @@ def preprocess(self, input_ids: torch.Tensor, input_embeds: torch.Tensor | None,
             else self._decode_preprocess(input_ids, input_embeds, **info_dict)
         )
 
-    def preprocess_input(self, input_ids: torch.Tensor, input_embeds: torch.Tensor | None, **info_dict: Any):
-        return self.preprocess(input_ids, input_embeds, **info_dict)
-
     def postprocess(self, hidden_states: torch.Tensor, **info_dict: Any) -> dict[str, Any]:
         if self.model_stage != "llm" or hidden_states.numel() == 0:
             return {}
diff --git a/vllm_omni/model_executor/models/ming_tts/ming_tts_audio_vae.py b/vllm_omni/model_executor/models/ming_tts/ming_tts_audio_vae.py
index 17b900fcfe4..186a1366f24 100644
--- a/vllm_omni/model_executor/models/ming_tts/ming_tts_audio_vae.py
+++ b/vllm_omni/model_executor/models/ming_tts/ming_tts_audio_vae.py
@@ -16,10 +16,10 @@
 
 from .audio_tokenizer.modeling_audio_vae import AudioVAE
 from .config_ming_tts import KEY_CHUNK_ID, KEY_REQUEST_ID, MingTTSConfig
+from .patch_emission import MING_STOP_REASON_KEY
 
 logger = init_logger(__name__)
 
-MING_STOP_REASON_KEY = "ming_stop_reason"
 MING_FINAL_DECODE_STEP_KEY = "ming_final_decode_step"
 
 
diff --git a/vllm_omni/model_executor/models/ming_tts/prompt_utils.py b/vllm_omni/model_executor/models/ming_tts/prompt_utils.py
index 6f6863e47b4..5b66c04d9c8 100644
--- a/vllm_omni/model_executor/models/ming_tts/prompt_utils.py
+++ b/vllm_omni/model_executor/models/ming_tts/prompt_utils.py
@@ -79,10 +79,8 @@ def pad_prompt_waveform(
     *,
     patch_size: int = PATCH_SIZE,
     sample_rate: int = SAMPLE_RATE,
-    frame_hop: int = AUDIO_FRAME_HOP,
 ) -> torch.Tensor:
     tensor = coerce_prompt_waveform(waveform)
-    del frame_hop
     pad_align = int((float(sample_rate) / 12.5) * int(patch_size))
     new_len = ((int(tensor.shape[-1]) + pad_align - 1) // pad_align) * pad_align
     if new_len == int(tensor.shape[-1]):
@@ -177,7 +175,7 @@ def count_prompt_waveform_patches(
 ) -> int:
     if value is None:
         return 0
-    waveform = pad_prompt_waveform(value, patch_size=patch_size, frame_hop=frame_hop)
+    waveform = pad_prompt_waveform(value, patch_size=patch_size)
     frame_count = int(math.ceil(float(waveform.shape[-1]) / float(frame_hop)))
     latent_frames = int(math.ceil(float(frame_count) / float(vae_patch_size)))
     if latent_frames % int(patch_size) != 0:
@@ -429,7 +427,6 @@ def _encode_prompt_waveform_to_latents(wrapper: Any, waveform: Any, waveform_len
         waveform,
         patch_size=wrapper.ming_config.patch_size,
         sample_rate=wrapper.ming_config.sample_rate,
-        frame_hop=wrapper.ming_config.audio_frame_hop,
     )
     dev = next(encoder.encoder.parameters()).device
     waveform = waveform.to(device=dev, dtype=next(encoder.encoder.parameters()).dtype)
diff --git a/vllm_omni/model_executor/stage_input_processors/ming_tts.py b/vllm_omni/model_executor/stage_input_processors/ming_tts.py
index 00aa364c570..4ad4f9687c3 100644
--- a/vllm_omni/model_executor/stage_input_processors/ming_tts.py
+++ b/vllm_omni/model_executor/stage_input_processors/ming_tts.py
@@ -17,6 +17,10 @@
     LATENT_LEFT_CONTEXT,
     PATCH_SIZE,
 )
+from vllm_omni.model_executor.models.ming_tts.patch_emission import (
+    MING_STOP_REASON_CODES,
+    MING_STOP_REASON_KEY,
+)
 from vllm_omni.model_executor.stage_input_processors._chunk_transfer import (
     get_chunk_config_int,
     get_request_payload_store,
@@ -29,13 +33,8 @@
 MING_LATENT_SHAPE_KEY = "ming_latent_shape"
 MING_ESTIMATED_BYTES_KEY = "ming_estimated_bytes"
 MING_FINAL_FLUSH_KEY = "ming_final_flush"
-MING_STOP_REASON_KEY = "ming_stop_reason"
 MING_FINAL_DECODE_STEP_KEY = "ming_final_decode_step"
-MING_STOP_REASON_BY_CODE = {
-    0: "continue",
-    1: "stop_head",
-    2: "max_decode_steps",
-}
+MING_STOP_REASON_BY_CODE = {code: reason for reason, code in MING_STOP_REASON_CODES.items()}
 
 
 def _extract_last_patch(pooling_output: dict[str, Any] | None) -> torch.Tensor | None:

From e9519a0f3eb9394076ed65c9b4606ae1c3876b39 Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Wed, 27 May 2026 01:03:41 +0530
Subject: [PATCH 39/54] test(ming-tts): keep branch tests focused on e2e

Drop the non-Ming regression tests added while wiring async chunk metadata so the PR test surface matches the requested Ming TTS e2e coverage.

Remove the dead Ming audio_tokenizer ISTFT shim and import ISTFTHead directly from the shared ming_utils.audio_dsp module.

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 .../test_chunk_transfer_adapter.py            | 25 ---------
 tests/engine/test_async_omni_engine_input.py  | 54 -------------------
 tests/worker/test_omni_gpu_model_runner.py    | 22 --------
 .../models/ming_tts/audio_tokenizer/istft.py  |  3 --
 .../ming_tts/audio_tokenizer/vae_modules.py   |  3 +-
 5 files changed, 1 insertion(+), 106 deletions(-)
 delete mode 100644 vllm_omni/model_executor/models/ming_tts/audio_tokenizer/istft.py

diff --git a/tests/distributed/omni_connectors/test_chunk_transfer_adapter.py b/tests/distributed/omni_connectors/test_chunk_transfer_adapter.py
index a97ffe6b69b..9f10aea0c89 100644
--- a/tests/distributed/omni_connectors/test_chunk_transfer_adapter.py
+++ b/tests/distributed/omni_connectors/test_chunk_transfer_adapter.py
@@ -127,31 +127,6 @@ def test_load_poll(build_adapter):
     assert "req-1" not in adapter._pending_load_reqs
 
 
-def test_generation_load_preserves_payload_metadata(build_adapter):
-    adapter, connector = build_adapter(stage_id=1, model_mode="generation")
-    request = _req("req-1", RequestStatus.WAITING, external_req_id="external-1")
-    payload = {
-        "code_predictor_codes": [0],
-        "left_context_size": 3,
-        "ming_latent_patches": torch.ones((10, 4, 64), dtype=torch.float32),
-        "ming_request_id": "external-1",
-        "ming_chunk_id": 7,
-        "finished": torch.tensor(False),
-    }
-    connector.get.return_value = (payload, 16)
-
-    adapter._poll_single_request(request)
-
-    assert request.prompt_token_ids == [0]
-    assert request.additional_information["left_context_size"] == 3
-    assert request.additional_information["ming_request_id"] == "external-1"
-    assert request.additional_information["ming_chunk_id"] == 7
-    assert request.additional_information["ming_latent_patches"].shape == (10, 4, 64)
-    assert "code_predictor_codes" not in request.additional_information
-    assert "finished" not in request.additional_information
-    assert request.num_computed_tokens == 0
-
-
 def test_save_async(build_adapter):
     adapter, _ = build_adapter(stage_id=1)
     request = _req("req-1", RequestStatus.WAITING, external_req_id="external-1")
diff --git a/tests/engine/test_async_omni_engine_input.py b/tests/engine/test_async_omni_engine_input.py
index 653df2ee19c..a1ec472f613 100644
--- a/tests/engine/test_async_omni_engine_input.py
+++ b/tests/engine/test_async_omni_engine_input.py
@@ -1,7 +1,4 @@
-from unittest.mock import Mock
-
 import pytest
-import torch
 from pytest_mock import MockerFixture
 from vllm.sampling_params import SamplingParams
 from vllm.v1.engine import EngineCoreRequest
@@ -95,57 +92,6 @@ def test_build_add_request_message_with_resumable_streaming(mocker: MockerFixtur
     assert input_processor.process_inputs.call_args.kwargs["resumable"] is True
 
 
-def test_build_add_request_message_uses_ingress_processed_prompt_for_additional_information():
-    engine = object.__new__(AsyncOmniEngine)
-    params = SamplingParams(max_tokens=8)
-    engine.default_sampling_params_list = [params]
-    engine.stage_metadata = [{"stage_type": "llm"}]
-    engine.supported_tasks = ("speech",)
-
-    input_processor = Mock()
-    input_processor.process_inputs.return_value = _make_engine_core_request()
-    input_processor.input_preprocessor = Mock()
-    prompt_latents = torch.ones((4, 64), dtype=torch.float32)
-    processed_prompt = {
-        "prompt_token_ids": [1, 2, 3, 4],
-        "additional_information": {
-            "ming_prompt_latents": prompt_latents,
-            "global_request_id": ["req-1"],
-        },
-    }
-    input_processor.input_preprocessor.consume_last_processed_prompt.return_value = processed_prompt
-    engine.input_processor = input_processor
-
-    output_processor = Mock()
-    engine.output_processors = [output_processor]
-
-    raw_prompt = {
-        "prompt_token_ids": [1, 2, 3],
-        "additional_information": {},
-    }
-
-    msg = engine._build_add_request_message(
-        request_id="req-1",
-        prompt=raw_prompt,
-        sampling_params_list=[params],
-        final_stage_id=0,
-        arrival_time=0.0,
-    )
-
-    request = msg["prompt"]
-    assert isinstance(request, OmniEngineCoreRequest)
-    assert request.additional_information is not None
-    assert request.additional_information.entries["ming_prompt_latents"].tensor_shape == [4, 64]
-    input_processor.input_preprocessor.consume_last_processed_prompt.assert_called_once()
-    output_processor.add_request.assert_called_once()
-    call_kwargs = output_processor.add_request.call_args.kwargs
-    assert call_kwargs["request"] is request
-    assert call_kwargs["prompt"] is None
-    assert call_kwargs["parent_req"] is None
-    assert call_kwargs["request_index"] == 0
-    assert call_kwargs["queue"] is None
-
-
 class _FakeStageClient:
     stage_type = "llm"
     final_output = False
diff --git a/tests/worker/test_omni_gpu_model_runner.py b/tests/worker/test_omni_gpu_model_runner.py
index 0cf05b2f344..b834d8733b0 100644
--- a/tests/worker/test_omni_gpu_model_runner.py
+++ b/tests/worker/test_omni_gpu_model_runner.py
@@ -372,28 +372,6 @@ def test_update_intermediate_buffer_skips_unknown_req_id():
     assert "unknown_req" not in runner.model_intermediate_buffer
 
 
-def test_update_additional_information_uses_legacy_additional_information():
-    runner = _make_runner(req_ids=("r1",), hidden_size=4)
-
-    scheduler_output = SimpleNamespace(
-        scheduled_new_reqs=[
-            SimpleNamespace(
-                req_id="r1",
-                additional_information={"new_field": 1},
-            )
-        ],
-        scheduled_cached_reqs=SimpleNamespace(
-            additional_information={"r1": {"cached_field": 3}},
-        ),
-    )
-
-    OmniGPUModelRunner._update_additional_information(runner, scheduler_output)
-
-    info = runner.model_intermediate_buffer["r1"]
-    assert info["new_field"] == 1
-    assert info["cached_field"] == 3
-
-
 def test_maybe_run_batch_preprocess_calls_model_hook():
     runner = object.__new__(OmniGPUModelRunner)
     runner.model_intermediate_buffer = {"r1": {"text": ["hello"]}}
diff --git a/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/istft.py b/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/istft.py
deleted file mode 100644
index 982762338c2..00000000000
--- a/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/istft.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from vllm_omni.model_executor.models.ming_utils.audio_dsp import ISTFT, FourierHead, ISTFTHead
-
-__all__ = ["FourierHead", "ISTFT", "ISTFTHead"]
diff --git a/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/vae_modules.py b/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/vae_modules.py
index 28a4ce44aea..21631c252d9 100644
--- a/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/vae_modules.py
+++ b/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/vae_modules.py
@@ -6,10 +6,9 @@
 import torch.nn.functional as F
 from transformers import Qwen2Config, Qwen2Model
 
+from vllm_omni.model_executor.models.ming_utils.audio_dsp import ISTFTHead
 from vllm_omni.model_executor.models.ming_utils.audio_vae import StreamingLinearUpsample
 
-from .istft import ISTFTHead
-
 
 class Encoder(nn.Module):
     def __init__(self, encoder_args, input_dim=320, hop_size=320, latent_dim=64, patch_size=-1):

From 93bde5fa3d690992fa2fa1b84909477b8359529d Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Wed, 27 May 2026 02:29:34 +0530
Subject: [PATCH 40/54] refactor(ming-tts): remove redundant validation from
 FlowLoss.sample

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 .../models/ming_tts/flowloss_head.py          | 20 ++-----------------
 1 file changed, 2 insertions(+), 18 deletions(-)

diff --git a/vllm_omni/model_executor/models/ming_tts/flowloss_head.py b/vllm_omni/model_executor/models/ming_tts/flowloss_head.py
index 30a08b6e11f..2a2f845232a 100644
--- a/vllm_omni/model_executor/models/ming_tts/flowloss_head.py
+++ b/vllm_omni/model_executor/models/ming_tts/flowloss_head.py
@@ -18,25 +18,9 @@ def __init__(self, z_channels, llm_cond_dim, **kwargs):
         self.cfm = CFM(model=DiT(in_channels=z_channels, llm_cond_dim=llm_cond_dim, **kwargs))
 
     def sample(self, z, latent_history, cfg=2.0, patch_size=1, sigma=0.25, temperature=0):
-        if z.ndim != 3:
-            raise ValueError(f"Expected z rank-3 [Batch, Time, Dimension], got {tuple(z.shape)}")
-        if z.shape[1] != 1:
-            raise ValueError(f"Expected z time dim to be 1 for Ming dense decode, got {z.shape[1]}")
-        if latent_history.ndim != 3:
-            raise ValueError(
-                f"Expected latent_history rank-3 [Batch, Time, Dimension], got {tuple(latent_history.shape)}"
-            )
-        if z.shape[0] != latent_history.shape[0]:
-            raise ValueError(f"Batch mismatch: z batch={z.shape[0]} vs latent_history batch={latent_history.shape[0]}")
-        if patch_size <= 0:
-            raise ValueError(f"patch_size must be positive, got {patch_size}")
-        if not torch.isfinite(z).all():
-            raise RuntimeError("Non-finite conditioning z in FlowLoss.sample().")
-        if not torch.isfinite(latent_history).all():
-            raise RuntimeError("Non-finite latent_history in FlowLoss.sample().")
+        # z: [Batch, Time=1, Dimension]
+        # latent_history: [Batch, History_Time, Dimension]
         noise = torch.randn(z.shape[0], self.z_channels, patch_size, device=z.device)
-        if not torch.isfinite(noise).all():
-            raise RuntimeError("Non-finite noise in FlowLoss.sample().")
         noise = noise.to(dtype=z.dtype)
         out, _ = self.cfm.sample(
             noise=noise,

From c924b34492e0d4de69a7f9083f74bc72e5a74519 Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Wed, 27 May 2026 02:49:48 +0530
Subject: [PATCH 41/54] refactor(ming-tts): remove dead conditioning dropout
 arg

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 vllm_omni/model_executor/models/ming_tts/fm/dit.py | 3 +--
 vllm_omni/model_executor/models/ming_utils/dit.py  | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/vllm_omni/model_executor/models/ming_tts/fm/dit.py b/vllm_omni/model_executor/models/ming_tts/fm/dit.py
index 4cfe9867396..f5db42f931c 100644
--- a/vllm_omni/model_executor/models/ming_tts/fm/dit.py
+++ b/vllm_omni/model_executor/models/ming_tts/fm/dit.py
@@ -52,7 +52,6 @@ def __init__(
         num_heads=16,
         mlp_ratio=4.0,
         llm_cond_dim=896,
-        cfg_dropout_prob=0.1,
         **kwargs,
     ):
         super().__init__()
@@ -62,7 +61,7 @@ def __init__(
         self.num_heads = num_heads
         self.t_embedder = TimestepEmbedder(hidden_size)
         self.x_embedder = nn.Linear(in_channels, hidden_size)
-        self.c_embedder = CondEmbedder(llm_cond_dim, hidden_size, cfg_dropout_prob)
+        self.c_embedder = CondEmbedder(llm_cond_dim, hidden_size)
         self.hidden_size = hidden_size
         self.rotary_embed = RotaryEmbedding(hidden_size // num_heads)
         self.blocks = nn.ModuleList(
diff --git a/vllm_omni/model_executor/models/ming_utils/dit.py b/vllm_omni/model_executor/models/ming_utils/dit.py
index 0a3a153548f..0210e160568 100644
--- a/vllm_omni/model_executor/models/ming_utils/dit.py
+++ b/vllm_omni/model_executor/models/ming_utils/dit.py
@@ -193,9 +193,8 @@ def forward(self, x):
 
 
 class CondEmbedder(nn.Module):
-    def __init__(self, input_feature_size, hidden_size, dropout_prob=None):
+    def __init__(self, input_feature_size, hidden_size):
         super().__init__()
-        del dropout_prob
         self.cond_embedder = nn.Linear(input_feature_size, hidden_size)
 
     def forward(self, llm_cond):

From 0b2d070f09f63ccf57cdaed15067ef5cc238ad95 Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Wed, 27 May 2026 15:14:11 +0530
Subject: [PATCH 42/54] refactor(ming-tts): use runner request id

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 vllm_omni/model_executor/models/ming_tts/ming_tts.py | 10 +++++-----
 vllm_omni/worker/gpu_model_runner.py                 |  1 -
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/vllm_omni/model_executor/models/ming_tts/ming_tts.py b/vllm_omni/model_executor/models/ming_tts/ming_tts.py
index 112f036703d..3b495efb259 100644
--- a/vllm_omni/model_executor/models/ming_tts/ming_tts.py
+++ b/vllm_omni/model_executor/models/ming_tts/ming_tts.py
@@ -122,7 +122,7 @@ def preprocess(self, input_ids: torch.Tensor, input_embeds: torch.Tensor | None,
     def postprocess(self, hidden_states: torch.Tensor, **info_dict: Any) -> dict[str, Any]:
         if self.model_stage != "llm" or hidden_states.numel() == 0:
             return {}
-        req_id = info_dict.get(KEY_REQUEST_ID, info_dict.get("req_id"))
+        req_id = info_dict.get(KEY_REQUEST_ID, info_dict.get("request_id"))
         pending = self.model.pop_postprocess_update(req_id)
         if not pending or not isinstance(pending.get("ming_latent_patch"), torch.Tensor):
             return {}
@@ -145,7 +145,7 @@ def postprocess(self, hidden_states: torch.Tensor, **info_dict: Any) -> dict[str
     def _prefill_preprocess(self, input_ids: torch.Tensor, input_embeds: torch.Tensor, **info_dict: Any):
         if bool(info_dict.get(KEY_TEXT_MODE, False)):
             update: dict[str, Any] = {KEY_TEXT_MODE: True}
-            request_id = info_dict.get(KEY_REQUEST_ID, info_dict.get("req_id"))
+            request_id = info_dict.get(KEY_REQUEST_ID, info_dict.get("request_id"))
             if request_id is not None:
                 update[KEY_REQUEST_ID] = request_id
             if int(input_ids.shape[0]) > 1 and int(input_ids[-1].item()) == AUDIO_START_TOKEN_ID:
@@ -192,7 +192,7 @@ def _prefill_preprocess(self, input_ids: torch.Tensor, input_embeds: torch.Tenso
             if take > 0:
                 input_embeds[placeholder_pos[:take]] = prompt_embeds[:take].to(dtype=input_embeds.dtype)
 
-        request_id = info_dict.get(KEY_REQUEST_ID, info_dict.get("req_id"))
+        request_id = info_dict.get(KEY_REQUEST_ID, info_dict.get("request_id"))
         if request_id is not None:
             update[KEY_REQUEST_ID] = request_id
         _copy_runtime_controls(update, info_dict)
@@ -201,7 +201,7 @@ def _prefill_preprocess(self, input_ids: torch.Tensor, input_embeds: torch.Tenso
     def _decode_preprocess(self, input_ids: torch.Tensor, input_embeds: torch.Tensor, **info_dict: Any):
         if bool(info_dict.get(KEY_TEXT_MODE, False)):
             update: dict[str, Any] = {KEY_TEXT_MODE: True}
-            request_id = info_dict.get(KEY_REQUEST_ID, info_dict.get("req_id"))
+            request_id = info_dict.get(KEY_REQUEST_ID, info_dict.get("request_id"))
             if request_id is not None:
                 update[KEY_REQUEST_ID] = request_id
             return input_ids, input_embeds, update
@@ -233,7 +233,7 @@ def _decode_preprocess(self, input_ids: torch.Tensor, input_embeds: torch.Tensor
             if not torch.isfinite(input_embeds[0]).all():
                 raise RuntimeError("Non-finite backbone input_embeds after decode preprocess write.")
 
-        request_id = info_dict.get(KEY_REQUEST_ID, info_dict.get("req_id"))
+        request_id = info_dict.get(KEY_REQUEST_ID, info_dict.get("request_id"))
         if request_id is not None:
             update[KEY_REQUEST_ID] = request_id
         _copy_runtime_controls(update, info_dict)
diff --git a/vllm_omni/worker/gpu_model_runner.py b/vllm_omni/worker/gpu_model_runner.py
index 5c1e1d4a737..03619a33d9c 100644
--- a/vllm_omni/worker/gpu_model_runner.py
+++ b/vllm_omni/worker/gpu_model_runner.py
@@ -1508,7 +1508,6 @@ def flush_decode_batch() -> None:
             for req_index, req_id in enumerate(self.input_batch.req_ids):
                 req_infos = self.model_intermediate_buffer.get(req_id, {})
                 req_infos = dict(req_infos) if isinstance(req_infos, dict) else {}
-                req_infos.setdefault("req_id", req_id)
 
                 # mimo-audio check
                 req_state = self.requests.get(req_id)

From 4d923c708099939178e932ff153c63749b430fd1 Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Wed, 27 May 2026 16:44:39 +0530
Subject: [PATCH 43/54] examples: consolidate Ming TTS examples

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 .../examples/offline_inference/ming_tts.md    | 20 ++++----
 .../examples/online_serving/ming_tts.md       | 12 ++---
 .../text_to_speech/README.md                  | 41 +++++++++++++++++
 .../{ => text_to_speech}/ming_tts/README.md   | 14 +++---
 .../{ => text_to_speech}/ming_tts/_runner.py  |  0
 .../{ => text_to_speech}/ming_tts/cases.yaml  |  0
 .../{ => text_to_speech}/ming_tts/end2end.py  |  0
 .../online_serving/text_to_speech/README.md   | 46 +++++++++++++++++++
 .../{ => text_to_speech}/ming_tts/README.md   |  4 +-
 .../ming_tts/openai_speech_client.py          |  0
 .../{ => text_to_speech}/ming_tts/run_curl.sh |  0
 .../ming_tts/run_server.sh                    |  0
 12 files changed, 112 insertions(+), 25 deletions(-)
 rename examples/offline_inference/{ => text_to_speech}/ming_tts/README.md (94%)
 rename examples/offline_inference/{ => text_to_speech}/ming_tts/_runner.py (100%)
 rename examples/offline_inference/{ => text_to_speech}/ming_tts/cases.yaml (100%)
 rename examples/offline_inference/{ => text_to_speech}/ming_tts/end2end.py (100%)
 rename examples/online_serving/{ => text_to_speech}/ming_tts/README.md (99%)
 rename examples/online_serving/{ => text_to_speech}/ming_tts/openai_speech_client.py (100%)
 rename examples/online_serving/{ => text_to_speech}/ming_tts/run_curl.sh (100%)
 rename examples/online_serving/{ => text_to_speech}/ming_tts/run_server.sh (100%)

diff --git a/docs/user_guide/examples/offline_inference/ming_tts.md b/docs/user_guide/examples/offline_inference/ming_tts.md
index 6a51c7965f6..6ff31258cba 100644
--- a/docs/user_guide/examples/offline_inference/ming_tts.md
+++ b/docs/user_guide/examples/offline_inference/ming_tts.md
@@ -1,6 +1,6 @@
 # Ming-omni-tts
 
-Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/ming_tts>.
+Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/text_to_speech/ming_tts>.
 
 This directory contains an offline Ming example that uses the in-repo Ming prompt builder directly. It covers the broader upstream dense 0.5B surface: style, IP, music-only generation, TTA, emotion, dialect, zero-shot clone, podcast, speech+bgm, and speech+sound.
 
@@ -9,7 +9,7 @@ This directory contains an offline Ming example that uses the in-repo Ming promp
 Run a zero-speaker style case:
 
 ```bash
-python examples/offline_inference/ming_tts/end2end.py \
+python examples/offline_inference/text_to_speech/ming_tts/end2end.py \
     --case style \
     --deploy-config vllm_omni/deploy/ming_tts.yaml \
     --enforce-eager
@@ -18,7 +18,7 @@ python examples/offline_inference/ming_tts/end2end.py \
 Run emotion-controlled speech:
 
 ```bash
-python examples/offline_inference/ming_tts/end2end.py \
+python examples/offline_inference/text_to_speech/ming_tts/end2end.py \
     --case emotion \
     --ref-audio /path/to/emotion_prompt.wav \
     --deploy-config vllm_omni/deploy/ming_tts.yaml \
@@ -28,7 +28,7 @@ python examples/offline_inference/ming_tts/end2end.py \
 Run zero-shot cloning with a transcript:
 
 ```bash
-python examples/offline_inference/ming_tts/end2end.py \
+python examples/offline_inference/text_to_speech/ming_tts/end2end.py \
     --case zero_shot \
     --ref-audio /path/to/reference.wav \
     --ref-text "在此奉劝大家别乱打美白针。" \
@@ -39,7 +39,7 @@ python examples/offline_inference/ming_tts/end2end.py \
 Run podcast generation:
 
 ```bash
-python examples/offline_inference/ming_tts/end2end.py \
+python examples/offline_inference/text_to_speech/ming_tts/end2end.py \
     --case podcast \
     --ref-audio-paths /path/to/CTS-CN-F2F-2019-11-11-423-012-A.wav /path/to/CTS-CN-F2F-2019-11-11-423-012-B.wav \
     --deploy-config vllm_omni/deploy/ming_tts.yaml \
@@ -49,7 +49,7 @@ python examples/offline_inference/ming_tts/end2end.py \
 Run text-to-audio event generation:
 
 ```bash
-python examples/offline_inference/ming_tts/end2end.py \
+python examples/offline_inference/text_to_speech/ming_tts/end2end.py \
     --case tta \
     --deploy-config vllm_omni/deploy/ming_tts.yaml \
     --enforce-eager
@@ -58,7 +58,7 @@ python examples/offline_inference/ming_tts/end2end.py \
 Run with stats and a manifest:
 
 ```bash
-python examples/offline_inference/ming_tts/end2end.py \
+python examples/offline_inference/text_to_speech/ming_tts/end2end.py \
     --case style \
     --deploy-config vllm_omni/deploy/ming_tts.yaml \
     --enforce-eager \
@@ -86,7 +86,7 @@ python examples/offline_inference/ming_tts/end2end.py \
 Use async_chunk streaming with `AsyncOmni`:
 
 ```bash
-python examples/offline_inference/ming_tts/end2end.py \
+python examples/offline_inference/text_to_speech/ming_tts/end2end.py \
     --case basic \
     --ref-audio /path/to/10002287-00000095.wav \
     --streaming \
@@ -132,9 +132,9 @@ The offline example also exposes vLLM-Omni runtime/reporting controls such as:
 
 ??? abstract "README.md"
     ``````md
-    --8<-- "examples/offline_inference/ming_tts/README.md"
+    --8<-- "examples/offline_inference/text_to_speech/ming_tts/README.md"
     ``````
 ??? abstract "end2end.py"
     ``````py
-    --8<-- "examples/offline_inference/ming_tts/end2end.py"
+    --8<-- "examples/offline_inference/text_to_speech/ming_tts/end2end.py"
     ``````
diff --git a/docs/user_guide/examples/online_serving/ming_tts.md b/docs/user_guide/examples/online_serving/ming_tts.md
index a011bddbd58..119b5a4e7ea 100644
--- a/docs/user_guide/examples/online_serving/ming_tts.md
+++ b/docs/user_guide/examples/online_serving/ming_tts.md
@@ -1,6 +1,6 @@
 # Ming-omni-tts
 
-Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/online_serving/ming_tts>.
+Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/online_serving/text_to_speech/ming_tts>.
 
 This example shows how to serve Ming through the OpenAI-compatible `/v1/audio/speech` endpoint. The server builds Ming prompts directly with the in-repo prompt builder, so online requests support Ming-specific structured controls instead of the Qwen placeholder path.
 
@@ -21,7 +21,7 @@ vllm-omni serve inclusionAI/Ming-omni-tts-0.5B \
 Or:
 
 ```bash
-cd examples/online_serving/ming_tts
+cd examples/online_serving/text_to_speech/ming_tts
 ./run_server.sh
 ```
 
@@ -170,17 +170,17 @@ For Ming, the generic OpenAI request fields map to Ming controls like this:
 
 ??? abstract "README.md"
     ``````md
-    --8<-- "examples/online_serving/ming_tts/README.md"
+    --8<-- "examples/online_serving/text_to_speech/ming_tts/README.md"
     ``````
 ??? abstract "run_server.sh"
     ``````sh
-    --8<-- "examples/online_serving/ming_tts/run_server.sh"
+    --8<-- "examples/online_serving/text_to_speech/ming_tts/run_server.sh"
     ``````
 ??? abstract "openai_speech_client.py"
     ``````py
-    --8<-- "examples/online_serving/ming_tts/openai_speech_client.py"
+    --8<-- "examples/online_serving/text_to_speech/ming_tts/openai_speech_client.py"
     ``````
 ??? abstract "run_curl.sh"
     ``````sh
-    --8<-- "examples/online_serving/ming_tts/run_curl.sh"
+    --8<-- "examples/online_serving/text_to_speech/ming_tts/run_curl.sh"
     ``````
diff --git a/examples/offline_inference/text_to_speech/README.md b/examples/offline_inference/text_to_speech/README.md
index 0bf0efc4ede..eb515340dbd 100644
--- a/examples/offline_inference/text_to_speech/README.md
+++ b/examples/offline_inference/text_to_speech/README.md
@@ -17,6 +17,7 @@ list of supported architectures across all modalities, see
 | CosyVoice3 | `FunAudioLLM/Fun-CosyVoice3-0.5B-2512` | 2 (talker + code2wav) | ✓ | ✓ | — | 24 kHz |
 | Fish Speech S2 Pro | `fishaudio/s2-pro` | dual-AR | ✓ | ✓ | — | 44.1 kHz |
 | GLM-TTS | `zai-org/GLM-TTS` | 2 (AR + DiT) | ✓ (required) | ✓ | — | 24 kHz |
+| Ming-omni-tts | `inclusionAI/Ming-omni-tts-0.5B` | 2 (AR + audio VAE) | ✓ | ✓ | style / IP / dialect / TTA / podcast | 44.1 kHz |
 | Ming-flash-omni-TTS | `Jonathan1909/Ming-flash-omni-2.0` | single (talker only) | — (caption-controlled) | — | style / IP / basic captions | 44.1 kHz |
 | MOSS-TTS-Nano | `OpenMOSS-Team/MOSS-TTS-Nano` | single (AR + codec) | ✓ (required) | ✓ | voice_clone, continuation | 48 kHz |
 | OmniVoice | `k2-fsa/OmniVoice` | 2 (gen + dec) | ✓ | — | voice design, language hint | 24 kHz |
@@ -159,6 +160,46 @@ Streaming requires `async_chunk: true` in the stage config.
 
 ---
 
+## Ming-omni-tts
+
+Dense 0.5B two-stage TTS pipeline (`AR + flow` + audio VAE) at 44.1 kHz. The example covers style, IP voice, music-only generation, text-to-audio events, emotion, dialect, zero-shot cloning, podcast, speech+BGM, and speech+environment-sound cases.
+
+### Quick start
+```bash
+python examples/offline_inference/text_to_speech/ming_tts/end2end.py \
+    --case style \
+    --deploy-config vllm_omni/deploy/ming_tts.yaml \
+    --enforce-eager
+```
+
+### Voice cloning
+```bash
+python examples/offline_inference/text_to_speech/ming_tts/end2end.py \
+    --case zero_shot \
+    --ref-audio /path/to/reference.wav \
+    --ref-text "在此奉劝大家别乱打美白针。" \
+    --deploy-config vllm_omni/deploy/ming_tts.yaml \
+    --enforce-eager
+```
+
+### Streaming
+```bash
+python examples/offline_inference/text_to_speech/ming_tts/end2end.py \
+    --case basic \
+    --ref-audio /path/to/reference.wav \
+    --streaming \
+    --deploy-config vllm_omni/deploy/ming_tts.yaml \
+    --enforce-eager
+```
+
+### Notes
+- `style`, `ip`, `bgm`, and `tta` do not require reference audio.
+- Reference-audio cases use `--ref-audio`; `zero_shot` also requires `--ref-text`.
+- `podcast` uses multiple references via `--ref-audio-paths`.
+- Full case details live in [`ming_tts/README.md`](ming_tts/README.md).
+
+---
+
 ## Ming-flash-omni-TTS
 
 Standalone talker-only deployment of Ming-flash-omni-2.0 at 44.1 kHz. Voice is controlled through caption fields (`风格` / `IP` / `语速`/`基频`/`音量`) rather than reference audio.
diff --git a/examples/offline_inference/ming_tts/README.md b/examples/offline_inference/text_to_speech/ming_tts/README.md
similarity index 94%
rename from examples/offline_inference/ming_tts/README.md
rename to examples/offline_inference/text_to_speech/ming_tts/README.md
index 0c296620acd..a416c5c5ad0 100644
--- a/examples/offline_inference/ming_tts/README.md
+++ b/examples/offline_inference/text_to_speech/ming_tts/README.md
@@ -56,7 +56,7 @@ These cases cover the upstream dense 0.5B cookbook surface that maps cleanly ont
 Run the zero-speaker style example:
 
 ```bash
-python examples/offline_inference/ming_tts/end2end.py \
+python examples/offline_inference/text_to_speech/ming_tts/end2end.py \
     --case style \
     --deploy-config vllm_omni/deploy/ming_tts.yaml \
     --enforce-eager
@@ -65,7 +65,7 @@ python examples/offline_inference/ming_tts/end2end.py \
 Run zero-shot cloning with a transcript:
 
 ```bash
-python examples/offline_inference/ming_tts/end2end.py \
+python examples/offline_inference/text_to_speech/ming_tts/end2end.py \
     --case zero_shot \
     --ref-audio /path/to/10002287-00000094.wav \
     --ref-text "在此奉劝大家别乱打美白针。" \
@@ -76,7 +76,7 @@ python examples/offline_inference/ming_tts/end2end.py \
 Run emotion-controlled speech:
 
 ```bash
-python examples/offline_inference/ming_tts/end2end.py \
+python examples/offline_inference/text_to_speech/ming_tts/end2end.py \
     --case emotion \
     --ref-audio /path/to/emotion_prompt.wav \
     --deploy-config vllm_omni/deploy/ming_tts.yaml \
@@ -86,7 +86,7 @@ python examples/offline_inference/ming_tts/end2end.py \
 Run podcast generation with two reference clips:
 
 ```bash
-python examples/offline_inference/ming_tts/end2end.py \
+python examples/offline_inference/text_to_speech/ming_tts/end2end.py \
     --case podcast \
     --ref-audio-paths /path/to/CTS-CN-F2F-2019-11-11-423-012-A.wav /path/to/CTS-CN-F2F-2019-11-11-423-012-B.wav \
     --deploy-config vllm_omni/deploy/ming_tts.yaml \
@@ -106,7 +106,7 @@ where the JSON is a list of speaker embeddings, one 192-d vector per speaker.
 Run text-to-audio event generation:
 
 ```bash
-python examples/offline_inference/ming_tts/end2end.py \
+python examples/offline_inference/text_to_speech/ming_tts/end2end.py \
     --case tta \
     --deploy-config vllm_omni/deploy/ming_tts.yaml \
     --enforce-eager
@@ -115,7 +115,7 @@ python examples/offline_inference/ming_tts/end2end.py \
 Use async_chunk streaming:
 
 ```bash
-python examples/offline_inference/ming_tts/end2end.py \
+python examples/offline_inference/text_to_speech/ming_tts/end2end.py \
     --case basic \
     --ref-audio /path/to/10002287-00000095.wav \
     --streaming \
@@ -130,7 +130,7 @@ supports one prompt per process invocation; use blocking mode for
 Collect runtime stats and a manifest:
 
 ```bash
-python examples/offline_inference/ming_tts/end2end.py \
+python examples/offline_inference/text_to_speech/ming_tts/end2end.py \
     --case style \
     --deploy-config vllm_omni/deploy/ming_tts.yaml \
     --enforce-eager \
diff --git a/examples/offline_inference/ming_tts/_runner.py b/examples/offline_inference/text_to_speech/ming_tts/_runner.py
similarity index 100%
rename from examples/offline_inference/ming_tts/_runner.py
rename to examples/offline_inference/text_to_speech/ming_tts/_runner.py
diff --git a/examples/offline_inference/ming_tts/cases.yaml b/examples/offline_inference/text_to_speech/ming_tts/cases.yaml
similarity index 100%
rename from examples/offline_inference/ming_tts/cases.yaml
rename to examples/offline_inference/text_to_speech/ming_tts/cases.yaml
diff --git a/examples/offline_inference/ming_tts/end2end.py b/examples/offline_inference/text_to_speech/ming_tts/end2end.py
similarity index 100%
rename from examples/offline_inference/ming_tts/end2end.py
rename to examples/offline_inference/text_to_speech/ming_tts/end2end.py
diff --git a/examples/online_serving/text_to_speech/README.md b/examples/online_serving/text_to_speech/README.md
index 922156d4c1a..110bf3d1d55 100644
--- a/examples/online_serving/text_to_speech/README.md
+++ b/examples/online_serving/text_to_speech/README.md
@@ -16,6 +16,7 @@ For the full list of supported architectures across all modalities, see
 |---|---|---|---|---|---|
 | Fish Speech S2 Pro | `fishaudio/s2-pro` | ✓ (`ref_audio`+`ref_text`) | ✓ (PCM stream) | — | ✓ |
 | GLM-TTS | `zai-org/GLM-TTS` | ✓ (`ref_audio`+`ref_text`, required) | ✓ (PCM stream) | — | ✓ |
+| Ming-omni-tts | `inclusionAI/Ming-omni-tts-0.5B` | ✓ (`ref_audio` / `speaker_embedding`) | ✓ (PCM stream) | IP labels + structured `instructions` | — |
 | Ming-flash-omni-TTS | `Jonathan1909/Ming-flash-omni-2.0` | — (caption-controlled) | — | caption fields (`instructions`) | — |
 | MOSS-TTS-Nano | `OpenMOSS-Team/MOSS-TTS-Nano` | ✓ (`ref_audio` required) | ✓ (PCM stream) | — | ✓ |
 | OmniVoice | `k2-fsa/OmniVoice` | (offline only) | — | — | — |
@@ -182,6 +183,51 @@ python fish_speech/gradio_demo.py --api-base http://localhost:8091  # if server
 
 ---
 
+## Ming-omni-tts
+
+Dense 0.5B two-stage TTS served through `/v1/audio/speech`. Ming uses the standard speech endpoint plus structured controls in `instructions`, `voice`, `language`, `ref_audio`, `ref_text`, and `speaker_embedding`.
+
+### Launch
+```bash
+bash examples/online_serving/text_to_speech/ming_tts/run_server.sh
+```
+Equivalent manual command:
+```bash
+vllm-omni serve inclusionAI/Ming-omni-tts-0.5B \
+    --deploy-config vllm_omni/deploy/ming_tts.yaml \
+    --host 0.0.0.0 --port 8091 \
+    --enforce-eager --omni
+```
+
+### Sending requests
+```bash
+python examples/online_serving/text_to_speech/ming_tts/openai_speech_client.py \
+    --text "你好，这是 Ming 在线语音合成测试。"
+```
+
+Structured dialect control:
+```bash
+python examples/online_serving/text_to_speech/ming_tts/openai_speech_client.py \
+    --text "我觉得社会企业同个人都有责任" \
+    --instruction-json '{"方言":"广粤话"}'
+```
+
+Zero-shot cloning:
+```bash
+python examples/online_serving/text_to_speech/ming_tts/openai_speech_client.py \
+    --task-type Base \
+    --text "我们的愿景是构建未来服务业的数字化基础设施。" \
+    --ref-audio /path/to/reference.wav \
+    --ref-text "在此奉劝大家别乱打美白针。"
+```
+
+### Notes
+- `run_curl.sh` keeps a small sanity subset; use the Ming README for the broader request cookbook.
+- Online serving is speech-shaped today; music-only `bgm` and text-to-audio `tta` remain offline examples.
+- Full request details live in [`ming_tts/README.md`](ming_tts/README.md).
+
+---
+
 ## Ming-flash-omni-TTS
 
 Standalone talker-only deployment of Ming-flash-omni-2.0. Voice is controlled through caption text passed via `instructions`.
diff --git a/examples/online_serving/ming_tts/README.md b/examples/online_serving/text_to_speech/ming_tts/README.md
similarity index 99%
rename from examples/online_serving/ming_tts/README.md
rename to examples/online_serving/text_to_speech/ming_tts/README.md
index 2717d62ca87..f75d737eda2 100644
--- a/examples/online_serving/ming_tts/README.md
+++ b/examples/online_serving/text_to_speech/ming_tts/README.md
@@ -23,7 +23,7 @@ vllm-omni serve inclusionAI/Ming-omni-tts-0.5B \
 Or use the convenience script:
 
 ```bash
-cd examples/online_serving/ming_tts
+cd examples/online_serving/text_to_speech/ming_tts
 ./run_server.sh
 ```
 
@@ -39,7 +39,7 @@ The recommended online-serving path is eager async-chunk mode through
 The canonical Ming online client is:
 
 ```bash
-cd examples/online_serving/ming_tts
+cd examples/online_serving/text_to_speech/ming_tts
 python openai_speech_client.py --text "你好，世界"
 ```
 
diff --git a/examples/online_serving/ming_tts/openai_speech_client.py b/examples/online_serving/text_to_speech/ming_tts/openai_speech_client.py
similarity index 100%
rename from examples/online_serving/ming_tts/openai_speech_client.py
rename to examples/online_serving/text_to_speech/ming_tts/openai_speech_client.py
diff --git a/examples/online_serving/ming_tts/run_curl.sh b/examples/online_serving/text_to_speech/ming_tts/run_curl.sh
similarity index 100%
rename from examples/online_serving/ming_tts/run_curl.sh
rename to examples/online_serving/text_to_speech/ming_tts/run_curl.sh
diff --git a/examples/online_serving/ming_tts/run_server.sh b/examples/online_serving/text_to_speech/ming_tts/run_server.sh
similarity index 100%
rename from examples/online_serving/ming_tts/run_server.sh
rename to examples/online_serving/text_to_speech/ming_tts/run_server.sh

From da94f8a1ef0f8b978bd9eb93adab19ebb863cc74 Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Wed, 27 May 2026 23:52:23 +0530
Subject: [PATCH 44/54] Add Ming-omni-tts 0.5b Dense recipe

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 recipes/README.md                         |   1 +
 recipes/inclusionAI/Ming-omni-tts-0.5B.md | 246 ++++++++++++++++++++++
 2 files changed, 247 insertions(+)
 create mode 100644 recipes/inclusionAI/Ming-omni-tts-0.5B.md

diff --git a/recipes/README.md b/recipes/README.md
index d2e1eeff3b0..e0ddea83337 100644
--- a/recipes/README.md
+++ b/recipes/README.md
@@ -31,6 +31,7 @@ recipes/
 | [`Baidu/ERNIE-Image.md`](./Baidu/ERNIE-Image.md) | Text-to-image online serving (ERNIE-Image 8B) | 1x or 2x RTX 4090 24GB |
 | [`fishaudio/Fish-Speech-S2-Pro.md`](./fishaudio/Fish-Speech-S2-Pro.md) | Online serving for TTS | 1x A800 80GB |
 | [`inclusionAI/Ming-flash-omni-2.0.md`](./inclusionAI/Ming-flash-omni-2.0.md) | Online serving for multimodal chat + standalone TTS | 4x H100 / 1x H100 80GB |
+| [`inclusionAI/Ming-omni-tts-0.5B.md`](./inclusionAI/Ming-omni-tts-0.5B.md) | Offline + online dense Ming TTS/audio generation | 1x A100 40GB |
 | [`LTX/LTX-2.md`](./LTX/LTX-2.md) | Text-to-video and image-to-video serving | 1x H200 141GB |
 | [`LTX/LTX-2.3.md`](./LTX/LTX-2.3.md) | Text-to-video with audio generation (22B) | 1x GPU (96GB VRAM) |
 | [`Qwen/Qwen-Image.md`](./Qwen/Qwen-Image.md) | Text-to-image serving with step-wise continuous batching replay and ModelOpt mixed FP8/NVFP4 | 1x A100 80GB / 2x B200 |
diff --git a/recipes/inclusionAI/Ming-omni-tts-0.5B.md b/recipes/inclusionAI/Ming-omni-tts-0.5B.md
new file mode 100644
index 00000000000..1c1422bd245
--- /dev/null
+++ b/recipes/inclusionAI/Ming-omni-tts-0.5B.md
@@ -0,0 +1,246 @@
+# Ming-omni-tts 0.5B
+
+> Offline and online TTS/audio generation with the dense Ming two-stage AR + Flow/VAE pipeline
+
+## Summary
+
+- Vendor: inclusionAI
+- Model: `inclusionAI/Ming-omni-tts-0.5B`
+- Task: Text-to-speech, voice/style control, zero-shot cloning, podcast-style multi-speaker generation, and text-to-audio/music cases
+- Mode: Offline `Omni` / `AsyncOmni` and online OpenAI-compatible `/v1/audio/speech`
+- Maintainer: Community
+
+## When to use this recipe
+
+Use this recipe when you want to run the dense 0.5B Ming TTS model through
+vLLM-Omni's two-stage pipeline:
+
+- Stage 0: Qwen2-based autoregressive backbone with inline Ming flow controls
+- Stage 1: audio VAE decode to mono 44.1 kHz waveform
+
+The verified flow covers blocking offline generation, async-chunk offline
+generation, and online serving for speech cases. The music (`bgm`) and
+text-to-audio (`tta`) cases are covered by offline inference only; the online
+`/v1/audio/speech` endpoint does not yet expose the matching `prompt_mode` values.
+
+## References
+
+- Hugging Face model:
+  [`inclusionAI/Ming-omni-tts-0.5B`](https://huggingface.co/inclusionAI/Ming-omni-tts-0.5B)
+- Offline example:
+  [`examples/offline_inference/text_to_speech/ming_tts/`](../../examples/offline_inference/text_to_speech/ming_tts/)
+- Online example:
+  [`examples/online_serving/text_to_speech/ming_tts/`](../../examples/online_serving/text_to_speech/ming_tts/)
+- Deploy config:
+  [`vllm_omni/deploy/ming_tts.yaml`](../../vllm_omni/deploy/ming_tts.yaml)
+
+## Installing vLLM-Omni
+
+Use a fresh Python environment. The verified run used vLLM `0.21.0` with the
+CUDA 13 PyTorch stack.
+
+```bash
+export VLLM_VERSION="0.21.0"
+
+uv venv
+source .venv/bin/activate
+uv pip install vllm==$VLLM_VERSION --torch-backend=cu130
+uv pip install -e .
+uv pip install soundfile pyyaml openai aiohttp huggingface_hub
+```
+
+## Hardware Support
+
+## GPU
+
+### 1x A100 40GB
+
+#### Environment
+
+- OS: Linux
+- Python: 3.12.13
+- GPU: NVIDIA A100-SXM4-40GB, 40960 MiB
+- Driver: 580.82.07
+- PyTorch: `2.11.0+cu130`
+- CUDA runtime reported by PyTorch: 13.0
+- vLLM version: 0.21.0
+- vLLM-Omni branch / commit: `feat/ming-omni-tts-dense` / `4d923c708099939178e932ff153c63749b430fd1`
+- Deploy config: `vllm_omni/deploy/ming_tts.yaml`
+
+#### Offline Command
+
+Run a single blocking case:
+
+```bash
+python examples/offline_inference/text_to_speech/ming_tts/end2end.py \
+  --model inclusionAI/Ming-omni-tts-0.5B \
+  --case style \
+  --deploy-config vllm_omni/deploy/ming_tts.yaml \
+  --enforce-eager
+```
+
+Run a streaming async-chunk case:
+
+```bash
+python examples/offline_inference/text_to_speech/ming_tts/end2end.py \
+  --model inclusionAI/Ming-omni-tts-0.5B \
+  --case basic \
+  --ref-audio /path/to/10002287-00000095.wav \
+  --streaming \
+  --deploy-config vllm_omni/deploy/ming_tts.yaml \
+  --enforce-eager
+```
+
+The offline example includes 11 built-in cases: `style`, `ip`, `bgm`, `tta`,
+`emotion`, `basic`, `dialect`, `zero_shot`, `podcast`, `speech_bgm`, and
+`speech_sound`.
+
+#### Online Command
+
+Start the OpenAI-compatible speech server:
+
+```bash
+vllm-omni serve inclusionAI/Ming-omni-tts-0.5B \
+  --deploy-config vllm_omni/deploy/ming_tts.yaml \
+  --host 127.0.0.1 \
+  --port 8091 \
+  --enforce-eager \
+  --omni \
+  --stage-init-timeout 600 \
+  --init-timeout 900 \
+  --log-stats
+```
+
+Or use the bundled helper:
+
+```bash
+cd examples/online_serving/text_to_speech/ming_tts
+./run_server.sh
+```
+
+#### Verification
+
+Basic speech:
+
+```bash
+curl -X POST http://127.0.0.1:8091/v1/audio/speech \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer EMPTY" \
+  -d '{
+    "model": "inclusionAI/Ming-omni-tts-0.5B",
+    "input": "你好，这是 Ming 在线语音合成测试。",
+    "response_format": "wav",
+    "max_new_tokens": 200
+  }' \
+  --output ming_basic.wav
+```
+
+Style-conditioned speech:
+
+```bash
+curl -X POST http://127.0.0.1:8091/v1/audio/speech \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer EMPTY" \
+  -d '{
+    "model": "inclusionAI/Ming-omni-tts-0.5B",
+    "input": "我会一直在这里陪着你，直到你慢慢、慢慢地沉入那个最温柔的梦里……好吗？",
+    "instructions": "{\"风格\":\"ASMR耳语，轻柔普通话，音量极低，语速极慢\"}",
+    "response_format": "wav",
+    "max_new_tokens": 200
+  }' \
+  --output ming_style.wav
+```
+
+Zero-shot cloning with reference audio and transcript:
+
+```bash
+curl -X POST http://127.0.0.1:8091/v1/audio/speech \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer EMPTY" \
+  -d '{
+    "model": "inclusionAI/Ming-omni-tts-0.5B",
+    "input": "我们的愿景是构建未来服务业的数字化基础设施。",
+    "task_type": "Base",
+    "ref_audio": "data:audio/wav;base64,<BASE64_WAV>",
+    "ref_text": "在此奉劝大家别乱打美白针。",
+    "response_format": "wav",
+    "max_new_tokens": 200
+  }' \
+  --output ming_zero_shot.wav
+```
+
+Streaming PCM:
+
+```bash
+curl -X POST http://127.0.0.1:8091/v1/audio/speech \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer EMPTY" \
+  -d '{
+    "model": "inclusionAI/Ming-omni-tts-0.5B",
+    "input": "你好，这是 Ming 在线流式语音合成测试。",
+    "instructions": "平静，普通话",
+    "response_format": "pcm",
+    "stream": true,
+    "max_new_tokens": 200
+  }' \
+  --output ming_streaming.pcm
+```
+
+## Key Parameters
+
+| Parameter | Scope | Description |
+|---|---|---|
+| `--deploy-config` | Offline / online | Use `vllm_omni/deploy/ming_tts.yaml` for the two-stage Ming pipeline |
+| `--enforce-eager` | Offline / online | Runs PyTorch in eager mode (no CUDA graph capture); recommended and used by the verified run |
+| `--case` | Offline | Built-in case name from `cases.yaml` |
+| `--streaming` | Offline | Uses `AsyncOmni` and async-chunk transfer |
+| `voice` | Online | Selects a built-in IP voice such as `灵小甄` |
+| `instructions` | Online | Free-form text or JSON-encoded Ming controls such as style, emotion, dialect, BGM, or environmental sound |
+| `ref_audio` | Online | Reference audio, usually sent as a data URL for HTTP requests |
+| `ref_text` | Online | Transcript paired with `ref_audio` for zero-shot cloning |
+| `task_type` | Online | Use `Base` for reference-audio cloning requests |
+| `response_format` | Online | `wav` for complete audio or `pcm` for streaming |
+| `stream` | Online | Set `true` with `response_format="pcm"` for streaming output |
+| `max_new_tokens` | Online | Upper bound for speech token generation |
+
+## Verified Results
+
+The following measurements came from result summaries captured on the author's
+validation machine (`/home/aja/Music/mingE2E27may`, not part of this repository)
+for commit `4d923c708099939178e932ff153c63749b430fd1`. Each case used one warmup
+run and one measured run on 1x A100 40GB. Memory peak was not available in the
+captured stats.
+
+### Offline
+
+| Mode | Cases | E2E RTF | Elapsed range | TTFP |
+|---|---:|---:|---:|---:|
+| Blocking | 11 / 11 | 0.5011 - 0.6090, avg 0.5568 | 2.3541s - 15.0980s | N/A |
+| Async chunk streaming | 11 / 11 | 0.4936 - 0.6079, avg 0.5468 | 2.2731s - 14.8571s | 2.2692s - 4.7519s, avg 4.0078s |
+
+Offline blocking and async-chunk streaming both completed all 11 cases:
+`style`, `ip`, `bgm`, `tta`, `emotion`, `basic`, `dialect`, `zero_shot`,
+`podcast`, `speech_bgm`, and `speech_sound`.
+
+### Online
+
+Server startup took 110.01s. The `/v1/audio/speech` endpoint returned HTTP 200
+for the warmup request, all 9 WAV speech cases, and the 1 streaming PCM request.
+
+| Request group | Cases | E2E RTF / latency |
+|---|---:|---:|
+| WAV speech cases | 9 | RTF 0.5208 - 1.6646, avg 0.7622; elapsed 2.38s - 15.98s |
+| Streaming PCM smoke test | 1 | elapsed 2.43s; TTFP 2.423s |
+
+Online WAV cases verified: `style`, `ip`, `basic`, `emotion`, `dialect`,
+`zero_shot`, `podcast`, `speech_bgm`, and `speech_sound`.
+
+## Notes
+
+- The deploy config sets `async_chunk: true`, `dtype: bfloat16`, and
+  `trust_remote_code: false`.
+- Stage 0 and Stage 1 both run on logical device `0` in the bundled config.
+- The verified online route skips `bgm` and `tta` because `/v1/audio/speech`
+  does not yet expose `prompt_mode=music` or `prompt_mode=tta`.
+- Reference-audio fixtures used by the validation come from
+  `inclusionAI/Ming-omni-tts/data/wavs`.

From 2b9d5b5d09ce402f094aab5313d3dcf8adb70af5 Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Thu, 28 May 2026 21:40:16 +0530
Subject: [PATCH 45/54] refactor: remove dead ingress and preprocessor plumbing
 per review

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 vllm_omni/engine/async_omni_engine.py         |  27 +--
 vllm_omni/engine/stage_init_utils.py          |   8 -
 vllm_omni/inputs/preprocess.py                |  26 ---
 .../model_executor/models/ming_tts/ingress.py | 173 ------------------
 4 files changed, 2 insertions(+), 232 deletions(-)
 delete mode 100644 vllm_omni/model_executor/models/ming_tts/ingress.py

diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py
index d13eb6ec8b7..01e3781621c 100644
--- a/vllm_omni/engine/async_omni_engine.py
+++ b/vllm_omni/engine/async_omni_engine.py
@@ -217,27 +217,6 @@ def _apply_omni_final_stage_metadata(
     )
 
 
-def _consume_processed_prompt(
-    input_processor: InputProcessor | None,
-    fallback_prompt: Any,
-) -> Any:
-    """Return the prompt dict actually seen by the stage-0 preprocessor."""
-    if input_processor is None:
-        return fallback_prompt
-    preprocessor = getattr(input_processor, "input_preprocessor", None)
-    if preprocessor is None:
-        return fallback_prompt
-    consume = getattr(preprocessor, "consume_last_processed_prompt", None)
-    if consume is None:
-        return fallback_prompt
-    processed_prompt = consume()
-    if processed_prompt is None:
-        return fallback_prompt
-    if fallback_prompt is not None and not isinstance(processed_prompt, type(fallback_prompt)):
-        return fallback_prompt
-    return processed_prompt
-
-
 def _weak_shutdown_async_omni_engine(
     orchestrator_thread: threading.Thread | None,
     request_queue: janus.Queue[EngineQueueMessage] | None,
@@ -1683,7 +1662,6 @@ def _build_add_request_message(
                     data_parallel_rank=data_parallel_rank,
                     resumable=resumable,
                 )
-                processed_prompt = _consume_processed_prompt(self.input_processor, prompt)
             except Exception:
                 if preselected_stage0_replica is not None and self.stage_pools:
                     self.stage_pools[0].release_binding(request_id)
@@ -1691,7 +1669,7 @@ def _build_add_request_message(
             _preprocess_ms = (time.perf_counter() - _t_preprocess) * 1000.0
             # TODO (Peiqi): add this for Qwen3-TTS only. Other models don't have
             # additional_information field in the prompt.
-            request = _upgrade_to_omni_request(request, processed_prompt)
+            request = _upgrade_to_omni_request(request, prompt)
 
             if reasoning_ended is not None:
                 request.reasoning_ended = reasoning_ended
@@ -1757,8 +1735,7 @@ def _enqueue_cfg_companions(
                 params=companion_params,
                 supported_tasks=self.supported_tasks,
             )
-            processed_prompt = _consume_processed_prompt(self.input_processor, companion_prompt)
-            request = _upgrade_to_omni_request(request, processed_prompt)
+            request = _upgrade_to_omni_request(request, companion_prompt)
             request.external_req_id = cid
 
             # Registration of this companion on stage-0's output processor is
diff --git a/vllm_omni/engine/stage_init_utils.py b/vllm_omni/engine/stage_init_utils.py
index 2bca382fa76..1a9f98fe5f3 100644
--- a/vllm_omni/engine/stage_init_utils.py
+++ b/vllm_omni/engine/stage_init_utils.py
@@ -359,7 +359,6 @@ class StageMetadata:
     final_output_type: str | None
     default_sampling_params: OmniSamplingParams
     custom_process_input_func: Callable | None
-    initial_prompt_processor_factory: Callable | None
     model_stage: str | None
     runtime_cfg: Any
     prompt_expand_func: Callable | None = None
@@ -403,11 +402,6 @@ def extract_stage_metadata(stage_config: Any) -> StageMetadata:
         mod_path, fn_name = _cpif_path.rsplit(".", 1)
         custom_process_input_func = getattr(importlib.import_module(mod_path), fn_name)
 
-    initial_prompt_processor_factory: Callable | None = None
-    if hasattr(stage_config, "initial_prompt_processor"):
-        mod_path, fn_name = stage_config.initial_prompt_processor.rsplit(".", 1)
-        initial_prompt_processor_factory = getattr(importlib.import_module(mod_path), fn_name)
-
     prompt_expand_func: Callable | None = None
     _pef_path = getattr(stage_config, "prompt_expand_func", None)
     if _pef_path:
@@ -432,7 +426,6 @@ def extract_stage_metadata(stage_config: Any) -> StageMetadata:
             final_output_type=final_output_type,
             default_sampling_params=default_sampling_params,
             custom_process_input_func=custom_process_input_func,
-            initial_prompt_processor_factory=initial_prompt_processor_factory,
             model_stage=None,
             runtime_cfg=runtime_cfg,
             cfg_kv_collect_func=cfg_kv_collect_func,
@@ -454,7 +447,6 @@ def extract_stage_metadata(stage_config: Any) -> StageMetadata:
         final_output_type=final_output_type,
         default_sampling_params=default_sampling_params,
         custom_process_input_func=custom_process_input_func,
-        initial_prompt_processor_factory=initial_prompt_processor_factory,
         model_stage=model_stage,
         runtime_cfg=runtime_cfg,
         prompt_expand_func=prompt_expand_func,
diff --git a/vllm_omni/inputs/preprocess.py b/vllm_omni/inputs/preprocess.py
index 06edc84724b..7282d7a520d 100644
--- a/vllm_omni/inputs/preprocess.py
+++ b/vllm_omni/inputs/preprocess.py
@@ -25,30 +25,6 @@ class OmniInputPreprocessor(InputPreprocessor):
     Supports processing tokens, embeddings, text, and multimodal inputs.
     """
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.initial_prompt_processor = None
-        self._last_processed_prompt = None
-
-    def set_initial_prompt_processor(self, processor: Any) -> None:
-        self.initial_prompt_processor = processor
-
-    def consume_last_processed_prompt(self) -> Any:
-        prompt = self._last_processed_prompt
-        self._last_processed_prompt = None
-        return prompt
-
-    def _apply_initial_prompt_processor(self, prompt: SingletonDictPrompt) -> SingletonDictPrompt:
-        self._last_processed_prompt = prompt
-        processor = self.initial_prompt_processor
-        if processor is None or not isinstance(prompt, dict):
-            return prompt
-        processed = processor(prompt)
-        if not isinstance(processed, dict):
-            raise TypeError(f"Initial prompt processor must return a prompt dict, got {type(processed).__name__}")
-        self._last_processed_prompt = processed
-        return processed
-
     def _process_text(
         self,
         parsed_content: OmniTextPrompt,
@@ -192,8 +168,6 @@ def _prompt_to_llm_inputs(
 
         * [`SingletonInput`][vllm.inputs.engine.SingletonInput] instance
         """
-        prompt = self._apply_initial_prompt_processor(prompt)
-
         if "prompt_embeds" in prompt:
             return self._process_embeds(prompt)  # type: ignore[arg-type]
 
diff --git a/vllm_omni/model_executor/models/ming_tts/ingress.py b/vllm_omni/model_executor/models/ming_tts/ingress.py
deleted file mode 100644
index 03187ee846e..00000000000
--- a/vllm_omni/model_executor/models/ming_tts/ingress.py
+++ /dev/null
@@ -1,173 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
-import copy
-import os
-import time
-from typing import Any
-
-from vllm.logger import init_logger
-
-from .config_ming_tts import (
-    AUDIO_DUMMY_TOKEN_ID,
-    AUDIO_START_TOKEN_ID,
-    KEY_PROMPT_LATENTS,
-    KEY_SPEAKER_EMBEDDING,
-    KEY_TEXT_MODE,
-    MingTTSConfig,
-)
-from .prompt_utils import (
-    build_dense_prompt_token_ids,
-    coerce_speaker_embeddings,
-    count_prompt_waveform_patches,
-    create_instruction,
-)
-
-logger = init_logger(__name__)
-
-
-def _rebuild_prompt_token_ids_with_exact_patch_count(prompt_token_ids: Any, prompt_patch_count: int) -> list[int]:
-    if not isinstance(prompt_token_ids, list) or not prompt_token_ids:
-        raise ValueError("Ming prompt finalization requires existing prompt_token_ids")
-
-    audio_start_index = -1
-    for idx in range(len(prompt_token_ids) - 1, -1, -1):
-        if int(prompt_token_ids[idx]) == AUDIO_START_TOKEN_ID:
-            audio_start_index = idx
-            break
-    if audio_start_index < 0:
-        raise ValueError("Ming prompt finalization could not locate <audio> token")
-
-    trailing_tokens = prompt_token_ids[audio_start_index + 1 :]
-    if any(int(token_id) != AUDIO_DUMMY_TOKEN_ID for token_id in trailing_tokens):
-        raise ValueError("Ming prompt finalization expected only trailing <audioPatch> tokens after <audio>")
-
-    return prompt_token_ids[: audio_start_index + 1] + ([AUDIO_DUMMY_TOKEN_ID] * int(prompt_patch_count))
-
-
-class MingIngressProcessor:
-    def __init__(self, *, vllm_config: Any, tokenizer: Any):
-        if tokenizer is None:
-            raise RuntimeError("Ming ingress processor requires an initialized tokenizer")
-
-        self.tokenizer = tokenizer
-        self.profile_ingress = (
-            os.environ.get("MING_TTS_INGRESS_PROFILE") == "1" or os.environ.get("MING_TTS_ASYNC_DEBUG") == "1"
-        )
-
-        self.ming_config = MingTTSConfig.from_hf_config(vllm_config.model_config.hf_config)
-        self.ming_config.validate()
-
-    def __call__(self, prompt: Any) -> Any:
-        total_start = time.perf_counter()
-        if not isinstance(prompt, dict):
-            return prompt
-
-        raw_additional_information = prompt.get("additional_information")
-        if raw_additional_information is None:
-            additional_information = {}
-        elif isinstance(raw_additional_information, dict):
-            additional_information = raw_additional_information
-        else:
-            return prompt
-
-        modalities = prompt.get("modalities")
-        text_mode = isinstance(modalities, (list, tuple)) and ("text" in modalities) and ("audio" not in modalities)
-        if text_mode:
-            finalized_prompt = copy.copy(prompt)
-            finalized_additional_information = dict(additional_information)
-            finalized_additional_information[KEY_TEXT_MODE] = True
-            prompt_token_ids = finalized_prompt.get("prompt_token_ids")
-            if isinstance(prompt_token_ids, list) and prompt_token_ids:
-                if int(prompt_token_ids[-1]) == AUDIO_START_TOKEN_ID:
-                    finalized_prompt["prompt_token_ids"] = prompt_token_ids[:-1]
-            finalized_prompt["additional_information"] = finalized_additional_information
-            return finalized_prompt
-
-        prompt_waveform = additional_information.get("prompt_waveform", prompt.get("prompt_waveform"))
-        prompt_text = additional_information.get("prompt_text", prompt.get("prompt_text"))
-        if prompt_waveform is None:
-            return prompt
-        if prompt_text is None:
-            raise RuntimeError(
-                "Ming prompt_waveform requires prompt_text before ingress can build prompt latents. "
-                "Use ming_speaker_embedding for reference-audio-only speaker conditioning."
-            )
-
-        prompt_latents = additional_information.get(KEY_PROMPT_LATENTS, prompt.get("prompt_latents"))
-        if prompt_latents is not None:
-            raise ValueError(
-                "Ming waveform cloning request provided both raw prompt_waveform and explicit prompt_latents. "
-                "Choose exactly one source of truth."
-            )
-
-        patch_start = time.perf_counter()
-        prompt_patch_count = count_prompt_waveform_patches(
-            prompt_waveform,
-            patch_size=self.ming_config.patch_size,
-            frame_hop=self.ming_config.audio_frame_hop,
-            vae_patch_size=self.ming_config.vae_patch_size,
-        )
-        patch_ms = (time.perf_counter() - patch_start) * 1000.0
-
-        finalized_prompt = copy.copy(prompt)
-        finalized_additional_information = dict(additional_information)
-        finalized_prompt["additional_information"] = finalized_additional_information
-
-        prompt_prefix = finalized_prompt.get("prompt")
-        text = finalized_prompt.get("text")
-        token_start = time.perf_counter()
-        if isinstance(prompt_prefix, str) and isinstance(text, str):
-            speaker_embedding = finalized_prompt.get("speaker_embedding")
-            if speaker_embedding is None:
-                speaker_embedding = finalized_additional_information.get(KEY_SPEAKER_EMBEDDING)
-            speaker_embeddings = coerce_speaker_embeddings(
-                speaker_embedding,
-                use_zero_spk_emb=bool(finalized_additional_information.get("use_zero_spk_emb", False)),
-            )
-
-            instruction = finalized_prompt.get("instruction")
-            if instruction is None:
-                instruction = finalized_additional_information.get("instruction")
-            instruction_text = instruction if isinstance(instruction, str) else create_instruction(instruction)
-
-            finalized_prompt["prompt_token_ids"] = build_dense_prompt_token_ids(
-                self.tokenizer,
-                prompt=prompt_prefix,
-                text=text,
-                instruction=instruction_text,
-                prompt_text=prompt_text,
-                speaker_count=0 if speaker_embeddings is None else len(speaker_embeddings),
-                prompt_patch_count=prompt_patch_count,
-            )
-            if self.profile_ingress:
-                elapsed_ms = (time.perf_counter() - total_start) * 1000.0
-                token_ms = (time.perf_counter() - token_start) * 1000.0
-                logger.info(
-                    "MING_INGRESS_PROFILE finalize_prompt prompt_patch_count=%d speaker_count=%d "
-                    "patch_ms=%.3f token_rebuild_ms=%.3f elapsed_ms=%.3f",
-                    prompt_patch_count,
-                    0 if speaker_embeddings is None else len(speaker_embeddings),
-                    patch_ms,
-                    token_ms,
-                    elapsed_ms,
-                )
-            return finalized_prompt
-
-        finalized_prompt["prompt_token_ids"] = _rebuild_prompt_token_ids_with_exact_patch_count(
-            finalized_prompt.get("prompt_token_ids"),
-            prompt_patch_count,
-        )
-        if self.profile_ingress:
-            elapsed_ms = (time.perf_counter() - total_start) * 1000.0
-            token_ms = (time.perf_counter() - token_start) * 1000.0
-            logger.info(
-                "MING_INGRESS_PROFILE finalize_prompt prompt_patch_count=%d speaker_count=unknown "
-                "patch_ms=%.3f token_rebuild_ms=%.3f elapsed_ms=%.3f",
-                prompt_patch_count,
-                patch_ms,
-                token_ms,
-                elapsed_ms,
-            )
-        return finalized_prompt

From a8a7bf7363c9786999b52ca79e070c8e0449713b Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Thu, 28 May 2026 23:10:11 +0530
Subject: [PATCH 46/54] fix(ming-tts): prevent abandoned stream leaks and fix
 encoder race condition

Adds a size-bounded TTL cache to expire abandoned streams in AudioVAE and implements double-checked locking for prompt encoder cold loads.

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 .../models/ming_tts/ming_tts_audio_vae.py     |  24 +++++
 .../models/ming_tts/prompt_utils.py           | 101 ++++++++++--------
 2 files changed, 82 insertions(+), 43 deletions(-)

diff --git a/vllm_omni/model_executor/models/ming_tts/ming_tts_audio_vae.py b/vllm_omni/model_executor/models/ming_tts/ming_tts_audio_vae.py
index 186a1366f24..27c397dbf58 100644
--- a/vllm_omni/model_executor/models/ming_tts/ming_tts_audio_vae.py
+++ b/vllm_omni/model_executor/models/ming_tts/ming_tts_audio_vae.py
@@ -2,7 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from __future__ import annotations
 
+import time
 import warnings
+from collections import OrderedDict
 from collections.abc import Iterable
 from typing import Any
 
@@ -21,6 +23,8 @@
 logger = init_logger(__name__)
 
 MING_FINAL_DECODE_STEP_KEY = "ming_final_decode_step"
+_STREAM_STATE_TTL_SECONDS = 15 * 60
+_MAX_STREAM_STATES = 1024
 
 
 class MingAudioVAEModel(nn.Module):
@@ -46,6 +50,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self._stream_state: dict[str, tuple[Any, Any, Any]] = {}
         self._patch_totals: dict[str, int] = {}
         self._sample_totals: dict[str, int] = {}
+        self._state_access_times: OrderedDict[str, float] = OrderedDict()
 
     def embed_input_ids(self, input_ids: torch.Tensor, **_: Any) -> torch.Tensor:
         hidden_size = int(self.ming_config.llm_hidden_size)
@@ -123,6 +128,7 @@ def forward(
                     f"Ming Stage-2 received a payload without {KEY_REQUEST_ID}. keys={sorted(info.keys())}"
                 )
             request_id = _resolve_request_id(info, idx)
+            self._touch_request_state(request_id)
             chunk_id = _coerce_optional_int(info.get(KEY_CHUNK_ID))
             finished = _coerce_finished(info.get("stream_finished", torch.tensor(True)))
             latent = info.get("ming_latent_patches")
@@ -245,6 +251,24 @@ def _clear_request_state(self, request_id: str) -> None:
         self._stream_state.pop(request_id, None)
         self._patch_totals.pop(request_id, None)
         self._sample_totals.pop(request_id, None)
+        self._state_access_times.pop(request_id, None)
+
+    def _touch_request_state(self, request_id: str) -> None:
+        now = time.monotonic()
+        self._evict_expired_request_states(now)
+        self._state_access_times[request_id] = now
+        self._state_access_times.move_to_end(request_id)
+        while len(self._state_access_times) > _MAX_STREAM_STATES:
+            evicted_request_id, _ = self._state_access_times.popitem(last=False)
+            self._clear_request_state(evicted_request_id)
+
+    def _evict_expired_request_states(self, now: float) -> None:
+        cutoff = now - _STREAM_STATE_TTL_SECONDS
+        while self._state_access_times:
+            request_id, last_access = next(iter(self._state_access_times.items()))
+            if last_access >= cutoff:
+                break
+            self._clear_request_state(request_id)
 
 
 def _coerce_finished(value: Any) -> bool:
diff --git a/vllm_omni/model_executor/models/ming_tts/prompt_utils.py b/vllm_omni/model_executor/models/ming_tts/prompt_utils.py
index 5b66c04d9c8..2ecf83e3521 100644
--- a/vllm_omni/model_executor/models/ming_tts/prompt_utils.py
+++ b/vllm_omni/model_executor/models/ming_tts/prompt_utils.py
@@ -5,12 +5,15 @@
 import json
 import math
 import re
+import threading
+import time
 from io import BytesIO
 from pathlib import Path
 from typing import Any
 
 import torch
 from safetensors import safe_open
+from vllm.logger import init_logger
 
 from vllm_omni.engine.stage_init_utils import _resolve_model_to_local_path
 from vllm_omni.model_executor.models.ming_flash_omni.prompt_utils import (
@@ -34,6 +37,8 @@
     MingTTSConfig,
 )
 
+logger = init_logger(__name__)
+_PROMPT_ENCODER_LOAD_LOCK = threading.Lock()
 _DURATION_SECONDS_RE = re.compile(r"Duration:\s*([0-9]+(?:\.[0-9]+)?)\s*s\b", re.IGNORECASE)
 
 
@@ -229,7 +234,10 @@ def build_dense_prompt_token_ids(
     prompt_text_tokens = (
         tokenizer.encode(prompt_text) if int(prompt_patch_count) > 0 and prompt_text is not None else []
     )
-    prompt_latent_tokens = [tokenizer.convert_tokens_to_ids("<audioPatch>")] * int(prompt_patch_count)
+    audio_patch_token_id = tokenizer.convert_tokens_to_ids("<audioPatch>")
+    if audio_patch_token_id == tokenizer.unk_token_id:
+        raise ValueError("Ming tokenizer is missing required <audioPatch> token.")
+    prompt_latent_tokens = [audio_patch_token_id] * int(prompt_patch_count)
     text_input_prefix = (
         []
         if all(token in text for token in ("Genre: ", "Mood: ", "Instrument: ", "Theme: ", "Duration: "))
@@ -375,48 +383,55 @@ def _resolve_prompt_latents(wrapper: Any, info_dict: dict[str, Any]) -> dict[str
 def _load_prompt_encoder(wrapper: Any) -> AudioVAE:
     if wrapper._prompt_encoder is not None:
         return wrapper._prompt_encoder
-    if wrapper.ming_config.audio_tokenizer_config is None:
-        raise RuntimeError("Ming Stage-0 requires audio_tokenizer_config to encode prompt audio.")
-
-    encoder = AudioVAE(wrapper.ming_config.audio_tokenizer_config).eval()
-    state_dict = encoder.state_dict()
-    loaded = 0
-    loaded_encoder_params = set()
-    with torch.no_grad():
-        for shard_path in _iter_model_safetensors(
-            _resolve_model_to_local_path(str(wrapper.vllm_config.model_config.model))
-        ):
-            with safe_open(str(shard_path), framework="pt", device="cpu") as handle:
-                for key in handle.keys():
-                    if not key.startswith("audio.encoder."):
-                        continue
-                    name = key[len("audio.") :]
-                    if name not in state_dict:
-                        continue
-                    target = state_dict[name]
-                    target.copy_(handle.get_tensor(key).to(device=target.device, dtype=target.dtype))
-                    loaded += 1
-                    loaded_encoder_params.add(name)
-    if loaded == 0:
-        raise RuntimeError("Ming prompt encoder received no audio.encoder.* weights from checkpoint.")
-
-    expected_encoder_params = {f"encoder.{name}" for name, _ in encoder.encoder.named_parameters()}
-    missing = expected_encoder_params - loaded_encoder_params
-    if missing:
-        raise RuntimeError(f"Ming prompt encoder: {len(missing)} params not loaded. First few: {sorted(missing)[:5]}")
-
-    dev = next(wrapper.parameters()).device
-    try:
-        del encoder.decoder
-        encoder.decoder = None
-        if dev.type != "cpu":
-            encoder.encoder.to(dev, dtype=getattr(wrapper.model, "fm_dtype", torch.bfloat16))
-        else:
-            encoder.encoder.to(dev)
-    except Exception as exc:
-        raise RuntimeError(f"Failed to move Ming prompt encoder to {dev}: {exc}") from exc
-    wrapper._prompt_encoder = encoder
-    return encoder
+    with _PROMPT_ENCODER_LOAD_LOCK:
+        if wrapper._prompt_encoder is not None:
+            return wrapper._prompt_encoder
+        if wrapper.ming_config.audio_tokenizer_config is None:
+            raise RuntimeError("Ming Stage-0 requires audio_tokenizer_config to encode prompt audio.")
+
+        load_start = time.perf_counter()
+        encoder = AudioVAE(wrapper.ming_config.audio_tokenizer_config).eval()
+        state_dict = encoder.state_dict()
+        loaded = 0
+        loaded_encoder_params = set()
+        with torch.no_grad():
+            for shard_path in _iter_model_safetensors(
+                _resolve_model_to_local_path(str(wrapper.vllm_config.model_config.model))
+            ):
+                with safe_open(str(shard_path), framework="pt", device="cpu") as handle:
+                    for key in handle.keys():
+                        if not key.startswith("audio.encoder."):
+                            continue
+                        name = key[len("audio.") :]
+                        if name not in state_dict:
+                            continue
+                        target = state_dict[name]
+                        target.copy_(handle.get_tensor(key).to(device=target.device, dtype=target.dtype))
+                        loaded += 1
+                        loaded_encoder_params.add(name)
+        if loaded == 0:
+            raise RuntimeError("Ming prompt encoder received no audio.encoder.* weights from checkpoint.")
+
+        expected_encoder_params = {f"encoder.{name}" for name, _ in encoder.encoder.named_parameters()}
+        missing = expected_encoder_params - loaded_encoder_params
+        if missing:
+            raise RuntimeError(
+                f"Ming prompt encoder: {len(missing)} params not loaded. First few: {sorted(missing)[:5]}"
+            )
+
+        dev = next(wrapper.parameters()).device
+        try:
+            del encoder.decoder
+            encoder.decoder = None
+            if dev.type != "cpu":
+                encoder.encoder.to(dev, dtype=getattr(wrapper.model, "fm_dtype", torch.bfloat16))
+            else:
+                encoder.encoder.to(dev)
+        except Exception as exc:
+            raise RuntimeError(f"Failed to move Ming prompt encoder to {dev}: {exc}") from exc
+        wrapper._prompt_encoder = encoder
+        logger.info("Ming prompt encoder cold-loaded in %.3f ms", (time.perf_counter() - load_start) * 1000.0)
+        return encoder
 
 
 @torch.inference_mode()

From 6155fae6165ea662df6ac7ff2e4c5ebde303dc9e Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Thu, 28 May 2026 23:12:19 +0530
Subject: [PATCH 47/54] chore(ming-tts): address config, pathing and defensive
 review nits

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 .../text_to_speech/ming_tts/end2end.py                |  3 ++-
 .../text_to_speech/ming_tts/run_server.sh             |  5 ++++-
 vllm_omni/entrypoints/openai/serving_speech.py        | 11 -----------
 .../model_executor/models/ming_tts/config_ming_tts.py |  1 +
 vllm_omni/model_executor/models/ming_tts/ming_tts.py  |  2 +-
 5 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/examples/offline_inference/text_to_speech/ming_tts/end2end.py b/examples/offline_inference/text_to_speech/ming_tts/end2end.py
index 43d693b7675..a2607a03ff7 100644
--- a/examples/offline_inference/text_to_speech/ming_tts/end2end.py
+++ b/examples/offline_inference/text_to_speech/ming_tts/end2end.py
@@ -41,7 +41,8 @@
     )
 
 _DEFAULT_MODEL = "inclusionAI/Ming-omni-tts-0.5B"
-_DEFAULT_DEPLOY_CONFIG = "vllm_omni/deploy/ming_tts.yaml"
+_REPO_ROOT = Path(__file__).resolve().parents[4]
+_DEFAULT_DEPLOY_CONFIG = str(_REPO_ROOT / "vllm_omni/deploy/ming_tts.yaml")
 _CASES_FILE = Path(__file__).with_name("cases.yaml")
 
 CASE_DEFAULTS = yaml.safe_load(_CASES_FILE.read_text(encoding="utf-8")) or {}
diff --git a/examples/online_serving/text_to_speech/ming_tts/run_server.sh b/examples/online_serving/text_to_speech/ming_tts/run_server.sh
index ba35e13fd95..7d294d950ce 100755
--- a/examples/online_serving/text_to_speech/ming_tts/run_server.sh
+++ b/examples/online_serving/text_to_speech/ming_tts/run_server.sh
@@ -7,9 +7,12 @@
 
 set -e
 
+DIR="$(cd "$(dirname "$0")" && pwd)"
+ROOT="$(cd "$DIR/../../../.." && pwd)"
+
 MODEL="${MODEL:-inclusionAI/Ming-omni-tts-0.5B}"
 PORT="${PORT:-8091}"
-DEPLOY_CONFIG="${DEPLOY_CONFIG:-vllm_omni/deploy/ming_tts.yaml}"
+DEPLOY_CONFIG="${DEPLOY_CONFIG:-$ROOT/vllm_omni/deploy/ming_tts.yaml}"
 
 echo "Starting Ming-omni-tts server with model: $MODEL"
 echo "Deploy config: $DEPLOY_CONFIG"
diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py
index a4695712590..ab3073076dc 100644
--- a/vllm_omni/entrypoints/openai/serving_speech.py
+++ b/vllm_omni/entrypoints/openai/serving_speech.py
@@ -2915,17 +2915,6 @@ async def _prepare_speech_generation(
                 # Stage-0 needs one extra token beyond ming_max_decode_steps.
                 sampling_params_list[0].max_tokens = int(request.max_new_tokens) + 1
 
-        # Propagate per-request seed to sampling params so both Slow AR
-        # and Fast AR produce deterministic output for the same seed.
-        if request.seed is not None and sampling_params_list:
-            if not self._is_fish_speech:
-                logger.warning(
-                    "seed=%d requested but deterministic Fast AR seeding is "
-                    "only implemented for Fish Speech; other TTS models will "
-                    "use the seed for the main AR sampler only.",
-                    request.seed,
-                )
-
         if request.seed is not None and sampling_params_list:
             if sampling_params_list is self.engine_client.default_sampling_params_list:
                 import copy
diff --git a/vllm_omni/model_executor/models/ming_tts/config_ming_tts.py b/vllm_omni/model_executor/models/ming_tts/config_ming_tts.py
index 5b2036a63dd..a529e06c656 100644
--- a/vllm_omni/model_executor/models/ming_tts/config_ming_tts.py
+++ b/vllm_omni/model_executor/models/ming_tts/config_ming_tts.py
@@ -62,6 +62,7 @@ def _coerce_qwen2_config(value: Any) -> Qwen2Config:
 
 
 class MingDenseConfig(PretrainedConfig):
+    # Keep this aligned with the upstream inclusionAI HF config; the repo declares model_type="dense".
     model_type = "dense"
 
     def __init__(
diff --git a/vllm_omni/model_executor/models/ming_tts/ming_tts.py b/vllm_omni/model_executor/models/ming_tts/ming_tts.py
index 3b495efb259..2410bdcfdde 100644
--- a/vllm_omni/model_executor/models/ming_tts/ming_tts.py
+++ b/vllm_omni/model_executor/models/ming_tts/ming_tts.py
@@ -171,7 +171,7 @@ def _prefill_preprocess(self, input_ids: torch.Tensor, input_embeds: torch.Tenso
                 speaker_embedding,
                 use_zero_spk_emb=bool(info_dict.get("use_zero_spk_emb", False)),
             )
-        if speaker_embeddings:
+        if speaker_embeddings is not None and len(speaker_embeddings) > 0:
             speaker_slots = _find_speaker_placeholder_positions(input_ids, self.vllm_config.model_config.hf_config)
             if len(speaker_slots) < len(speaker_embeddings):
                 raise RuntimeError(

From 7fc12efe6dea83b72e99ff6c4554e0831b34c3c1 Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Thu, 28 May 2026 23:44:32 +0530
Subject: [PATCH 48/54] Gate Ming TTS final-stage logging

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 .../models/ming_tts/ming_tts_audio_vae.py     | 28 ++++++++++---------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/vllm_omni/model_executor/models/ming_tts/ming_tts_audio_vae.py b/vllm_omni/model_executor/models/ming_tts/ming_tts_audio_vae.py
index 27c397dbf58..a7eee3d750d 100644
--- a/vllm_omni/model_executor/models/ming_tts/ming_tts_audio_vae.py
+++ b/vllm_omni/model_executor/models/ming_tts/ming_tts_audio_vae.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from __future__ import annotations
 
+import os
 import time
 import warnings
 from collections import OrderedDict
@@ -189,19 +190,20 @@ def forward(
                 )
 
             if finished:
-                logger.info(
-                    "MING_STAGE1_FINAL %s",
-                    {
-                        "request_id": request_id,
-                        "chunk_id": chunk_id,
-                        "stop_reason": info.get(MING_STOP_REASON_KEY),
-                        "final_decode_step": _coerce_optional_int(info.get(MING_FINAL_DECODE_STEP_KEY)),
-                        "final_chunk_patch_count": patch_count,
-                        "total_patch_count": total_patch_count,
-                        "final_chunk_waveform_numel": int(waveform_flat.numel()),
-                        "total_waveform_numel": total_waveform_numel,
-                    },
-                )
+                if os.environ.get("MING_TTS_STAGE1_FINAL_LOG") == "1":
+                    logger.info(
+                        "MING_STAGE1_FINAL %s",
+                        {
+                            "request_id": request_id,
+                            "chunk_id": chunk_id,
+                            "stop_reason": info.get(MING_STOP_REASON_KEY),
+                            "final_decode_step": _coerce_optional_int(info.get(MING_FINAL_DECODE_STEP_KEY)),
+                            "final_chunk_patch_count": patch_count,
+                            "total_patch_count": total_patch_count,
+                            "final_chunk_waveform_numel": int(waveform_flat.numel()),
+                            "total_waveform_numel": total_waveform_numel,
+                        },
+                    )
                 self._clear_request_state(request_id)
             else:
                 self._past_key_values[request_id] = past_key_values

From 9862a46f79af5d945023163a9fb6fa81d969f4e5 Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Fri, 29 May 2026 01:07:17 +0530
Subject: [PATCH 49/54] Split Ming TTS prompt helpers by responsibility

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 .../text_to_speech/ming_tts/end2end.py        |   2 +-
 tests/e2e/offline_inference/test_ming_tts.py  |   2 +-
 .../entrypoints/openai/serving_speech.py      |   2 +-
 .../models/ming_tts/audio_prep.py             | 261 ++++++++
 .../models/ming_tts/ming_tts.py               |  18 +-
 .../models/ming_tts/prompt_assembly.py        | 274 ++++++++
 .../models/ming_tts/prompt_encoder.py         | 174 +++++
 .../models/ming_tts/prompt_utils.py           | 622 ------------------
 8 files changed, 721 insertions(+), 634 deletions(-)
 create mode 100644 vllm_omni/model_executor/models/ming_tts/audio_prep.py
 create mode 100644 vllm_omni/model_executor/models/ming_tts/prompt_assembly.py
 create mode 100644 vllm_omni/model_executor/models/ming_tts/prompt_encoder.py
 delete mode 100644 vllm_omni/model_executor/models/ming_tts/prompt_utils.py

diff --git a/examples/offline_inference/text_to_speech/ming_tts/end2end.py b/examples/offline_inference/text_to_speech/ming_tts/end2end.py
index a2607a03ff7..5bcb18ac712 100644
--- a/examples/offline_inference/text_to_speech/ming_tts/end2end.py
+++ b/examples/offline_inference/text_to_speech/ming_tts/end2end.py
@@ -20,7 +20,7 @@
     KEY_TEMPERATURE,
     SAMPLE_RATE,
 )
-from vllm_omni.model_executor.models.ming_tts.prompt_utils import build_ming_dense_prompt
+from vllm_omni.model_executor.models.ming_tts.prompt_assembly import build_ming_dense_prompt
 from vllm_omni.model_executor.models.ming_tts.speaker_extractor import MingSpeakerEmbeddingExtractor
 
 try:
diff --git a/tests/e2e/offline_inference/test_ming_tts.py b/tests/e2e/offline_inference/test_ming_tts.py
index b979f234d8a..d9bab6dd26d 100644
--- a/tests/e2e/offline_inference/test_ming_tts.py
+++ b/tests/e2e/offline_inference/test_ming_tts.py
@@ -21,7 +21,7 @@
     SAMPLE_RATE,
     TEXT_EOS_TOKEN_ID,
 )
-from vllm_omni.model_executor.models.ming_tts.prompt_utils import DEFAULT_PROMPT, build_ming_dense_prompt
+from vllm_omni.model_executor.models.ming_tts.prompt_assembly import DEFAULT_PROMPT, build_ming_dense_prompt
 
 MODEL = "inclusionAI/Ming-omni-tts-0.5B"
 DEPLOY_CONFIG = get_deploy_config_path("ming_tts.yaml")
diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py
index ab3073076dc..777eb878e02 100644
--- a/vllm_omni/entrypoints/openai/serving_speech.py
+++ b/vllm_omni/entrypoints/openai/serving_speech.py
@@ -1930,7 +1930,7 @@ def _build_ming_dense_prompt(
         from transformers import AutoTokenizer
 
         from vllm_omni.model_executor.models.ming_tts.config_ming_tts import KEY_MAX_DECODE_STEPS
-        from vllm_omni.model_executor.models.ming_tts.prompt_utils import build_ming_dense_prompt
+        from vllm_omni.model_executor.models.ming_tts.prompt_assembly import build_ming_dense_prompt
 
         if self._tts_tokenizer is None:
             model_name = self.engine_client.model_config.model
diff --git a/vllm_omni/model_executor/models/ming_tts/audio_prep.py b/vllm_omni/model_executor/models/ming_tts/audio_prep.py
new file mode 100644
index 00000000000..b63e70cc8a0
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_tts/audio_prep.py
@@ -0,0 +1,261 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
+import math
+from io import BytesIO
+from typing import Any
+
+import torch
+
+from .config_ming_tts import (
+    AUDIO_FRAME_HOP,
+    LATENT_DIM,
+    PATCH_SIZE,
+    SAMPLE_RATE,
+    VAE_PATCH_SIZE,
+    VISION_START_TOKEN_ID,
+    MingTTSConfig,
+)
+
+
+def pad_prompt_waveform(
+    waveform: Any,
+    *,
+    patch_size: int = PATCH_SIZE,
+    sample_rate: int = SAMPLE_RATE,
+) -> torch.Tensor:
+    tensor = coerce_prompt_waveform(waveform)
+    pad_align = int((float(sample_rate) / 12.5) * int(patch_size))
+    new_len = ((int(tensor.shape[-1]) + pad_align - 1) // pad_align) * pad_align
+    if new_len == int(tensor.shape[-1]):
+        return tensor
+    padded = torch.zeros((1, new_len), dtype=tensor.dtype, device=tensor.device)
+    padded[:, : tensor.shape[-1]] = tensor
+    return padded
+
+
+def coerce_prompt_waveform(value: Any) -> torch.Tensor:
+    if value is None:
+        raise ValueError("prompt waveform cannot be None")
+    if isinstance(value, torch.Tensor):
+        tensor = value.detach()
+        if tensor.ndim == 1:
+            return tensor.unsqueeze(0).to(torch.float32)
+        if tensor.ndim == 2:
+            if tensor.shape[0] != 1:
+                return tensor.reshape(1, -1).to(torch.float32)
+            return tensor.to(torch.float32)
+        raise ValueError(f"Unsupported Ming prompt waveform rank: {tuple(tensor.shape)}")
+    if isinstance(value, (list, tuple)):
+        parts = [coerce_prompt_waveform(item) for item in value if item is not None]
+        if not parts:
+            raise ValueError("prompt waveform list was empty")
+        return torch.cat(parts, dim=-1)
+    return coerce_prompt_waveform(torch.as_tensor(value))
+
+
+def coerce_speaker_embeddings(value: Any, *, use_zero_spk_emb: bool = False) -> list[torch.Tensor] | None:
+    if value is None:
+        return [torch.zeros((192,), dtype=torch.float32)] if use_zero_spk_emb else None
+    if isinstance(value, torch.Tensor):
+        tensor = value.detach()
+        if tensor.ndim == 1:
+            tensor = tensor.unsqueeze(0)
+        if tensor.ndim != 2:
+            raise ValueError(f"Unsupported Ming speaker embedding shape: {tuple(tensor.shape)}")
+        items = [row.reshape(-1).to(torch.float32).cpu() for row in tensor]
+    elif isinstance(value, (list, tuple)):
+        if value and all(not isinstance(item, (list, tuple, torch.Tensor)) for item in value):
+            items = [torch.as_tensor(value).detach().reshape(-1).to(torch.float32).cpu()]
+        else:
+            items = []
+            for item in value:
+                if item is None:
+                    continue
+                if not isinstance(item, torch.Tensor):
+                    item = torch.as_tensor(item)
+                items.append(item.detach().reshape(-1).to(torch.float32).cpu())
+    else:
+        return coerce_speaker_embeddings(torch.as_tensor(value), use_zero_spk_emb=use_zero_spk_emb)
+    if not items:
+        return [torch.zeros((192,), dtype=torch.float32)] if use_zero_spk_emb else None
+    for item in items:
+        if int(item.numel()) != 192:
+            raise ValueError(f"Ming speaker embedding must have 192 dims, got {int(item.numel())}")
+    return items
+
+
+def count_prompt_latent_patches(
+    value: Any,
+    *,
+    patch_size: int = PATCH_SIZE,
+    latent_dim: int = LATENT_DIM,
+) -> int:
+    if value is None:
+        return 0
+    if not isinstance(value, torch.Tensor):
+        value = torch.as_tensor(value)
+    latents = value.detach()
+    if latents.ndim == 3 and latents.shape[0] == 1:
+        latents = latents.squeeze(0)
+    if latents.ndim == 3 and latents.shape[-2:] == (patch_size, latent_dim):
+        return int(latents.shape[0])
+    if latents.ndim != 2 or latents.shape[-1] != latent_dim:
+        raise ValueError(f"Unsupported Ming prompt_latents shape: {tuple(latents.shape)}")
+    if latents.shape[0] % patch_size != 0:
+        raise ValueError(
+            f"Ming prompt_latents frame count must be divisible by patch_size={patch_size}, "
+            f"got frames={int(latents.shape[0])}"
+        )
+    return int(latents.shape[0] // patch_size)
+
+
+def count_prompt_waveform_patches(
+    value: Any,
+    *,
+    patch_size: int = PATCH_SIZE,
+    frame_hop: int = AUDIO_FRAME_HOP,
+    vae_patch_size: int = VAE_PATCH_SIZE,
+) -> int:
+    if value is None:
+        return 0
+    waveform = pad_prompt_waveform(value, patch_size=patch_size)
+    frame_count = int(math.ceil(float(waveform.shape[-1]) / float(frame_hop)))
+    latent_frames = int(math.ceil(float(frame_count) / float(vae_patch_size)))
+    if latent_frames % int(patch_size) != 0:
+        raise ValueError(
+            f"Ming prompt waveform produced latent frame count not divisible by patch_size={patch_size}: "
+            f"frames={latent_frames}"
+        )
+    return int(latent_frames // int(patch_size))
+
+
+def _normalize_prompt_waveform(value: Any, *, target_sr: int) -> torch.Tensor:
+    """Coerce prompt audio to a waveform tensor, resampling to target_sr when a source rate is known."""
+    if isinstance(value, bytes):
+        import torchaudio
+
+        waveform, sr = torchaudio.load(BytesIO(value))
+        waveform = waveform[:1].to(torch.float32)
+        if int(sr) != int(target_sr):
+            from torchaudio.functional import resample as resample_audio
+
+            waveform = resample_audio(waveform, int(sr), int(target_sr))
+        return waveform
+
+    if isinstance(value, tuple) and len(value) == 2 and isinstance(value[1], int):
+        waveform = coerce_prompt_waveform(value[0])
+        if int(value[1]) != int(target_sr):
+            from torchaudio.functional import resample as resample_audio
+
+            waveform = resample_audio(waveform, int(value[1]), int(target_sr))
+        return waveform
+
+    if isinstance(value, dict):  # a missing, None or 0 rate falls back to target_sr instead of crashing in int()
+        samples = value.get("samples", value.get("array", value.get("waveform")))
+        sr = value.get("sample_rate") or value.get("sampling_rate") or value.get("sr") or target_sr
+        return _normalize_prompt_waveform((samples, int(sr)), target_sr=target_sr)
+    return coerce_prompt_waveform(value)
+
+
+def _coerce_prompt_latents(
+    value: Any,
+    *,
+    patch_size: int,
+    latent_dim: int,
+) -> dict[str, torch.Tensor] | None:
+    if value is None:
+        return None
+    if not isinstance(value, torch.Tensor):
+        value = torch.as_tensor(value)
+
+    latents = value.detach()
+    if latents.ndim == 3 and latents.shape[0] == 1:
+        latents = latents.squeeze(0)
+
+    if latents.ndim == 3 and latents.shape[-2:] == (patch_size, latent_dim):
+        patches = latents
+        # [Patch, Time, Dimension] -> [Frame, Dimension] for history seeding.
+        frames = patches.reshape(-1, latent_dim)
+        return {"patches": patches, "frames": frames}
+
+    if latents.ndim != 2 or latents.shape[-1] != latent_dim:
+        raise ValueError(f"Unsupported prompt latent shape: {tuple(latents.shape)}")
+    if latents.shape[0] % patch_size != 0:
+        raise ValueError(
+            f"Prompt latent frame count must be divisible by patch_size={patch_size}, "
+            f"got frames={int(latents.shape[0])}"
+        )
+    # [Frame, Dimension] -> [Patch, Time, Dimension] for Aggregator prompt slots.
+    patches = latents.reshape(-1, patch_size, latent_dim) if latents.shape[0] > 0 else None
+    return {"patches": patches, "frames": latents}
+
+
+def _initial_history(
+    frames: torch.Tensor | None,
+    *,
+    history_size: int,
+    latent_dim: int,
+    device: torch.device,
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    """Return a zero (history_size, latent_dim) buffer whose last rows hold the newest prompt frames."""
+    history = torch.zeros((history_size, latent_dim), device=device, dtype=dtype)
+    if frames is None or frames.numel() == 0 or history_size <= 0:  # `x[-0:]` would select every row
+        return history
+    take = min(history_size, int(frames.shape[0]))
+    history[-take:] = frames[-take:].to(device=device, dtype=dtype)
+    return history
+
+
+def _take_scalar(value: Any, idx: int) -> float | None:
+    if not isinstance(value, torch.Tensor) or value.numel() == 0:
+        return None
+    return float(value.reshape(-1)[idx].item())
+
+
+def _find_audio_placeholder_positions(input_ids: torch.Tensor, cfg: MingTTSConfig) -> torch.Tensor:
+    dummy_pos = (input_ids == cfg.audio_dummy_token_id).nonzero(as_tuple=True)[0]
+    if dummy_pos.numel() == 0:
+        return dummy_pos
+
+    audio_start_pos = (input_ids == cfg.audio_start_token_id).nonzero(as_tuple=True)[0]
+    audio_end_pos = (input_ids == cfg.audio_end_token_id).nonzero(as_tuple=True)[0]
+    if audio_start_pos.numel() == 0:
+        return dummy_pos
+
+    start = int(audio_start_pos[0].item())
+    end = int(audio_end_pos[0].item()) if audio_end_pos.numel() > 0 else int(input_ids.shape[0])
+    keep = (dummy_pos > start) & (dummy_pos < end)
+    filtered = dummy_pos[keep]
+    return filtered if filtered.numel() > 0 else dummy_pos
+
+
+def _find_speaker_placeholder_positions(input_ids: torch.Tensor, hf_config: Any) -> list[int]:
+    vision_start_token_id = getattr(hf_config, "vision_start_token_id", VISION_START_TOKEN_ID)
+    vision_start_pos = (input_ids == int(vision_start_token_id)).nonzero(as_tuple=True)[0]
+    if vision_start_pos.numel() == 0:
+        return []
+
+    slots = []
+    for pos in vision_start_pos:
+        slot = int(pos.item()) + 1
+        if slot < int(input_ids.shape[0]):
+            slots.append(slot)
+    return slots
+
+
+__all__ = [
+    "coerce_prompt_waveform",
+    "coerce_speaker_embeddings",
+    "count_prompt_latent_patches",
+    "count_prompt_waveform_patches",
+    "pad_prompt_waveform",
+    "_coerce_prompt_latents",
+    "_find_audio_placeholder_positions",
+    "_find_speaker_placeholder_positions",
+    "_initial_history",
+    "_normalize_prompt_waveform",
+    "_take_scalar",
+]
diff --git a/vllm_omni/model_executor/models/ming_tts/ming_tts.py b/vllm_omni/model_executor/models/ming_tts/ming_tts.py
index 2410bdcfdde..d31005c5099 100644
--- a/vllm_omni/model_executor/models/ming_tts/ming_tts.py
+++ b/vllm_omni/model_executor/models/ming_tts/ming_tts.py
@@ -14,6 +14,14 @@
 
 from vllm_omni.model_executor.custom_process_mixin import CustomProcessMixin
 
+from .audio_prep import (
+    _coerce_prompt_latents,
+    _find_audio_placeholder_positions,
+    _find_speaker_placeholder_positions,
+    _initial_history,
+    _take_scalar,
+    coerce_speaker_embeddings,
+)
 from .config_ming_tts import (
     AUDIO_START_TOKEN_ID,
     KEY_CFG,
@@ -35,15 +43,7 @@
     load_weights,
 )
 from .patch_emission import MING_STOP_REASON_KEY
-from .prompt_utils import (
-    _coerce_prompt_latents,
-    _find_audio_placeholder_positions,
-    _find_speaker_placeholder_positions,
-    _initial_history,
-    _resolve_prompt_latents,
-    _take_scalar,
-    coerce_speaker_embeddings,
-)
+from .prompt_encoder import _resolve_prompt_latents
 
 
 class _ModelSampleAdapter(nn.Module):
diff --git a/vllm_omni/model_executor/models/ming_tts/prompt_assembly.py b/vllm_omni/model_executor/models/ming_tts/prompt_assembly.py
new file mode 100644
index 00000000000..c732e07a045
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_tts/prompt_assembly.py
@@ -0,0 +1,274 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
+import copy
+import json
+import math
+import re
+from typing import Any
+
+import torch
+
+from .audio_prep import (
+    coerce_speaker_embeddings,
+    count_prompt_latent_patches,
+    count_prompt_waveform_patches,
+    pad_prompt_waveform,
+)
+from .config_ming_tts import (
+    AUDIO_FRAME_HOP,
+    KEY_MAX_DECODE_STEPS,
+    KEY_MIN_DECODE_STEPS,
+    KEY_PROMPT_LATENTS,
+    KEY_REQUEST_ID,
+    KEY_SPEAKER_EMBEDDING,
+    LATENT_DIM,
+    PATCH_SIZE,
+    SAMPLE_RATE,
+    VAE_PATCH_SIZE,
+)
+
+DEFAULT_PROMPT = "Please generate speech based on the following description.\n"
+BASE_CAPTION_TEMPLATE: dict[str, Any] = {
+    "audio_sequence": [
+        {
+            "序号": 1,
+            "说话人": "speaker_1",
+            "方言": None,
+            "风格": None,
+            "语速": None,
+            "基频": None,
+            "音量": None,
+            "情感": None,
+            "BGM": {
+                "Genre": None,
+                "Mood": None,
+                "Instrument": None,
+                "Theme": None,
+                "ENV": None,
+                "SNR": None,
+            },
+            "IP": None,
+        }
+    ]
+}
+_DURATION_SECONDS_RE = re.compile(r"Duration:\s*([0-9]+(?:\.[0-9]+)?)\s*s\b", re.IGNORECASE)
+
+
+def create_instruction(user_input: Any) -> str | None:
+    if user_input is None:
+        return None
+    if isinstance(user_input, str):
+        return user_input
+    if not isinstance(user_input, dict):
+        raise ValueError(f"Ming instruction must be str, dict, or None; got {type(user_input).__name__}")
+    caption = copy.deepcopy(BASE_CAPTION_TEMPLATE)
+    item = caption["audio_sequence"][0]
+    for key, value in user_input.items():
+        if key in item:
+            item[key] = value
+    return json.dumps(caption, ensure_ascii=False)
+
+
+def parse_duration_seconds(text: str | None) -> float | None:
+    if not isinstance(text, str):
+        return None
+    match = _DURATION_SECONDS_RE.search(text)
+    if match is None:
+        return None
+    try:
+        value = float(match.group(1))
+    except ValueError:
+        return None
+    if value <= 0.0:
+        return None
+    return value
+
+
+def estimate_decode_steps_for_duration(
+    duration_seconds: float,
+    *,
+    sample_rate: int = SAMPLE_RATE,
+    frame_hop: int = AUDIO_FRAME_HOP,
+    patch_size: int = PATCH_SIZE,
+    vae_patch_size: int = VAE_PATCH_SIZE,
+) -> int:
+    if duration_seconds <= 0.0:
+        return 0
+    samples_per_decode_step = int(frame_hop) * int(patch_size) * int(vae_patch_size)
+    required_samples = float(duration_seconds) * float(sample_rate)
+    return max(1, int(math.ceil(required_samples / float(samples_per_decode_step))))
+
+
+def estimate_decode_step_window_for_duration(duration_seconds: float) -> tuple[int, int]:
+    target_steps = estimate_decode_steps_for_duration(duration_seconds)
+    min_steps = max(1, target_steps - 3)
+    max_steps = max(min_steps, target_steps + 3)
+    return min_steps, max_steps
+
+
+def resolve_effective_runtime_controls(
+    *,
+    text: str,
+    runtime_controls: dict[str, Any] | None = None,
+) -> dict[str, Any]:
+    """Return a copy of runtime_controls, adding decode-step bounds from a "Duration: Ns" hint if none are set."""
+    # None means "unset": drop it so the prompt builder never hits int(None) / float(None) on it.
+    controls = {key: value for key, value in (runtime_controls or {}).items() if value is not None}
+    if KEY_MIN_DECODE_STEPS in controls or KEY_MAX_DECODE_STEPS in controls:
+        return controls  # any explicit bound disables duration-based estimation
+    duration_seconds = parse_duration_seconds(text)
+    if duration_seconds is None:
+        return controls
+    min_decode_steps, max_decode_steps = estimate_decode_step_window_for_duration(duration_seconds)
+    controls[KEY_MIN_DECODE_STEPS] = min_decode_steps
+    controls[KEY_MAX_DECODE_STEPS] = max_decode_steps
+    return controls
+
+
+def build_dense_prompt_token_ids(
+    tokenizer: Any,
+    *,
+    prompt: str,
+    text: str,
+    instruction: str | None = None,
+    prompt_text: str | None = None,
+    speaker_count: int = 0,
+    prompt_patch_count: int = 0,
+) -> list[int]:
+    speaker_prompt = []
+    for idx in range(int(speaker_count)):
+        speaker_prompt.extend(
+            tokenizer.encode(f"  speaker_{idx + 1}:")
+            + tokenizer.encode("<|vision_start|>")
+            + tokenizer.encode("<|vision_pad|>")
+            + tokenizer.encode("<|vision_end|>\n")
+        )
+    instruction_prompt = (
+        tokenizer.encode(instruction) + tokenizer.encode("<|endoftext|>") if instruction is not None else []
+    )
+    prompt_text_tokens = (
+        tokenizer.encode(prompt_text) if int(prompt_patch_count) > 0 and prompt_text is not None else []
+    )
+    audio_patch_token_id = tokenizer.convert_tokens_to_ids("<audioPatch>")
+    if audio_patch_token_id == tokenizer.unk_token_id:
+        raise ValueError("Ming tokenizer is missing required <audioPatch> token.")
+    prompt_latent_tokens = [audio_patch_token_id] * int(prompt_patch_count)
+    text_input_prefix = (
+        []
+        if all(token in text for token in ("Genre: ", "Mood: ", "Instrument: ", "Theme: ", "Duration: "))
+        else tokenizer.encode(" Text input:\n")
+    )
+    return (
+        tokenizer.encode("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n")
+        + tokenizer.encode("<|im_start|>user\n")
+        + tokenizer.encode(prompt)
+        + speaker_prompt
+        + text_input_prefix
+        + prompt_text_tokens
+        + tokenizer.encode(text)
+        + tokenizer.encode("<|im_end|>\n")
+        + tokenizer.encode("<|im_start|>assistant\n")
+        + instruction_prompt
+        + tokenizer.encode("<audio>")
+        + prompt_latent_tokens
+    )
+
+
+def build_ming_dense_prompt(
+    tokenizer: Any,
+    *,
+    prompt: str,
+    text: str,
+    runtime_controls: dict[str, Any] | None = None,
+    instruction: Any = None,
+    prompt_text: str | None = None,
+    prompt_waveform: Any = None,
+    prompt_latents: Any = None,
+    speaker_embedding: Any = None,
+    use_zero_spk_emb: bool = False,
+    request_id: str | None = None,
+) -> dict[str, Any]:
+    instruction_text = create_instruction(instruction)
+    speaker_embeddings = coerce_speaker_embeddings(speaker_embedding, use_zero_spk_emb=use_zero_spk_emb)
+    effective_runtime_controls = resolve_effective_runtime_controls(text=text, runtime_controls=runtime_controls)
+
+    prompt_waveform_tensor = None
+    prompt_patch_count = 0
+    if prompt_waveform is not None:
+        prompt_waveform_tensor = pad_prompt_waveform(prompt_waveform)
+        prompt_patch_count = count_prompt_waveform_patches(prompt_waveform_tensor)
+    if prompt_waveform_tensor is not None and prompt_latents is not None:
+        raise ValueError(
+            "Ming waveform cloning request provided both raw prompt_waveform and explicit prompt_latents. "
+            "Choose exactly one source of truth."
+        )
+
+    prompt_latent_value = None
+    if prompt_waveform_tensor is not None and prompt_text is None:
+        raise ValueError(
+            "Ming prompt_waveform requires prompt_text for prompt-latent conditioning. "
+            "Use speaker_embedding for reference-audio-only speaker conditioning."
+        )
+    if prompt_latents is not None:
+        prompt_latent_value = torch.as_tensor(prompt_latents)
+        prompt_patch_count = count_prompt_latent_patches(
+            prompt_latent_value, patch_size=PATCH_SIZE, latent_dim=LATENT_DIM
+        )
+
+    prompt_token_ids = build_dense_prompt_token_ids(
+        tokenizer,
+        prompt=prompt,
+        text=text,
+        instruction=instruction_text,
+        prompt_text=prompt_text if prompt_patch_count > 0 else None,
+        speaker_count=0 if speaker_embeddings is None else len(speaker_embeddings),
+        prompt_patch_count=prompt_patch_count,
+    )
+
+    additional_information = {}
+    for key, value in effective_runtime_controls.items():
+        if isinstance(value, torch.Tensor):
+            additional_information[key] = value
+        elif key in (KEY_MIN_DECODE_STEPS, KEY_MAX_DECODE_STEPS):
+            additional_information[key] = torch.tensor(int(value), dtype=torch.int32)
+        else:
+            additional_information[key] = torch.tensor(float(value), dtype=torch.float32)
+    if request_id is not None:
+        additional_information[KEY_REQUEST_ID] = request_id
+    if instruction_text is not None:
+        additional_information["instruction"] = instruction_text
+    if prompt_text is not None:
+        additional_information["prompt_text"] = prompt_text
+    if prompt_waveform_tensor is not None:
+        additional_information["prompt_waveform"] = prompt_waveform_tensor
+        additional_information["prompt_waveform_length"] = torch.tensor(
+            [int(prompt_waveform_tensor.shape[-1])], dtype=torch.int32
+        )
+    if prompt_latent_value is not None:
+        additional_information[KEY_PROMPT_LATENTS] = prompt_latent_value
+    if speaker_embeddings is not None:
+        additional_information[KEY_SPEAKER_EMBEDDING] = (
+            speaker_embeddings[0] if len(speaker_embeddings) == 1 else torch.stack(speaker_embeddings, dim=0)
+        )
+    if use_zero_spk_emb:
+        additional_information["use_zero_spk_emb"] = True
+    return {
+        "prompt": prompt,
+        "text": text,
+        "prompt_token_ids": prompt_token_ids,
+        "additional_information": additional_information,
+    }
+
+
+__all__ = [
+    "DEFAULT_PROMPT",
+    "build_dense_prompt_token_ids",
+    "build_ming_dense_prompt",
+    "create_instruction",
+    "estimate_decode_step_window_for_duration",
+    "estimate_decode_steps_for_duration",
+    "parse_duration_seconds",
+    "resolve_effective_runtime_controls",
+]
diff --git a/vllm_omni/model_executor/models/ming_tts/prompt_encoder.py b/vllm_omni/model_executor/models/ming_tts/prompt_encoder.py
new file mode 100644
index 00000000000..fb368411d8a
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_tts/prompt_encoder.py
@@ -0,0 +1,174 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
+import json
+import threading
+import time
+from pathlib import Path
+from typing import Any
+
+import torch
+from safetensors import safe_open
+from vllm.logger import init_logger
+
+from vllm_omni.engine.stage_init_utils import _resolve_model_to_local_path
+
+from .audio_prep import (
+    _coerce_prompt_latents,
+    _normalize_prompt_waveform,
+    count_prompt_latent_patches,
+    pad_prompt_waveform,
+)
+from .audio_tokenizer.modeling_audio_vae import AudioVAE
+from .config_ming_tts import KEY_PROMPT_LATENTS
+
+logger = init_logger(__name__)
+_PROMPT_ENCODER_LOAD_LOCK = threading.Lock()  # Guards first-time construction in _load_prompt_encoder.
+
+
+def _resolve_prompt_latents(wrapper: Any, info_dict: dict[str, Any]) -> dict[str, torch.Tensor] | None:
+    """Resolve the prompt latents for one request.
+
+    Explicit latents found in ``info_dict`` are coerced and returned when that yields a
+    result. Otherwise a raw prompt waveform, if present, is encoded and then coerced.
+    Returns ``None`` when coercion yields nothing and no waveform is supplied.
+
+    Raises:
+        ValueError: If both explicit latents and a raw waveform are supplied.
+    """
+    explicit = info_dict.get(KEY_PROMPT_LATENTS, info_dict.get("prompt_latents"))
+    waveform = info_dict.get("prompt_waveform", info_dict.get("prompt_waveforms"))
+    if not (explicit is None or waveform is None):
+        raise ValueError(
+            "Ming waveform cloning request provided both raw prompt_waveform and explicit prompt_latents. "
+            "Choose exactly one source of truth."
+        )
+
+    layout = {"patch_size": wrapper.ming_config.patch_size, "latent_dim": wrapper.ming_config.latent_dim}
+    coerced = _coerce_prompt_latents(explicit, **layout)
+    if coerced is not None:
+        return coerced
+    if waveform is None:
+        return None
+
+    # Prefer an encoder hook defined on the wrapper; otherwise use the module-level encoder.
+    length = info_dict.get("prompt_waveform_length")
+    hook = getattr(wrapper, "_encode_prompt_waveform_to_latents", None)
+    if callable(hook):
+        encoded = hook(waveform, length)
+    else:
+        encoded = _encode_prompt_waveform_to_latents(wrapper, waveform, length)
+    return _coerce_prompt_latents(encoded, **layout)
+
+
+def _load_prompt_encoder(wrapper: Any) -> AudioVAE:  # Lazily builds the prompt encoder and caches it on wrapper.
+    if wrapper._prompt_encoder is not None:  # fast path: already cached, lock not taken
+        return wrapper._prompt_encoder
+    with _PROMPT_ENCODER_LOAD_LOCK:
+        if wrapper._prompt_encoder is not None:  # re-check now that the lock is held
+            return wrapper._prompt_encoder
+        if wrapper.ming_config.audio_tokenizer_config is None:
+            raise RuntimeError("Ming Stage-0 requires audio_tokenizer_config to encode prompt audio.")
+
+        load_start = time.perf_counter()
+        encoder = AudioVAE(wrapper.ming_config.audio_tokenizer_config).eval()
+        state_dict = encoder.state_dict()  # destination tensors for the copy_ calls below
+        loaded = 0
+        loaded_encoder_params = set()
+        with torch.no_grad():
+            for shard_path in _iter_model_safetensors(
+                _resolve_model_to_local_path(str(wrapper.vllm_config.model_config.model))
+            ):
+                with safe_open(str(shard_path), framework="pt", device="cpu") as handle:
+                    for key in handle.keys():
+                        if not key.startswith("audio.encoder."):  # only encoder weights are read
+                            continue
+                        name = key[len("audio.") :]  # strip "audio." to match state_dict keys
+                        if name not in state_dict:
+                            continue
+                        target = state_dict[name]
+                        target.copy_(handle.get_tensor(key).to(device=target.device, dtype=target.dtype))
+                        loaded += 1
+                        loaded_encoder_params.add(name)
+        if loaded == 0:
+            raise RuntimeError("Ming prompt encoder received no audio.encoder.* weights from checkpoint.")
+
+        expected_encoder_params = {f"encoder.{name}" for name, _ in encoder.encoder.named_parameters()}
+        missing = expected_encoder_params - loaded_encoder_params  # encoder parameters not seen in the checkpoint
+        if missing:
+            raise RuntimeError(
+                f"Ming prompt encoder: {len(missing)} params not loaded. First few: {sorted(missing)[:5]}"
+            )
+
+        dev = next(wrapper.parameters()).device  # device of the wrapper's first parameter
+        try:
+            del encoder.decoder  # the decoder is dropped; only the encoder is kept
+            encoder.decoder = None
+            if dev.type != "cpu":
+                encoder.encoder.to(dev, dtype=getattr(wrapper.model, "fm_dtype", torch.bfloat16))
+            else:
+                encoder.encoder.to(dev)
+        except Exception as exc:
+            raise RuntimeError(f"Failed to move Ming prompt encoder to {dev}: {exc}") from exc
+        wrapper._prompt_encoder = encoder  # cache for subsequent calls
+        logger.info("Ming prompt encoder cold-loaded in %.3f ms", (time.perf_counter() - load_start) * 1000.0)
+        return encoder
+
+
+@torch.inference_mode()
+def _encode_prompt_waveform_to_latents(wrapper: Any, waveform: Any, waveform_length: Any = None) -> torch.Tensor:
+    """Encode a prompt waveform with the cached prompt encoder and return float32 latents.
+
+    ``waveform_length`` defaults to the padded waveform length for every batch row.
+    """
+    vae = _load_prompt_encoder(wrapper)
+    cfg = wrapper.ming_config
+    audio = _normalize_prompt_waveform(waveform, target_sr=cfg.sample_rate)
+    audio = pad_prompt_waveform(audio, patch_size=cfg.patch_size, sample_rate=cfg.sample_rate)
+
+    # Run on whatever device/dtype the encoder weights currently use.
+    ref_param = next(vae.encoder.parameters())
+    audio = audio.to(device=ref_param.device, dtype=ref_param.dtype)
+    if waveform_length is None:
+        lengths = torch.full((audio.shape[0],), audio.shape[-1], dtype=torch.int32, device=ref_param.device)
+    elif isinstance(waveform_length, torch.Tensor):
+        lengths = waveform_length.to(device=ref_param.device, dtype=torch.int32)
+    else:
+        lengths = torch.as_tensor(waveform_length, dtype=torch.int32, device=ref_param.device)
+
+    latents, _ = vae.encode_latent(audio, lengths)
+    if latents.ndim == 3 and latents.shape[0] == 1:
+        latents = latents.squeeze(0)
+    # Return value is unused; presumably called for its shape validation.
+    count_prompt_latent_patches(latents, patch_size=cfg.patch_size, latent_dim=cfg.latent_dim)
+    return latents.detach().to(dtype=torch.float32).contiguous()
+
+
+def _iter_model_safetensors(local_model_path: str) -> list[Path]:
+    """List checkpoint shards: from the index file, else model.safetensors, else any *.safetensors."""
+    root = Path(local_model_path)
+    index_file = root / "model.safetensors.index.json"
+    if index_file.exists():
+        weight_map = json.loads(index_file.read_text(encoding="utf-8")).get("weight_map", {})
+        shard_names = sorted(set(weight_map.values()))
+        if not shard_names:
+            raise RuntimeError(f"No checkpoint shards listed in {index_file}")
+        return [root / shard_name for shard_name in shard_names]
+
+    candidate = root / "model.safetensors"
+    if candidate.exists():
+        return [candidate]
+
+    shards = sorted(root.glob("*.safetensors"))
+    if shards:
+        return shards
+    raise RuntimeError(f"No .safetensors checkpoint found under {local_model_path}")
+
+
+__all__ = [  # Names starting with "_" are only picked up by star-imports when listed here.
+    "_encode_prompt_waveform_to_latents",
+    "_iter_model_safetensors",
+    "_load_prompt_encoder",
+    "_resolve_prompt_latents",
+]
diff --git a/vllm_omni/model_executor/models/ming_tts/prompt_utils.py b/vllm_omni/model_executor/models/ming_tts/prompt_utils.py
deleted file mode 100644
index 2ecf83e3521..00000000000
--- a/vllm_omni/model_executor/models/ming_tts/prompt_utils.py
+++ /dev/null
@@ -1,622 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
-import json
-import math
-import re
-import threading
-import time
-from io import BytesIO
-from pathlib import Path
-from typing import Any
-
-import torch
-from safetensors import safe_open
-from vllm.logger import init_logger
-
-from vllm_omni.engine.stage_init_utils import _resolve_model_to_local_path
-from vllm_omni.model_executor.models.ming_flash_omni.prompt_utils import (
-    DEFAULT_PROMPT,
-    create_instruction,
-)
-
-from .audio_tokenizer.modeling_audio_vae import AudioVAE
-from .config_ming_tts import (
-    AUDIO_FRAME_HOP,
-    KEY_MAX_DECODE_STEPS,
-    KEY_MIN_DECODE_STEPS,
-    KEY_PROMPT_LATENTS,
-    KEY_REQUEST_ID,
-    KEY_SPEAKER_EMBEDDING,
-    LATENT_DIM,
-    PATCH_SIZE,
-    SAMPLE_RATE,
-    VAE_PATCH_SIZE,
-    VISION_START_TOKEN_ID,
-    MingTTSConfig,
-)
-
-logger = init_logger(__name__)
-_PROMPT_ENCODER_LOAD_LOCK = threading.Lock()
-_DURATION_SECONDS_RE = re.compile(r"Duration:\s*([0-9]+(?:\.[0-9]+)?)\s*s\b", re.IGNORECASE)
-
-
-def parse_duration_seconds(text: str | None) -> float | None:
-    if not isinstance(text, str):
-        return None
-    match = _DURATION_SECONDS_RE.search(text)
-    if match is None:
-        return None
-    try:
-        value = float(match.group(1))
-    except ValueError:
-        return None
-    if value <= 0.0:
-        return None
-    return value
-
-
-def estimate_decode_steps_for_duration(
-    duration_seconds: float,
-    *,
-    sample_rate: int = SAMPLE_RATE,
-    frame_hop: int = AUDIO_FRAME_HOP,
-    patch_size: int = PATCH_SIZE,
-    vae_patch_size: int = VAE_PATCH_SIZE,
-) -> int:
-    if duration_seconds <= 0.0:
-        return 0
-    samples_per_decode_step = int(frame_hop) * int(patch_size) * int(vae_patch_size)
-    required_samples = float(duration_seconds) * float(sample_rate)
-    return max(1, int(math.ceil(required_samples / float(samples_per_decode_step))))
-
-
-def estimate_decode_step_window_for_duration(duration_seconds: float) -> tuple[int, int]:
-    target_steps = estimate_decode_steps_for_duration(duration_seconds)
-    min_steps = max(1, target_steps - 3)
-    max_steps = max(min_steps, target_steps + 3)
-    return min_steps, max_steps
-
-
-def pad_prompt_waveform(
-    waveform: Any,
-    *,
-    patch_size: int = PATCH_SIZE,
-    sample_rate: int = SAMPLE_RATE,
-) -> torch.Tensor:
-    tensor = coerce_prompt_waveform(waveform)
-    pad_align = int((float(sample_rate) / 12.5) * int(patch_size))
-    new_len = ((int(tensor.shape[-1]) + pad_align - 1) // pad_align) * pad_align
-    if new_len == int(tensor.shape[-1]):
-        return tensor
-    padded = torch.zeros((1, new_len), dtype=tensor.dtype, device=tensor.device)
-    padded[:, : tensor.shape[-1]] = tensor
-    return padded
-
-
-def coerce_prompt_waveform(value: Any) -> torch.Tensor:
-    if value is None:
-        raise ValueError("prompt waveform cannot be None")
-    if isinstance(value, torch.Tensor):
-        tensor = value.detach()
-        if tensor.ndim == 1:
-            return tensor.unsqueeze(0).to(torch.float32)
-        if tensor.ndim == 2:
-            if tensor.shape[0] != 1:
-                return tensor.reshape(1, -1).to(torch.float32)
-            return tensor.to(torch.float32)
-        raise ValueError(f"Unsupported Ming prompt waveform rank: {tuple(tensor.shape)}")
-    if isinstance(value, (list, tuple)):
-        parts = [coerce_prompt_waveform(item) for item in value if item is not None]
-        if not parts:
-            raise ValueError("prompt waveform list was empty")
-        return torch.cat(parts, dim=-1)
-    return coerce_prompt_waveform(torch.as_tensor(value))
-
-
-def coerce_speaker_embeddings(value: Any, *, use_zero_spk_emb: bool = False) -> list[torch.Tensor] | None:
-    if value is None:
-        return [torch.zeros((192,), dtype=torch.float32)] if use_zero_spk_emb else None
-    if isinstance(value, torch.Tensor):
-        tensor = value.detach()
-        if tensor.ndim == 1:
-            tensor = tensor.unsqueeze(0)
-        if tensor.ndim != 2:
-            raise ValueError(f"Unsupported Ming speaker embedding shape: {tuple(tensor.shape)}")
-        items = [row.reshape(-1).to(torch.float32).cpu() for row in tensor]
-    elif isinstance(value, (list, tuple)):
-        if value and all(not isinstance(item, (list, tuple, torch.Tensor)) for item in value):
-            items = [torch.as_tensor(value).detach().reshape(-1).to(torch.float32).cpu()]
-        else:
-            items = []
-            for item in value:
-                if item is None:
-                    continue
-                if not isinstance(item, torch.Tensor):
-                    item = torch.as_tensor(item)
-                items.append(item.detach().reshape(-1).to(torch.float32).cpu())
-    else:
-        return coerce_speaker_embeddings(torch.as_tensor(value), use_zero_spk_emb=use_zero_spk_emb)
-    if not items:
-        return [torch.zeros((192,), dtype=torch.float32)] if use_zero_spk_emb else None
-    for item in items:
-        if int(item.numel()) != 192:
-            raise ValueError(f"Ming speaker embedding must have 192 dims, got {int(item.numel())}")
-    return items
-
-
-def count_prompt_latent_patches(
-    value: Any,
-    *,
-    patch_size: int = PATCH_SIZE,
-    latent_dim: int = LATENT_DIM,
-) -> int:
-    if value is None:
-        return 0
-    if not isinstance(value, torch.Tensor):
-        value = torch.as_tensor(value)
-    latents = value.detach()
-    if latents.ndim == 3 and latents.shape[0] == 1:
-        latents = latents.squeeze(0)
-    if latents.ndim == 3 and latents.shape[-2:] == (patch_size, latent_dim):
-        return int(latents.shape[0])
-    if latents.ndim != 2 or latents.shape[-1] != latent_dim:
-        raise ValueError(f"Unsupported Ming prompt_latents shape: {tuple(latents.shape)}")
-    if latents.shape[0] % patch_size != 0:
-        raise ValueError(
-            f"Ming prompt_latents frame count must be divisible by patch_size={patch_size}, "
-            f"got frames={int(latents.shape[0])}"
-        )
-    return int(latents.shape[0] // patch_size)
-
-
-def count_prompt_waveform_patches(
-    value: Any,
-    *,
-    patch_size: int = PATCH_SIZE,
-    frame_hop: int = AUDIO_FRAME_HOP,
-    vae_patch_size: int = VAE_PATCH_SIZE,
-) -> int:
-    if value is None:
-        return 0
-    waveform = pad_prompt_waveform(value, patch_size=patch_size)
-    frame_count = int(math.ceil(float(waveform.shape[-1]) / float(frame_hop)))
-    latent_frames = int(math.ceil(float(frame_count) / float(vae_patch_size)))
-    if latent_frames % int(patch_size) != 0:
-        raise ValueError(
-            f"Ming prompt waveform produced latent frame count not divisible by patch_size={patch_size}: "
-            f"frames={latent_frames}"
-        )
-    return int(latent_frames // int(patch_size))
-
-
-def resolve_effective_runtime_controls(
-    *,
-    text: str,
-    runtime_controls: dict[str, Any] | None = None,
-) -> dict[str, Any]:
-    controls = {} if runtime_controls is None else dict(runtime_controls)
-    has_explicit_min = KEY_MIN_DECODE_STEPS in controls and controls[KEY_MIN_DECODE_STEPS] is not None
-    has_explicit_max = KEY_MAX_DECODE_STEPS in controls and controls[KEY_MAX_DECODE_STEPS] is not None
-    if has_explicit_min or has_explicit_max:
-        return controls
-    duration_seconds = parse_duration_seconds(text)
-    if duration_seconds is None:
-        return controls
-    min_decode_steps, max_decode_steps = estimate_decode_step_window_for_duration(duration_seconds)
-    controls[KEY_MIN_DECODE_STEPS] = min_decode_steps
-    controls[KEY_MAX_DECODE_STEPS] = max_decode_steps
-    return controls
-
-
-def build_dense_prompt_token_ids(
-    tokenizer: Any,
-    *,
-    prompt: str,
-    text: str,
-    instruction: str | None = None,
-    prompt_text: str | None = None,
-    speaker_count: int = 0,
-    prompt_patch_count: int = 0,
-) -> list[int]:
-    speaker_prompt = []
-    for idx in range(int(speaker_count)):
-        speaker_prompt.extend(
-            tokenizer.encode(f"  speaker_{idx + 1}:")
-            + tokenizer.encode("<|vision_start|>")
-            + tokenizer.encode("<|vision_pad|>")
-            + tokenizer.encode("<|vision_end|>\n")
-        )
-    instruction_prompt = (
-        tokenizer.encode(instruction) + tokenizer.encode("<|endoftext|>") if instruction is not None else []
-    )
-    prompt_text_tokens = (
-        tokenizer.encode(prompt_text) if int(prompt_patch_count) > 0 and prompt_text is not None else []
-    )
-    audio_patch_token_id = tokenizer.convert_tokens_to_ids("<audioPatch>")
-    if audio_patch_token_id == tokenizer.unk_token_id:
-        raise ValueError("Ming tokenizer is missing required <audioPatch> token.")
-    prompt_latent_tokens = [audio_patch_token_id] * int(prompt_patch_count)
-    text_input_prefix = (
-        []
-        if all(token in text for token in ("Genre: ", "Mood: ", "Instrument: ", "Theme: ", "Duration: "))
-        else tokenizer.encode(" Text input:\n")
-    )
-    return (
-        tokenizer.encode("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n")
-        + tokenizer.encode("<|im_start|>user\n")
-        + tokenizer.encode(prompt)
-        + speaker_prompt
-        + text_input_prefix
-        + prompt_text_tokens
-        + tokenizer.encode(text)
-        + tokenizer.encode("<|im_end|>\n")
-        + tokenizer.encode("<|im_start|>assistant\n")
-        + instruction_prompt
-        + tokenizer.encode("<audio>")
-        + prompt_latent_tokens
-    )
-
-
-def build_ming_dense_prompt(
-    tokenizer: Any,
-    *,
-    prompt: str,
-    text: str,
-    runtime_controls: dict[str, Any] | None = None,
-    instruction: Any = None,
-    prompt_text: str | None = None,
-    prompt_waveform: Any = None,
-    prompt_latents: Any = None,
-    speaker_embedding: Any = None,
-    use_zero_spk_emb: bool = False,
-    request_id: str | None = None,
-) -> dict[str, Any]:
-    instruction_text = create_instruction(instruction)
-    speaker_embeddings = coerce_speaker_embeddings(speaker_embedding, use_zero_spk_emb=use_zero_spk_emb)
-    effective_runtime_controls = resolve_effective_runtime_controls(text=text, runtime_controls=runtime_controls)
-
-    prompt_waveform_tensor = None
-    prompt_patch_count = 0
-    if prompt_waveform is not None:
-        prompt_waveform_tensor = pad_prompt_waveform(prompt_waveform)
-        prompt_patch_count = count_prompt_waveform_patches(prompt_waveform_tensor)
-    if prompt_waveform_tensor is not None and prompt_latents is not None:
-        raise ValueError(
-            "Ming waveform cloning request provided both raw prompt_waveform and explicit prompt_latents. "
-            "Choose exactly one source of truth."
-        )
-
-    prompt_latent_value = None
-    if prompt_waveform_tensor is not None and prompt_text is None:
-        raise ValueError(
-            "Ming prompt_waveform requires prompt_text for prompt-latent conditioning. "
-            "Use speaker_embedding for reference-audio-only speaker conditioning."
-        )
-    if prompt_latents is not None:
-        prompt_latent_value = torch.as_tensor(prompt_latents)
-        prompt_patch_count = count_prompt_latent_patches(
-            prompt_latent_value, patch_size=PATCH_SIZE, latent_dim=LATENT_DIM
-        )
-
-    prompt_token_ids = build_dense_prompt_token_ids(
-        tokenizer,
-        prompt=prompt,
-        text=text,
-        instruction=instruction_text,
-        prompt_text=prompt_text if prompt_patch_count > 0 else None,
-        speaker_count=0 if speaker_embeddings is None else len(speaker_embeddings),
-        prompt_patch_count=prompt_patch_count,
-    )
-
-    additional_information = {}
-    for key, value in effective_runtime_controls.items():
-        if isinstance(value, torch.Tensor):
-            additional_information[key] = value
-        elif key in (KEY_MIN_DECODE_STEPS, KEY_MAX_DECODE_STEPS):
-            additional_information[key] = torch.tensor(int(value), dtype=torch.int32)
-        else:
-            additional_information[key] = torch.tensor(float(value), dtype=torch.float32)
-    if request_id is not None:
-        additional_information[KEY_REQUEST_ID] = request_id
-    if instruction_text is not None:
-        additional_information["instruction"] = instruction_text
-    if prompt_text is not None:
-        additional_information["prompt_text"] = prompt_text
-    if prompt_waveform_tensor is not None:
-        additional_information["prompt_waveform"] = prompt_waveform_tensor
-        additional_information["prompt_waveform_length"] = torch.tensor(
-            [int(prompt_waveform_tensor.shape[-1])], dtype=torch.int32
-        )
-    if prompt_latent_value is not None:
-        additional_information[KEY_PROMPT_LATENTS] = prompt_latent_value
-    if speaker_embeddings is not None:
-        additional_information[KEY_SPEAKER_EMBEDDING] = (
-            speaker_embeddings[0] if len(speaker_embeddings) == 1 else torch.stack(speaker_embeddings, dim=0)
-        )
-    if use_zero_spk_emb:
-        additional_information["use_zero_spk_emb"] = True
-    return {
-        "prompt": prompt,
-        "text": text,
-        "prompt_token_ids": prompt_token_ids,
-        "additional_information": additional_information,
-    }
-
-
-def _resolve_prompt_latents(wrapper: Any, info_dict: dict[str, Any]) -> dict[str, torch.Tensor] | None:
-    raw_latents = info_dict.get(KEY_PROMPT_LATENTS, info_dict.get("prompt_latents"))
-    raw_waveform = info_dict.get("prompt_waveform", info_dict.get("prompt_waveforms"))
-    if raw_latents is not None and raw_waveform is not None:
-        raise ValueError(
-            "Ming waveform cloning request provided both raw prompt_waveform and explicit prompt_latents. "
-            "Choose exactly one source of truth."
-        )
-
-    direct_latents = _coerce_prompt_latents(
-        raw_latents,
-        patch_size=wrapper.ming_config.patch_size,
-        latent_dim=wrapper.ming_config.latent_dim,
-    )
-    if direct_latents is not None:
-        return direct_latents
-    if raw_waveform is None:
-        return None
-
-    encode_fn = getattr(wrapper, "_encode_prompt_waveform_to_latents", None)
-    if callable(encode_fn):
-        latents = encode_fn(raw_waveform, info_dict.get("prompt_waveform_length"))
-    else:
-        latents = _encode_prompt_waveform_to_latents(
-            wrapper,
-            raw_waveform,
-            info_dict.get("prompt_waveform_length"),
-        )
-    return _coerce_prompt_latents(
-        latents,
-        patch_size=wrapper.ming_config.patch_size,
-        latent_dim=wrapper.ming_config.latent_dim,
-    )
-
-
-def _load_prompt_encoder(wrapper: Any) -> AudioVAE:
-    if wrapper._prompt_encoder is not None:
-        return wrapper._prompt_encoder
-    with _PROMPT_ENCODER_LOAD_LOCK:
-        if wrapper._prompt_encoder is not None:
-            return wrapper._prompt_encoder
-        if wrapper.ming_config.audio_tokenizer_config is None:
-            raise RuntimeError("Ming Stage-0 requires audio_tokenizer_config to encode prompt audio.")
-
-        load_start = time.perf_counter()
-        encoder = AudioVAE(wrapper.ming_config.audio_tokenizer_config).eval()
-        state_dict = encoder.state_dict()
-        loaded = 0
-        loaded_encoder_params = set()
-        with torch.no_grad():
-            for shard_path in _iter_model_safetensors(
-                _resolve_model_to_local_path(str(wrapper.vllm_config.model_config.model))
-            ):
-                with safe_open(str(shard_path), framework="pt", device="cpu") as handle:
-                    for key in handle.keys():
-                        if not key.startswith("audio.encoder."):
-                            continue
-                        name = key[len("audio.") :]
-                        if name not in state_dict:
-                            continue
-                        target = state_dict[name]
-                        target.copy_(handle.get_tensor(key).to(device=target.device, dtype=target.dtype))
-                        loaded += 1
-                        loaded_encoder_params.add(name)
-        if loaded == 0:
-            raise RuntimeError("Ming prompt encoder received no audio.encoder.* weights from checkpoint.")
-
-        expected_encoder_params = {f"encoder.{name}" for name, _ in encoder.encoder.named_parameters()}
-        missing = expected_encoder_params - loaded_encoder_params
-        if missing:
-            raise RuntimeError(
-                f"Ming prompt encoder: {len(missing)} params not loaded. First few: {sorted(missing)[:5]}"
-            )
-
-        dev = next(wrapper.parameters()).device
-        try:
-            del encoder.decoder
-            encoder.decoder = None
-            if dev.type != "cpu":
-                encoder.encoder.to(dev, dtype=getattr(wrapper.model, "fm_dtype", torch.bfloat16))
-            else:
-                encoder.encoder.to(dev)
-        except Exception as exc:
-            raise RuntimeError(f"Failed to move Ming prompt encoder to {dev}: {exc}") from exc
-        wrapper._prompt_encoder = encoder
-        logger.info("Ming prompt encoder cold-loaded in %.3f ms", (time.perf_counter() - load_start) * 1000.0)
-        return encoder
-
-
-@torch.inference_mode()
-def _encode_prompt_waveform_to_latents(wrapper: Any, waveform: Any, waveform_length: Any = None) -> torch.Tensor:
-    encoder = _load_prompt_encoder(wrapper)
-    waveform = _normalize_prompt_waveform(waveform, target_sr=wrapper.ming_config.sample_rate)
-    waveform = pad_prompt_waveform(
-        waveform,
-        patch_size=wrapper.ming_config.patch_size,
-        sample_rate=wrapper.ming_config.sample_rate,
-    )
-    dev = next(encoder.encoder.parameters()).device
-    waveform = waveform.to(device=dev, dtype=next(encoder.encoder.parameters()).dtype)
-    if waveform_length is None:
-        waveform_length = torch.full((waveform.shape[0],), waveform.shape[-1], dtype=torch.int32, device=dev)
-    elif not isinstance(waveform_length, torch.Tensor):
-        waveform_length = torch.as_tensor(waveform_length, dtype=torch.int32, device=dev)
-    else:
-        waveform_length = waveform_length.to(device=dev, dtype=torch.int32)
-
-    latents, _ = encoder.encode_latent(waveform, waveform_length)
-    if latents.ndim == 3 and latents.shape[0] == 1:
-        latents = latents.squeeze(0)
-    count_prompt_latent_patches(
-        latents,
-        patch_size=wrapper.ming_config.patch_size,
-        latent_dim=wrapper.ming_config.latent_dim,
-    )
-    return latents.detach().to(dtype=torch.float32).contiguous()
-
-
-def _iter_model_safetensors(local_model_path: str) -> list[Path]:
-    model_root = Path(local_model_path)
-    index_path = model_root / "model.safetensors.index.json"
-    if index_path.exists():
-        with index_path.open("r", encoding="utf-8") as handle:
-            index_data = json.load(handle)
-        filenames = sorted(set(index_data.get("weight_map", {}).values()))
-        if not filenames:
-            raise RuntimeError(f"No checkpoint shards listed in {index_path}")
-        return [model_root / filename for filename in filenames]
-
-    single_file = model_root / "model.safetensors"
-    if single_file.exists():
-        return [single_file]
-
-    files = sorted(model_root.glob("*.safetensors"))
-    if not files:
-        raise RuntimeError(f"No .safetensors checkpoint found under {local_model_path}")
-    return files
-
-
-def _normalize_prompt_waveform(value: Any, *, target_sr: int) -> torch.Tensor:
-    if isinstance(value, bytes):
-        import torchaudio
-
-        waveform, sr = torchaudio.load(BytesIO(value))
-        waveform = waveform[:1].to(torch.float32)
-        if int(sr) != int(target_sr):
-            from torchaudio.functional import resample as resample_audio
-
-            waveform = resample_audio(waveform, int(sr), int(target_sr))
-        return waveform
-
-    if isinstance(value, tuple) and len(value) == 2 and isinstance(value[1], int):
-        waveform = coerce_prompt_waveform(value[0])
-        if int(value[1]) != int(target_sr):
-            from torchaudio.functional import resample as resample_audio
-
-            waveform = resample_audio(waveform, int(value[1]), int(target_sr))
-        return waveform
-
-    if isinstance(value, dict):
-        samples = value.get("samples", value.get("array", value.get("waveform")))
-        sr = value.get("sample_rate", value.get("sr", target_sr))
-        return _normalize_prompt_waveform((samples, int(sr)), target_sr=target_sr)
-
-    return coerce_prompt_waveform(value)
-
-
-def _coerce_prompt_latents(
-    value: Any,
-    *,
-    patch_size: int,
-    latent_dim: int,
-) -> dict[str, torch.Tensor] | None:
-    if value is None:
-        return None
-    if not isinstance(value, torch.Tensor):
-        value = torch.as_tensor(value)
-
-    latents = value.detach()
-    if latents.ndim == 3 and latents.shape[0] == 1:
-        latents = latents.squeeze(0)
-
-    if latents.ndim == 3 and latents.shape[-2:] == (patch_size, latent_dim):
-        patches = latents
-        # [Patch, Time, Dimension] -> [Frame, Dimension] for history seeding.
-        frames = patches.reshape(-1, latent_dim)
-        return {"patches": patches, "frames": frames}
-
-    if latents.ndim != 2 or latents.shape[-1] != latent_dim:
-        raise ValueError(f"Unsupported prompt latent shape: {tuple(latents.shape)}")
-    if latents.shape[0] % patch_size != 0:
-        raise ValueError(
-            f"Prompt latent frame count must be divisible by patch_size={patch_size}, "
-            f"got frames={int(latents.shape[0])}"
-        )
-    # [Frame, Dimension] -> [Patch, Time, Dimension] for Aggregator prompt slots.
-    patches = latents.reshape(-1, patch_size, latent_dim) if latents.shape[0] > 0 else None
-    return {"patches": patches, "frames": latents}
-
-
-def _initial_history(
-    frames: torch.Tensor | None,
-    *,
-    history_size: int,
-    latent_dim: int,
-    device: torch.device,
-    dtype: torch.dtype,
-) -> torch.Tensor:
-    history = torch.zeros((history_size, latent_dim), device=device, dtype=dtype)
-    if frames is None or frames.numel() == 0:
-        return history
-    frames = frames.to(device=device, dtype=dtype)
-    take = min(history_size, int(frames.shape[0]))
-    history[-take:] = frames[-take:]
-    return history
-
-
-def _take_scalar(value: Any, idx: int) -> float | None:
-    if not isinstance(value, torch.Tensor) or value.numel() == 0:
-        return None
-    return float(value.reshape(-1)[idx].item())
-
-
-def _find_audio_placeholder_positions(input_ids: torch.Tensor, cfg: MingTTSConfig) -> torch.Tensor:
-    dummy_pos = (input_ids == cfg.audio_dummy_token_id).nonzero(as_tuple=True)[0]
-    if dummy_pos.numel() == 0:
-        return dummy_pos
-
-    audio_start_pos = (input_ids == cfg.audio_start_token_id).nonzero(as_tuple=True)[0]
-    audio_end_pos = (input_ids == cfg.audio_end_token_id).nonzero(as_tuple=True)[0]
-    if audio_start_pos.numel() == 0:
-        return dummy_pos
-
-    start = int(audio_start_pos[0].item())
-    end = int(audio_end_pos[0].item()) if audio_end_pos.numel() > 0 else int(input_ids.shape[0])
-    keep = (dummy_pos > start) & (dummy_pos < end)
-    filtered = dummy_pos[keep]
-    return filtered if filtered.numel() > 0 else dummy_pos
-
-
-def _find_speaker_placeholder_positions(input_ids: torch.Tensor, hf_config: Any) -> list[int]:
-    vision_start_token_id = getattr(hf_config, "vision_start_token_id", VISION_START_TOKEN_ID)
-    vision_start_pos = (input_ids == int(vision_start_token_id)).nonzero(as_tuple=True)[0]
-    if vision_start_pos.numel() == 0:
-        return []
-
-    slots = []
-    for pos in vision_start_pos:
-        slot = int(pos.item()) + 1
-        if slot < int(input_ids.shape[0]):
-            slots.append(slot)
-    return slots
-
-
-__all__ = [
-    "DEFAULT_PROMPT",
-    "build_dense_prompt_token_ids",
-    "build_ming_dense_prompt",
-    "coerce_prompt_waveform",
-    "coerce_speaker_embeddings",
-    "count_prompt_latent_patches",
-    "count_prompt_waveform_patches",
-    "create_instruction",
-    "estimate_decode_step_window_for_duration",
-    "estimate_decode_steps_for_duration",
-    "pad_prompt_waveform",
-    "parse_duration_seconds",
-    "resolve_effective_runtime_controls",
-    "_coerce_prompt_latents",
-    "_find_audio_placeholder_positions",
-    "_find_speaker_placeholder_positions",
-    "_initial_history",
-    "_resolve_prompt_latents",
-    "_take_scalar",
-]

From b9ea555202f2b0d23874c02c31b30dbf95b4baf2 Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Sat, 30 May 2026 23:04:26 +0530
Subject: [PATCH 50/54] Refactor Ming shared AudioVAE and CFM utilities

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 .../ming_tts/test_ming_shared_modules.py      |  90 +++++
 .../entrypoints/openai/serving_speech.py      |  40 +--
 .../models/ming_flash_omni/audio_vae.py       | 243 --------------
 .../ming_flash_omni/ming_flash_omni_talker.py |   2 +-
 .../models/ming_flash_omni/talker_module.py   |  15 +-
 .../models/ming_flash_omni/voice_presets.py   |   3 +-
 .../configuration_audio_vae.py                |  40 ---
 .../audio_tokenizer/modeling_audio_vae.py     | 172 ----------
 .../ming_tts/audio_tokenizer/vae_modules.py   | 129 -------
 .../models/ming_tts/config_ming_tts.py        |   7 +-
 .../model_executor/models/ming_tts/fm/cfm.py  |  53 +--
 .../model_executor/models/ming_tts/fm/dit.py  |  37 +-
 .../models/ming_tts/ming_tts_audio_vae.py     |   2 +-
 .../models/ming_tts/prompt_encoder.py         |   2 +-
 .../models/ming_tts/validation.py             |  13 +-
 .../models/ming_utils/audio_vae.py            | 316 ++++++++++++++++++
 .../model_executor/models/ming_utils/fm.py    |  68 ++++
 17 files changed, 527 insertions(+), 705 deletions(-)
 create mode 100644 tests/model_executor/models/ming_tts/test_ming_shared_modules.py
 delete mode 100644 vllm_omni/model_executor/models/ming_flash_omni/audio_vae.py
 delete mode 100644 vllm_omni/model_executor/models/ming_tts/audio_tokenizer/configuration_audio_vae.py
 delete mode 100644 vllm_omni/model_executor/models/ming_tts/audio_tokenizer/modeling_audio_vae.py
 delete mode 100644 vllm_omni/model_executor/models/ming_tts/audio_tokenizer/vae_modules.py
 create mode 100644 vllm_omni/model_executor/models/ming_utils/fm.py

diff --git a/tests/model_executor/models/ming_tts/test_ming_shared_modules.py b/tests/model_executor/models/ming_tts/test_ming_shared_modules.py
new file mode 100644
index 00000000000..828b9156564
--- /dev/null
+++ b/tests/model_executor/models/ming_tts/test_ming_shared_modules.py
@@ -0,0 +1,90 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from types import SimpleNamespace
+
+import pytest
+
+from vllm_omni.entrypoints.openai.serving_speech import OmniOpenAIServingSpeech
+from vllm_omni.model_executor.models.ming_tts.constants import (
+    AGGREGATOR_HIDDEN_SIZE,
+    HISTORY_PATCH_SIZE,
+    LATENT_DIM,
+    LLM_HIDDEN_SIZE,
+    LLM_VOCAB_SIZE,
+    PATCH_SIZE,
+    SAMPLE_RATE,
+    VAE_PATCH_SIZE,
+)
+from vllm_omni.model_executor.models.ming_tts.fm.cfm import Solver as MingTTSSolver
+from vllm_omni.model_executor.models.ming_tts.validation import validate_ming_tts_config
+from vllm_omni.model_executor.models.ming_utils.audio_vae import AudioVAEConfig
+from vllm_omni.model_executor.models.ming_utils.fm import Solver
+
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu, pytest.mark.tts]
+
+
+def test_ming_tts_audio_vae_uses_common_config():
+    """AudioVAEConfig is shared by Ming dense and Ming flash modules."""
+    cfg = AudioVAEConfig(sample_rate=16000, patch_size=-1)
+
+    assert cfg.sample_rate == 16000
+    assert cfg.patch_size == -1
+
+
+def test_ming_tts_cfm_solver_uses_common_implementation():
+    """Ming dense CFM imports the shared solver implementation."""
+    assert MingTTSSolver is Solver
+
+
+def test_ming_dense_validation_rejects_semantic_audio_vae_config():
+    """Dense 0.5B validation rejects semantic AudioVAE configs."""
+    cfg = SimpleNamespace(
+        audio_dummy_token_id=151705,
+        audio_eos_token_id=151704,
+        text_eos_token_id=151669,
+        audio_tokenizer_config=AudioVAEConfig(
+            sample_rate=SAMPLE_RATE,
+            patch_size=VAE_PATCH_SIZE,
+            semantic_module_kwargs={"whisper_encoder": {}},
+            enc_kwargs={"latent_dim": LATENT_DIM, "input_dim": 882, "hop_size": 882},
+            dec_kwargs={"latent_dim": LATENT_DIM, "output_dim": 882},
+        ),
+        latent_dim=LATENT_DIM,
+        patch_size=PATCH_SIZE,
+        history_patch_size=HISTORY_PATCH_SIZE,
+        llm_hidden_size=LLM_HIDDEN_SIZE,
+        llm_vocab_size=LLM_VOCAB_SIZE,
+        sample_rate=SAMPLE_RATE,
+        vae_patch_size=VAE_PATCH_SIZE,
+        llm_config={"hidden_size": LLM_HIDDEN_SIZE},
+        aggregator_config={"hidden_size": AGGREGATOR_HIDDEN_SIZE},
+        ditar_config={"hidden_size": AGGREGATOR_HIDDEN_SIZE},
+        latent_chunk_size=1,
+        latent_left_context=0,
+        max_decode_steps=1,
+        stop_head_threshold=0.5,
+        stop_head_min_steps=0,
+    )
+
+    with pytest.raises(ValueError, match="semantic_module_kwargs"):
+        validate_ming_tts_config(cfg)
+
+
+def test_ming_instruction_parser_preserves_dense_and_flash_defaults():
+    """Ming dense and Ming flash keep distinct instruction defaults."""
+    serving = object.__new__(OmniOpenAIServingSpeech)
+    serving.uploaded_speakers = {"uploaded": {}}
+
+    dense_plain = serving._parse_ming_instruction(SimpleNamespace(instructions="calm", language=None, voice=None))
+    assert dense_plain == "calm"
+
+    dense_with_fields = serving._parse_ming_instruction(
+        SimpleNamespace(instructions="calm", language="Auto", voice="灵小甄")
+    )
+    assert dense_with_fields == {"IP": "灵小甄", "风格": "calm"}
+
+    flash_fields = serving._parse_ming_instruction_fields(
+        SimpleNamespace(instructions="calm", language="粤语", voice="灵小甄")
+    )
+    assert flash_fields == {"风格": "calm"}
diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py
index 777eb878e02..c812acc7612 100644
--- a/vllm_omni/entrypoints/openai/serving_speech.py
+++ b/vllm_omni/entrypoints/openai/serving_speech.py
@@ -1894,16 +1894,22 @@ def _extract_ming_speaker_embeddings_from_ref_audio(
             embeddings.append(flat.tolist())
         return embeddings
 
-    def _parse_ming_instruction(self, request: OpenAICreateSpeechRequest) -> Any:
-        """Build a Ming instruction payload from OpenAI speech fields."""
+    def _parse_ming_instruction_fields(
+        self,
+        request,
+        *,
+        include_language=False,
+        include_voice=False,
+        plain_text_passthrough=False,
+    ):
         instruction_text = request.instructions.strip() if isinstance(request.instructions, str) else None
         instruction_dict: dict[str, Any] = {}
 
-        if request.language not in (None, "", "Auto"):
+        if include_language and request.language not in (None, "", "Auto"):
             instruction_dict["方言"] = request.language
 
         voice_lower = request.voice.lower() if isinstance(request.voice, str) else None
-        if request.voice and not (voice_lower and voice_lower in self.uploaded_speakers):
+        if include_voice and request.voice and not (voice_lower and voice_lower in self.uploaded_speakers):
             instruction_dict["IP"] = request.voice
 
         if instruction_text:
@@ -1913,13 +1919,22 @@ def _parse_ming_instruction(self, request: OpenAICreateSpeechRequest) -> Any:
                 parsed = None
             if isinstance(parsed, dict):
                 instruction_dict.update(parsed)
-            elif instruction_dict:
+            elif instruction_dict or not plain_text_passthrough:
                 instruction_dict["风格"] = instruction_text
             else:
                 return instruction_text
 
         return instruction_dict or None
 
+    def _parse_ming_instruction(self, request: OpenAICreateSpeechRequest) -> Any:
+        """Build a Ming instruction payload from OpenAI speech fields."""
+        return self._parse_ming_instruction_fields(
+            request,
+            include_language=True,
+            include_voice=True,
+            plain_text_passthrough=True,
+        )
+
     def _build_ming_dense_prompt(
         self,
         request: OpenAICreateSpeechRequest,
@@ -2541,20 +2556,7 @@ def _build_ming_flash_omni_prompt(self, request: OpenAICreateSpeechRequest) -> d
         # 1. Plain text: mapped to the caption's 风格 (style) field
         # 2. JSON object: parsed and splatted into the caption. Unlocks
         #       Unknown keys are dropped by `ming_create_instruction`.
-        caption_fields: dict[str, Any] = {}
-        if request.instructions:
-            stripped = request.instructions.strip()
-            if stripped.startswith("{"):
-                try:
-                    parsed = json.loads(stripped)
-                except json.JSONDecodeError:
-                    parsed = None
-                if isinstance(parsed, dict):
-                    caption_fields.update(parsed)
-                else:
-                    caption_fields["风格"] = request.instructions
-            else:
-                caption_fields["风格"] = request.instructions
+        caption_fields = self._parse_ming_instruction_fields(request) or {}
 
         has_spk_emb = request.speaker_embedding is not None
 
diff --git a/vllm_omni/model_executor/models/ming_flash_omni/audio_vae.py b/vllm_omni/model_executor/models/ming_flash_omni/audio_vae.py
deleted file mode 100644
index 7ef3b5050ae..00000000000
--- a/vllm_omni/model_executor/models/ming_flash_omni/audio_vae.py
+++ /dev/null
@@ -1,243 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# Copyright 2025 The vLLM-Omni team.
-# Copyright (c) Ant Group. All rights reserved.
-# Adapted from:
-# https://github.com/inclusionAI/Ming/tree/e58533db227031990c5a6864dcf5f08fb53ed0d2/AudioVAE
-
-from __future__ import annotations
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from transformers import PretrainedConfig, PreTrainedModel, Qwen2Config, Qwen2Model
-from transformers.utils import is_flash_attn_2_available
-from vllm.logger import init_logger
-
-from vllm_omni.model_executor.models.ming_utils.audio_dsp import ISTFTHead
-from vllm_omni.model_executor.models.ming_utils.audio_vae import StreamingLinearUpsample
-
-logger = init_logger(__name__)
-
-
-class AudioVAEConfig(PretrainedConfig):
-    model_type = "audio_vae"
-
-    def __init__(
-        self,
-        sample_rate: int = 44100,
-        enc_kwargs: dict | None = None,
-        dec_kwargs: dict | None = None,
-        init_method: str = "kaiming",
-        patch_size: int = 4,
-        **kwargs,
-    ):
-        self.sample_rate = sample_rate
-        self.enc_kwargs = enc_kwargs or {}
-        self.dec_kwargs = dec_kwargs or {}
-        self.init_method = init_method
-        self.patch_size = patch_size
-        super().__init__(**kwargs)
-
-
-class Decoder(nn.Module):
-    def __init__(self, decoder_args, output_dim=320, latent_dim=64, patch_size=-1):
-        super().__init__()
-        config = Qwen2Config.from_dict(config_dict=decoder_args)
-        if is_flash_attn_2_available():
-            config._attn_implementation_autoset = True
-            config._attn_implementation = "flash_attention_2"
-        else:
-            config._attn_implementation = "sdpa"
-
-        logger.info("AudioVAE Decoder: using attn_implementation=%r", config._attn_implementation)
-        self.decoder = Qwen2Model(config)
-        self.output_dim = output_dim
-        self.latent_dim = latent_dim
-        self.fc1 = nn.Linear(latent_dim, config.hidden_size)
-        self.hop_length = output_dim
-        self.head = ISTFTHead(
-            dim=config.hidden_size, n_fft=self.hop_length * 4, hop_length=self.hop_length, padding="same"
-        )
-        self.patch_size = patch_size
-        if self.patch_size != -1:
-            self.upsampling = StreamingLinearUpsample(scale_factor=patch_size)
-
-    def low_level_reconstruct(self, x, past_key_values=None, use_cache=False, stream_state=None, last_chunk=False):
-        upsample_state, audio_buffer, window_buffer = stream_state
-        bsz, device, dtype = x.size(0), x.device, x.dtype
-        x = self.fc1(x)
-        if self.patch_size != -1:
-            if use_cache:
-                x, upsample_state = self.upsampling(x, state=upsample_state, is_last=last_chunk)
-                if x is None:
-                    stream_state = (upsample_state, audio_buffer, window_buffer)
-                    return torch.empty(bsz, 1, 0, device=device, dtype=dtype), stream_state, past_key_values
-            else:
-                x = self.upsampling.upsampler(x.transpose(1, 2)).transpose(1, 2)
-
-        hidden_states_list = []
-
-        if use_cache and getattr(self.decoder.config, "sliding_window", None) is not None:
-            sw_size = self.decoder.config.sliding_window
-            target_len = sw_size - 1
-            if past_key_values is None:
-                past_len = 0
-            elif hasattr(past_key_values, "get_seq_length"):
-                past_len = past_key_values.get_seq_length()
-            elif isinstance(past_key_values, tuple) and len(past_key_values) > 0:
-                past_len = past_key_values[0][0].shape[-2]
-            else:
-                past_len = 0
-
-            curr_len = x.shape[1]
-
-            if past_len < target_len and (past_len + curr_len) >= sw_size:
-                fill_len = target_len - past_len
-                x_fill = x[:, :fill_len, :]
-                outputs = self.decoder(inputs_embeds=x_fill, past_key_values=past_key_values, use_cache=use_cache)
-                hidden_states_list.append(outputs.last_hidden_state)
-                past_key_values = outputs.past_key_values
-                x = x[:, fill_len:, :]
-
-        outputs = self.decoder(inputs_embeds=x, past_key_values=past_key_values, use_cache=use_cache)
-        hidden_states_list.append(outputs.last_hidden_state)
-        past_key_values = outputs.past_key_values
-
-        if len(hidden_states_list) > 1:
-            full_hidden_state = torch.cat(hidden_states_list, dim=1)
-        else:
-            full_hidden_state = hidden_states_list[0]
-
-        x_out, _, audio_buffer, window_buffer = self.head(
-            full_hidden_state,
-            streaming=use_cache,
-            audio_buffer=audio_buffer,
-            window_buffer=window_buffer,
-            last_chunk=last_chunk,
-        )
-
-        stream_state = (upsample_state, audio_buffer, window_buffer)
-        return x_out, stream_state, past_key_values
-
-
-class Encoder(nn.Module):
-    def __init__(self, encoder_args, input_dim=320, hop_size=320, latent_dim=64, patch_size=-1):
-        super().__init__()
-        config = Qwen2Config.from_dict(config_dict=encoder_args)
-        if is_flash_attn_2_available():
-            config._attn_implementation_autoset = True
-            config._attn_implementation = "flash_attention_2"
-        else:
-            config._attn_implementation = "sdpa"
-
-        logger.info("AudioVAE Encoder: using attn_implementation=%r", config._attn_implementation)
-        self.encoder = Qwen2Model(config)
-        self.input_dim = input_dim
-        self.hop_size = hop_size
-        self.latent_dim = latent_dim
-        self.fc1 = nn.Linear(input_dim, config.hidden_size, bias=False)
-        self.fc2 = nn.Linear(config.hidden_size, config.hidden_size)
-        self.fc3 = nn.Linear(config.hidden_size, latent_dim * 2)
-        self.norm = nn.LayerNorm(config.hidden_size)
-        self.patch_size = patch_size
-        if patch_size != -1:
-            config.num_hidden_layers = 4
-            self.aggregator = Qwen2Model(config)
-            self.cls_embed = nn.Parameter(torch.rand(1, 1, config.hidden_size))
-            self.cls_embed.data.normal_(0, 0.02)
-
-    def get_frames(self, x):
-        num_frames_total = (x.size(-1) + self.hop_size - 1) // self.hop_size
-        expected_len = (num_frames_total - 1) * self.hop_size + self.input_dim
-        padding_needed = expected_len - x.size(-1)
-        waveform = F.pad(x, (0, padding_needed), value=0.0)
-        frames = waveform.unfold(dimension=-1, size=self.input_dim, step=self.hop_size)
-        return frames
-
-    def pad_patch_insert_cls(self, x):
-        bsz, _, dim = x.size()
-        num_frame = x.size(1)
-        r = num_frame % self.patch_size
-        pad_num = self.patch_size - r if r else 0
-        x = F.pad(x, (0, 0, 0, pad_num), value=0.0)
-        x = x.reshape(-1, self.patch_size, dim)
-        x = torch.cat((x, self.cls_embed.expand(x.size(0), -1, -1)), dim=1)
-        x = x.reshape(bsz, -1, dim)
-        return x
-
-    def forward(self, waveform):
-        x = self.get_frames(waveform)
-        x = self.fc1(x)
-        x = self.fc2(x)
-        x = self.encoder(inputs_embeds=x)
-        x = x.last_hidden_state
-
-        if self.patch_size != -1:
-            x = self.pad_patch_insert_cls(x)
-            x = self.aggregator(inputs_embeds=x)
-            x = x.last_hidden_state
-            bsz, _, dim = x.size()
-            x = x.reshape(-1, self.patch_size + 1, dim)
-            x = x[:, -1:, :].reshape(bsz, -1, dim)
-
-        x = self.fc3(x)
-        return x, waveform.unsqueeze(1)
-
-
-class AudioVAE(PreTrainedModel):
-    config_class = AudioVAEConfig
-
-    def __init__(self, config: AudioVAEConfig):
-        super().__init__(config)
-        self.encoder = Encoder(
-            encoder_args=config.enc_kwargs["backbone"],
-            input_dim=config.enc_kwargs["input_dim"],
-            hop_size=config.enc_kwargs.get("hop_size", 320),
-            latent_dim=config.enc_kwargs["latent_dim"],
-            patch_size=config.patch_size,
-        )
-        self.decoder = Decoder(
-            decoder_args=config.dec_kwargs["backbone"],
-            output_dim=config.dec_kwargs["output_dim"],
-            latent_dim=config.dec_kwargs["latent_dim"],
-            patch_size=config.patch_size,
-        )
-        self.post_init()
-
-    def _init_weights(self, module):
-        std = 0.02
-        if isinstance(module, nn.Linear):
-            if self.config.init_method == "kaiming":
-                nn.init.kaiming_normal_(module.weight, mode="fan_in", nonlinearity="relu")
-            else:
-                module.weight.data.normal_(mean=0.0, std=std)
-            if module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.Embedding):
-            module.weight.data.normal_(mean=0.0, std=std)
-            if module.padding_idx is not None:
-                module.weight.data[module.padding_idx].zero_()
-
-    def encode_latent(self, waveform, waveform_length):
-        from diffusers.models.autoencoders.autoencoder_oobleck import OobleckDiagonalGaussianDistribution
-
-        frame_num = torch.ceil(waveform_length / self.config.enc_kwargs["input_dim"]).to(torch.int32)
-        if self.config.patch_size != -1:
-            frame_num = torch.ceil(frame_num / self.config.patch_size)
-        h, y = self.encoder(waveform)
-        h = h.transpose(1, 2)
-
-        posterior = OobleckDiagonalGaussianDistribution(h)
-        latent = posterior.sample()
-        latent = latent.transpose(1, 2)
-        return latent, frame_num
-
-    def decode(self, latent, past_key_values=None, use_cache=False, stream_state=(None, None, None), last_chunk=False):
-        waveform, stream_state, past_key_values = self.decoder.low_level_reconstruct(
-            latent,
-            past_key_values=past_key_values,
-            use_cache=use_cache,
-            stream_state=stream_state,
-            last_chunk=last_chunk,
-        )
-        return waveform, stream_state, past_key_values
diff --git a/vllm_omni/model_executor/models/ming_flash_omni/ming_flash_omni_talker.py b/vllm_omni/model_executor/models/ming_flash_omni/ming_flash_omni_talker.py
index 08ed9e85476..2bcc191201c 100644
--- a/vllm_omni/model_executor/models/ming_flash_omni/ming_flash_omni_talker.py
+++ b/vllm_omni/model_executor/models/ming_flash_omni/ming_flash_omni_talker.py
@@ -26,10 +26,10 @@
 
 from vllm_omni.model_executor.custom_process_mixin import CustomProcessMixin
 from vllm_omni.model_executor.model_loader.weight_utils import download_weights_from_hf_specific
+from vllm_omni.model_executor.models.ming_utils.audio_vae import AudioVAE, AudioVAEConfig
 from vllm_omni.model_executor.models.output_templates import OmniOutput
 from vllm_omni.transformers_utils.configs.ming_flash_omni import MingFlashOmniTalkerConfig
 
-from .audio_vae import AudioVAE, AudioVAEConfig
 from .prompt_utils import DEFAULT_PROMPT as MING_DEFAULT_PROMPT
 from .talker_module import CFM, Aggregator, DiT, MingAudioGenerator, build_tts_input
 from .text_processing import segment_and_normalize
diff --git a/vllm_omni/model_executor/models/ming_flash_omni/talker_module.py b/vllm_omni/model_executor/models/ming_flash_omni/talker_module.py
index 9d07f644106..31f2e541cd4 100644
--- a/vllm_omni/model_executor/models/ming_flash_omni/talker_module.py
+++ b/vllm_omni/model_executor/models/ming_flash_omni/talker_module.py
@@ -33,9 +33,9 @@
 from x_transformers.x_transformers import RotaryEmbedding
 
 from vllm_omni.model_executor.layers.timestep_embedding import DiTTimestepEmbedding
+from vllm_omni.model_executor.models.ming_utils.audio_vae import AudioVAE
 from vllm_omni.model_executor.models.ming_utils.dit import CondEmbedder, DiTBlock, FinalLayer, get_epss_timesteps
-
-from .audio_vae import AudioVAE
+from vllm_omni.model_executor.models.ming_utils.fm import apply_sway_sampling, integrate_cfm_steps
 
 logger = init_logger(__name__)
 
@@ -164,15 +164,8 @@ def fn(fn_t, x):
             pred, null_pred = torch.chunk(pred_cfg, 2, dim=0)
             return pred + (pred - null_pred) * sde_args[0]
 
-        if self.sway_sampling_coef is not None:
-            t = t + self.sway_sampling_coef * (torch.cos(torch.pi / 2 * t) - 1 + t)
-
-        for step in range(self.steps):
-            dt = t[step + 1] - t[step]
-            y0 = y0 + fn(t[step], y0) * dt
-            y0 = y0 + sde_args[1] * (sde_args[2] ** 0.5) * (dt.abs() ** 0.5) * sde_rnd[step]
-
-        return y0
+        t = apply_sway_sampling(t, self.sway_sampling_coef)
+        return integrate_cfm_steps(fn, y0, t, sde_args, sde_rnd, self.steps)
 
 
 class CFMGraphExecutor:
diff --git a/vllm_omni/model_executor/models/ming_flash_omni/voice_presets.py b/vllm_omni/model_executor/models/ming_flash_omni/voice_presets.py
index 5f54687c0cb..025dd78cfec 100644
--- a/vllm_omni/model_executor/models/ming_flash_omni/voice_presets.py
+++ b/vllm_omni/model_executor/models/ming_flash_omni/voice_presets.py
@@ -22,7 +22,8 @@
 from .talker_module import resample
 
 if TYPE_CHECKING:
-    from .audio_vae import AudioVAE
+    from vllm_omni.model_executor.models.ming_utils.audio_vae import AudioVAE
+
     from .talker_module import Aggregator
 
 logger = init_logger(__name__)
diff --git a/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/configuration_audio_vae.py b/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/configuration_audio_vae.py
deleted file mode 100644
index ce9c069c277..00000000000
--- a/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/configuration_audio_vae.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# Adopted from https://github.com/inclusionAI/Ming-omni-tts/blob/main/audio_tokenizer/configuration_audio_vae.py
-
-
-from transformers import PretrainedConfig
-
-
-class AudioVAEconfig(PretrainedConfig):
-    def __init__(
-        self,
-        sample_rate: int = 16000,
-        enc_kwargs: dict = None,
-        semantic_module_kwargs: dict = None,
-        dec_kwargs: dict = None,
-        hifi_gan_disc_kwargs: dict = None,
-        spec_disc_kwargs: dict = None,
-        lambda_disc=1.0,
-        lambda_mel_loss=15,
-        lambda_adv=1.0,
-        lambda_feat_match_loss=1.0,
-        lambda_semantic=5.0,
-        init_method="normal",
-        patch_size=-1,
-        **kwargs,
-    ):
-        self.sample_rate = sample_rate
-        self.enc_kwargs = enc_kwargs
-        self.semantic_module_kwargs = semantic_module_kwargs
-        self.dec_kwargs = dec_kwargs
-        self.hifi_gan_disc_kwargs = hifi_gan_disc_kwargs
-        self.spec_disc_kwargs = spec_disc_kwargs
-        self.lambda_disc = lambda_disc
-        self.lambda_mel_loss = lambda_mel_loss
-        self.lambda_adv = lambda_adv
-        self.lambda_feat_match_loss = lambda_feat_match_loss
-        self.lambda_semantic = lambda_semantic
-        self.init_method = init_method
-        self.patch_size = patch_size
-        super().__init__(**kwargs)
diff --git a/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/modeling_audio_vae.py b/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/modeling_audio_vae.py
deleted file mode 100644
index e4741adcee0..00000000000
--- a/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/modeling_audio_vae.py
+++ /dev/null
@@ -1,172 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# Adopted from https://github.com/inclusionAI/Ming-omni-tts/blob/main/audio_tokenizer/modeling_audio_vae.py
-# audio_tokenizer/modeling_audio_vae.py
-import torch
-import torch.nn as nn
-from vllm.logger import init_logger
-
-from .configuration_audio_vae import AudioVAEconfig
-from .vae_modules import Decoder, Encoder
-
-logger = init_logger(__name__)
-
-
-def _get_backbone(config: AudioVAEconfig, branch: str):
-    branch_cfg = getattr(config, branch, None)
-    if not isinstance(branch_cfg, dict):
-        return None
-    backbone = branch_cfg.get("backbone")
-    if not isinstance(backbone, dict):
-        return None
-    return backbone
-
-
-def _maybe_fallback_attention(config: AudioVAEconfig) -> None:
-    enc_backbone = _get_backbone(config, "enc_kwargs")
-    dec_backbone = _get_backbone(config, "dec_kwargs")
-    requested_attn_impl = "flash_attention_2"
-
-    if dec_backbone is not None:
-        requested_attn_impl = dec_backbone.get(
-            "_attn_implementation",
-            dec_backbone.get("attn_implementation", requested_attn_impl),
-        )
-    elif enc_backbone is not None:
-        requested_attn_impl = enc_backbone.get(
-            "_attn_implementation",
-            enc_backbone.get("attn_implementation", requested_attn_impl),
-        )
-
-    if requested_attn_impl != "flash_attention_2":
-        return
-
-    try:
-        import flash_attn  # noqa: F401
-    except ImportError:
-        if enc_backbone is not None:
-            enc_backbone["_attn_implementation"] = "sdpa"
-            enc_backbone["attn_implementation"] = "sdpa"
-        if dec_backbone is not None:
-            dec_backbone["_attn_implementation"] = "sdpa"
-            dec_backbone["attn_implementation"] = "sdpa"
-        logger.warning("flash_attn not available, falling back to sdpa for Ming audio VAE")
-
-
-class AudioVAE(nn.Module):
-    def __init__(self, config: AudioVAEconfig):
-        super().__init__()
-        self.config = config
-        _maybe_fallback_attention(self.config)
-
-        # --- Ming/Bailing config sanity (fail early on bad nested config parsing) ---
-        enc_kwargs = config.enc_kwargs
-        dec_kwargs = config.dec_kwargs
-
-        # Required nested fields
-        for k in ("backbone", "input_dim", "latent_dim"):
-            if k not in enc_kwargs:
-                raise ValueError(f"AudioVAE.enc_kwargs missing required key: {k}")
-        for k in ("backbone", "output_dim", "latent_dim"):
-            if k not in dec_kwargs:
-                raise ValueError(f"AudioVAE.dec_kwargs missing required key: {k}")
-
-        # Ming-specific geometry checks (safe because this integration targets Ming checkpoint family)
-        hop_size = enc_kwargs.get("hop_size", enc_kwargs["input_dim"])
-        if enc_kwargs["input_dim"] != hop_size:
-            raise ValueError(f"AudioVAE encoder input_dim ({enc_kwargs['input_dim']}) != hop_size ({hop_size}).")
-        if hop_size != dec_kwargs["output_dim"]:
-            raise ValueError(
-                f"AudioVAE encoder hop_size ({hop_size}) != decoder output_dim ({dec_kwargs['output_dim']})."
-            )
-
-        self.encoder = Encoder(
-            encoder_args=enc_kwargs["backbone"],
-            input_dim=enc_kwargs["input_dim"],
-            hop_size=hop_size,
-            latent_dim=enc_kwargs["latent_dim"],
-            patch_size=config.patch_size,
-        )
-
-        if config.semantic_module_kwargs is not None:
-            raise ValueError("Ming dense 0.5B expects semantic_module_kwargs to be null.")
-
-        self.decoder = Decoder(
-            decoder_args=dec_kwargs["backbone"],  # IMPORTANT: decoder uses dec_kwargs.backbone
-            output_dim=dec_kwargs["output_dim"],  # Ming checkpoint uses 882
-            latent_dim=dec_kwargs["latent_dim"],
-            patch_size=config.patch_size,
-        )
-
-    @torch.inference_mode()
-    def encode_latent(
-        self,
-        waveform: torch.Tensor,
-        waveform_length: torch.Tensor,
-    ) -> tuple[torch.Tensor, torch.Tensor]:
-        """
-        Encode waveform -> acoustic latent.
-        """
-        if waveform.ndim != 2:
-            raise ValueError(f"Expected waveform rank-2 [Batch, Time], got {tuple(waveform.shape)}")
-        if waveform_length.ndim != 1:
-            raise ValueError(f"Expected waveform_length rank-1 [Batch], got {tuple(waveform_length.shape)}")
-        if waveform.shape[0] != waveform_length.shape[0]:
-            raise ValueError(
-                "Batch mismatch: "
-                f"waveform batch={waveform.shape[0]} vs "
-                f"waveform_length batch={waveform_length.shape[0]}"
-            )
-        if torch.any(waveform_length <= 0):
-            raise ValueError("waveform_length must be strictly positive.")
-
-        frame_num = torch.ceil(waveform_length / self.config.enc_kwargs["input_dim"]).to(torch.int32)
-        if self.config.patch_size != -1:
-            frame_num = torch.ceil(frame_num / self.config.patch_size)
-
-        h, _ = self.encoder(waveform)
-        h = h.transpose(1, 2)  # [B, 2*latent_dim, T] (posterior params: mean + logvar)
-
-        # Inline OobleckDiagonalGaussianDistribution.sample()
-        mean, logvar = torch.chunk(h, 2, dim=1)
-        logvar = torch.clamp(logvar, -30.0, 20.0)
-        std = torch.exp(0.5 * logvar)
-        latent = mean + std * torch.randn_like(mean)  # [B, latent_dim, T]
-        latent = latent.transpose(1, 2)  # [B, T, d/2]
-
-        return latent, frame_num
-
-    @torch.inference_mode()
-    def decode(
-        self,
-        latent: torch.Tensor,
-        past_key_values=None,
-        use_cache: bool = False,
-        stream_state: tuple = (None, None, None),
-        last_chunk: bool = False,
-    ) -> tuple[torch.Tensor, tuple, object]:
-        """
-        Decode acoustic latent -> waveform.
-        """
-        if latent.dim() != 3:
-            raise ValueError(f"Expected latent rank-3 [B,T,D], got shape={tuple(latent.shape)}")
-        if latent.shape[0] <= 0:
-            raise ValueError("latent batch size must be positive.")
-
-        target_dtype = next(self.decoder.parameters()).dtype
-        target_device = next(self.decoder.parameters()).device
-        if latent.dtype != target_dtype or latent.device != target_device:
-            latent = latent.to(device=target_device, dtype=target_dtype)
-
-        expected_latent_dim = self.config.dec_kwargs["latent_dim"]
-        if latent.shape[-1] != expected_latent_dim:
-            raise ValueError(f"Latent dim mismatch in decode(): got {latent.shape[-1]}, expected {expected_latent_dim}")
-
-        waveform, stream_state, past_key_values = self.decoder.low_level_reconstruct(
-            latent,
-            past_key_values=past_key_values,
-            use_cache=use_cache,
-            stream_state=stream_state,
-            last_chunk=last_chunk,
-        )
-        return waveform, stream_state, past_key_values
diff --git a/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/vae_modules.py b/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/vae_modules.py
deleted file mode 100644
index 21631c252d9..00000000000
--- a/vllm_omni/model_executor/models/ming_tts/audio_tokenizer/vae_modules.py
+++ /dev/null
@@ -1,129 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# Adopted from https://github.com/inclusionAI/Ming-omni-tts/blob/main/audio_tokenizer/vae_modules.py
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from transformers import Qwen2Config, Qwen2Model
-
-from vllm_omni.model_executor.models.ming_utils.audio_dsp import ISTFTHead
-from vllm_omni.model_executor.models.ming_utils.audio_vae import StreamingLinearUpsample
-
-
-class Encoder(nn.Module):
-    def __init__(self, encoder_args, input_dim=320, hop_size=320, latent_dim=64, patch_size=-1):
-        super().__init__()
-        config = Qwen2Config.from_dict(config_dict=encoder_args)
-        self.encoder = Qwen2Model(config)
-        self.input_dim = input_dim
-        self.hop_size = hop_size
-        self.latent_dim = latent_dim
-        self.fc1 = nn.Linear(input_dim, config.hidden_size, bias=False)
-        self.fc2 = nn.Linear(config.hidden_size, config.hidden_size)
-        self.fc3 = nn.Linear(config.hidden_size, latent_dim * 2)
-        self.norm = nn.LayerNorm(config.hidden_size)
-        self.patch_size = patch_size
-        if patch_size != -1:
-            aggregator_config = Qwen2Config.from_dict({**encoder_args, "num_hidden_layers": 4})
-            self.aggregator = Qwen2Model(aggregator_config)
-            self.cls_embed = nn.Parameter(torch.rand(1, 1, config.hidden_size))
-            self.cls_embed.data.normal_(0, 0.02)
-
-    def get_frames(self, x):
-        num_frames_total = (x.size(-1) + self.hop_size - 1) // self.hop_size  # 向上取整的帧数
-        expected_len = (num_frames_total - 1) * self.hop_size + self.input_dim
-        padding_needed = expected_len - x.size(-1)
-        waveform = F.pad(x, (0, padding_needed), value=0.0)
-
-        frames = waveform.unfold(dimension=-1, size=self.input_dim, step=self.hop_size)  # [B, T, d]
-        return frames
-
-    def pad_patch_insert_cls(self, x):
-        bsz, _, dim = x.size()
-        num_frame = x.size(1)
-        r = num_frame % self.patch_size
-        pad_num = self.patch_size - r if r else 0
-        x = F.pad(x, (0, 0, 0, pad_num), value=0.0)  # 帧数对齐到patch_size倍数
-        x = x.reshape(-1, self.patch_size, dim)
-        x = torch.cat((x, self.cls_embed.expand(x.size(0), -1, -1)), dim=1)  # 每个patch后插入一个cls
-        x = x.reshape(bsz, -1, dim)
-        return x
-
-    def forward(self, waveform):
-        x = self.get_frames(waveform)
-
-        x = self.fc1(x)
-        x = self.fc2(x)
-        x = self.encoder(inputs_embeds=x)
-        x = x.last_hidden_state
-
-        # downsample
-        if self.patch_size != -1:
-            x = self.pad_patch_insert_cls(x)
-            x = self.aggregator(inputs_embeds=x)
-            x = x.last_hidden_state
-            bsz, _, dim = x.size()
-            x = x.reshape(-1, self.patch_size + 1, dim)
-            x = x[:, -1:, :].reshape(bsz, -1, dim)
-
-        x = self.fc3(x)
-        return x, waveform.unsqueeze(1)
-
-
-class Decoder(nn.Module):
-    def __init__(self, decoder_args, output_dim=320, latent_dim=64, patch_size=-1):
-        super().__init__()
-        config = Qwen2Config.from_dict(config_dict=decoder_args)
-        self.decoder = Qwen2Model(config)
-        self.output_dim = output_dim
-        self.latent_dim = latent_dim
-        self.fc1 = nn.Linear(latent_dim, config.hidden_size)
-
-        self.hop_length = output_dim
-        self.head = ISTFTHead(
-            dim=config.hidden_size, n_fft=self.hop_length * 4, hop_length=self.hop_length, padding="same"
-        )
-        self.patch_size = patch_size
-        if self.patch_size != -1:
-            self.upsampling = StreamingLinearUpsample(scale_factor=patch_size)
-
-    def forward(self, x):
-        x = self.fc1(x)
-
-        if self.patch_size != -1:
-            x = self.upsampling(x.transpose(1, 2)).transpose(1, 2)
-
-        x = self.decoder(inputs_embeds=x)
-        x = x.last_hidden_state
-
-        x, _ = self.head(x)
-
-        return x, None
-
-    def low_level_reconstruct(self, x, past_key_values=None, use_cache=False, stream_state=None, last_chunk=False):
-        # Guard against None on first chunk (connector initialises per-request)
-        if stream_state is None:
-            stream_state = (None, None, None)
-        upsample_state, audio_buffer, window_buffer = stream_state
-        bsz, device, dtype = x.size(0), x.device, x.dtype
-        x = self.fc1(x)
-        if self.patch_size != -1:
-            if use_cache:
-                # streaming
-                x, upsample_state = self.upsampling(x, state=upsample_state, is_last=last_chunk)
-                if x is None:
-                    stream_state = (upsample_state, audio_buffer, window_buffer)
-                    return torch.empty(bsz, 1, 0, device=device, dtype=dtype), stream_state, past_key_values
-            else:
-                x = self.upsampling.upsampler(x.transpose(1, 2)).transpose(1, 2)
-
-        outputs = self.decoder(inputs_embeds=x, past_key_values=past_key_values, use_cache=use_cache)
-        past_key_values = outputs.past_key_values
-        x = outputs.last_hidden_state
-
-        x, _, audio_buffer, window_buffer = self.head(
-            x, streaming=use_cache, audio_buffer=audio_buffer, window_buffer=window_buffer, last_chunk=last_chunk
-        )
-
-        stream_state = (upsample_state, audio_buffer, window_buffer)
-        return x, stream_state, past_key_values
diff --git a/vllm_omni/model_executor/models/ming_tts/config_ming_tts.py b/vllm_omni/model_executor/models/ming_tts/config_ming_tts.py
index a529e06c656..0102afe0559 100644
--- a/vllm_omni/model_executor/models/ming_tts/config_ming_tts.py
+++ b/vllm_omni/model_executor/models/ming_tts/config_ming_tts.py
@@ -7,7 +7,8 @@
 
 from transformers import PretrainedConfig, Qwen2Config
 
-from .audio_tokenizer.configuration_audio_vae import AudioVAEconfig
+from vllm_omni.model_executor.models.ming_utils.audio_vae import AudioVAEConfig
+
 from .constants import (
     AGGREGATOR_HIDDEN_SIZE,
     AUDIO_DUMMY_TOKEN_ID,
@@ -70,7 +71,7 @@ def __init__(
         llm_config: Qwen2Config | dict[str, Any] | None = None,
         ditar_config: dict[str, Any] | None = None,
         aggregator_config: dict[str, Any] | None = None,
-        audio_tokenizer_config: AudioVAEconfig | dict[str, Any] | None = None,
+        audio_tokenizer_config: AudioVAEConfig | dict[str, Any] | None = None,
         architectures: list[str] | None = None,
         **kwargs: Any,
     ) -> None:
@@ -100,7 +101,7 @@ class MingTTSConfig:
     ditar_config: dict[str, Any] = field(default_factory=dict)
     aggregator_config: dict[str, Any] = field(default_factory=dict)
 
-    audio_tokenizer_config: AudioVAEconfig | None = None
+    audio_tokenizer_config: AudioVAEConfig | None = None
     vae_patch_size: int = VAE_PATCH_SIZE
     sample_rate: int = SAMPLE_RATE
     audio_frame_hop: int = AUDIO_FRAME_HOP
diff --git a/vllm_omni/model_executor/models/ming_tts/fm/cfm.py b/vllm_omni/model_executor/models/ming_tts/fm/cfm.py
index d53a19d798b..34a1a58173c 100644
--- a/vllm_omni/model_executor/models/ming_tts/fm/cfm.py
+++ b/vllm_omni/model_executor/models/ming_tts/fm/cfm.py
@@ -6,45 +6,7 @@
 import torch
 from torch import nn
 
-from vllm_omni.model_executor.models.ming_utils.dit import get_epss_timesteps
-
-
-class Solver:
-    def __init__(self, func, y0, sigma=0.25, temperature=1.5) -> None:
-        self.func = func
-        self.y0 = y0
-        self.sigma = sigma
-        self.temperature = temperature
-
-    def integrate(self, t):
-        solution = torch.empty(len(t), *self.y0.shape, dtype=self.y0.dtype, device=self.y0.device)
-        solution[0] = self.y0
-
-        j = 1
-        y0 = self.y0
-        for t0, t1 in zip(t[:-1], t[1:]):
-            dt = t1 - t0
-            f0 = self.func(t0, y0)
-            dy = dt * f0
-            y1 = y0 + dy
-
-            while j < len(t) and t1 >= t[j]:
-                solution[j] = self._linear_interp(t0, t1, y0, y1, t[j])
-                j += 1
-
-            noise = torch.randn_like(y0)
-            shift = self.sigma * (self.temperature**0.5) * (abs(dt) ** 0.5) * noise
-            y0 = y1 + shift
-
-        return solution
-
-    def _linear_interp(self, t0, t1, y0, y1, t):
-        if t == t0:
-            return y0
-        if t == t1:
-            return y1
-        slope = (t - t0) / (t1 - t0)
-        return y0 + slope * (y1 - y0)
+from vllm_omni.model_executor.models.ming_utils.fm import Solver, build_timesteps
 
 
 class CFM(nn.Module):
@@ -116,12 +78,13 @@ def fn(t, x):
             return pred + (pred - null_pred) * cfg_scale
 
         y0 = noise.transpose(1, 2)
-        if use_epss:
-            t = get_epss_timesteps(steps, device=self.device, dtype=noise.dtype)
-        else:
-            t = torch.linspace(0, 1, steps + 1, device=self.device, dtype=noise.dtype)
-        if sway_sampling_coef is not None:
-            t = t + sway_sampling_coef * (torch.cos(torch.pi / 2 * t) - 1 + t)
+        t = build_timesteps(
+            steps,
+            device=self.device,
+            dtype=noise.dtype,
+            use_epss=use_epss,
+            sway_sampling_coef=sway_sampling_coef,
+        )
 
         solver = Solver(fn, y0, sigma=sigma, temperature=temperature)
         trajectory = solver.integrate(t)
diff --git a/vllm_omni/model_executor/models/ming_tts/fm/dit.py b/vllm_omni/model_executor/models/ming_tts/fm/dit.py
index f5db42f931c..e2405c88f2c 100644
--- a/vllm_omni/model_executor/models/ming_tts/fm/dit.py
+++ b/vllm_omni/model_executor/models/ming_tts/fm/dit.py
@@ -2,47 +2,14 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Adopted from https://github.com/inclusionAI/Ming-omni-tts/blob/main/fm/dit.py
 
-import math
-
 import torch
 import torch.nn as nn
 from x_transformers.x_transformers import RotaryEmbedding
 
+from vllm_omni.model_executor.layers.timestep_embedding import DiTTimestepEmbedding
 from vllm_omni.model_executor.models.ming_utils.dit import CondEmbedder, DiTBlock, FinalLayer
 
 
-class SinusPositionEmbedding(nn.Module):
-    def __init__(self, dim):
-        super().__init__()
-        self.dim = dim
-
-    def forward(self, x, scale=1000):
-        if x.ndim == 0:
-            x = x.reshape(1)
-        if x.ndim != 1:
-            raise ValueError(f"Expected timestep rank-1 [Batch], got {tuple(x.shape)}")
-        device = x.device
-        half_dim = self.dim // 2
-        emb = math.log(10000) / (half_dim - 1)
-        emb = torch.exp(torch.arange(half_dim, device=device).float() * -emb)
-        emb = scale * x.unsqueeze(1) * emb.unsqueeze(0)
-        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
-        return emb
-
-
-class TimestepEmbedder(nn.Module):
-    def __init__(self, dim, freq_embed_dim=256):
-        super().__init__()
-        self.time_embed = SinusPositionEmbedding(freq_embed_dim)
-        self.time_mlp = nn.Sequential(nn.Linear(freq_embed_dim, dim), nn.SiLU(), nn.Linear(dim, dim))
-
-    def forward(self, timestep):
-        time_hidden = self.time_embed(timestep)
-        time_hidden = time_hidden.to(timestep.dtype)
-        time = self.time_mlp(time_hidden)  # b d
-        return time
-
-
 class DiT(nn.Module):
     def __init__(
         self,
@@ -59,7 +26,7 @@ def __init__(
         self.in_channels = in_channels
         self.out_channels = in_channels
         self.num_heads = num_heads
-        self.t_embedder = TimestepEmbedder(hidden_size)
+        self.t_embedder = DiTTimestepEmbedding(hidden_size)
         self.x_embedder = nn.Linear(in_channels, hidden_size)
         self.c_embedder = CondEmbedder(llm_cond_dim, hidden_size)
         self.hidden_size = hidden_size
diff --git a/vllm_omni/model_executor/models/ming_tts/ming_tts_audio_vae.py b/vllm_omni/model_executor/models/ming_tts/ming_tts_audio_vae.py
index a7eee3d750d..7f902a39e79 100644
--- a/vllm_omni/model_executor/models/ming_tts/ming_tts_audio_vae.py
+++ b/vllm_omni/model_executor/models/ming_tts/ming_tts_audio_vae.py
@@ -15,9 +15,9 @@
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
+from vllm_omni.model_executor.models.ming_utils.audio_vae import AudioVAE
 from vllm_omni.model_executor.models.output_templates import OmniOutput
 
-from .audio_tokenizer.modeling_audio_vae import AudioVAE
 from .config_ming_tts import KEY_CHUNK_ID, KEY_REQUEST_ID, MingTTSConfig
 from .patch_emission import MING_STOP_REASON_KEY
 
diff --git a/vllm_omni/model_executor/models/ming_tts/prompt_encoder.py b/vllm_omni/model_executor/models/ming_tts/prompt_encoder.py
index fb368411d8a..599928b310b 100644
--- a/vllm_omni/model_executor/models/ming_tts/prompt_encoder.py
+++ b/vllm_omni/model_executor/models/ming_tts/prompt_encoder.py
@@ -13,6 +13,7 @@
 from vllm.logger import init_logger
 
 from vllm_omni.engine.stage_init_utils import _resolve_model_to_local_path
+from vllm_omni.model_executor.models.ming_utils.audio_vae import AudioVAE
 
 from .audio_prep import (
     _coerce_prompt_latents,
@@ -20,7 +21,6 @@
     count_prompt_latent_patches,
     pad_prompt_waveform,
 )
-from .audio_tokenizer.modeling_audio_vae import AudioVAE
 from .config_ming_tts import KEY_PROMPT_LATENTS
 
 logger = init_logger(__name__)
diff --git a/vllm_omni/model_executor/models/ming_tts/validation.py b/vllm_omni/model_executor/models/ming_tts/validation.py
index 3f674f69e11..99883273683 100644
--- a/vllm_omni/model_executor/models/ming_tts/validation.py
+++ b/vllm_omni/model_executor/models/ming_tts/validation.py
@@ -6,7 +6,8 @@
 
 from transformers import PretrainedConfig
 
-from .audio_tokenizer.configuration_audio_vae import AudioVAEconfig
+from vllm_omni.model_executor.models.ming_utils.audio_vae import AudioVAEConfig
+
 from .constants import (
     AGGREGATOR_HIDDEN_SIZE,
     HISTORY_PATCH_SIZE,
@@ -37,10 +38,10 @@ def _to_plain_dict(obj: Any) -> dict[str, Any]:
         return {}
 
 
-def _coerce_audio_vae_config(atc_raw: Any) -> AudioVAEconfig | None:
+def _coerce_audio_vae_config(atc_raw: Any) -> AudioVAEConfig | None:
     if atc_raw is None:
         return None
-    if isinstance(atc_raw, AudioVAEconfig):
+    if isinstance(atc_raw, AudioVAEConfig):
         return atc_raw
     if isinstance(atc_raw, PretrainedConfig):
         atc_dict = atc_raw.to_dict()
@@ -51,7 +52,7 @@ def _coerce_audio_vae_config(atc_raw: Any) -> AudioVAEconfig | None:
     else:
         raise TypeError(f"Unsupported audio_tokenizer_config type for Ming dense config: {type(atc_raw)!r}")
 
-    return AudioVAEconfig(**atc_dict)
+    return AudioVAEConfig(**atc_dict)
 
 
 def _nested_get(obj: Any, *keys: str, default: Any = None) -> Any:
@@ -128,6 +129,10 @@ def validate_ming_tts_config(cfg: Any) -> None:
         raise ValueError(f"ditar hidden_size mismatch: got {dit_h}, expected {AGGREGATOR_HIDDEN_SIZE}.")
 
     atc = cfg.audio_tokenizer_config
+    semantic_module_kwargs = _nested_get(atc, "semantic_module_kwargs", default=None)
+    if semantic_module_kwargs is not None:
+        raise ValueError("Ming dense 0.5B expects audio_tokenizer_config.semantic_module_kwargs to be null.")
+
     enc_latent = _nested_get(atc, "enc_kwargs", "latent_dim", default=None)
     dec_latent = _nested_get(atc, "dec_kwargs", "latent_dim", default=None)
     if enc_latent is not None and enc_latent != cfg.latent_dim:
diff --git a/vllm_omni/model_executor/models/ming_utils/audio_vae.py b/vllm_omni/model_executor/models/ming_utils/audio_vae.py
index 3f5e72a6b58..ccccc3a4af8 100644
--- a/vllm_omni/model_executor/models/ming_utils/audio_vae.py
+++ b/vllm_omni/model_executor/models/ming_utils/audio_vae.py
@@ -1,7 +1,67 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PretrainedConfig, PreTrainedModel, Qwen2Config, Qwen2Model
+from transformers.utils import is_flash_attn_2_available
+from vllm.logger import init_logger
+
+from vllm_omni.model_executor.models.ming_utils.audio_dsp import ISTFTHead
+
+logger = init_logger(__name__)
+
+
+class AudioVAEConfig(PretrainedConfig):
+    model_type = "audio_vae"
+
+    def __init__(
+        self,
+        sample_rate=44100,
+        enc_kwargs=None,
+        semantic_module_kwargs=None,
+        dec_kwargs=None,
+        hifi_gan_disc_kwargs=None,
+        spec_disc_kwargs=None,
+        lambda_disc=1.0,
+        lambda_mel_loss=15,
+        lambda_adv=1.0,
+        lambda_feat_match_loss=1.0,
+        lambda_semantic=5.0,
+        init_method="kaiming",
+        patch_size=4,
+        **kwargs,
+    ):
+        self.sample_rate = sample_rate
+        self.enc_kwargs = enc_kwargs or {}
+        self.semantic_module_kwargs = semantic_module_kwargs
+        self.dec_kwargs = dec_kwargs or {}
+        self.hifi_gan_disc_kwargs = hifi_gan_disc_kwargs
+        self.spec_disc_kwargs = spec_disc_kwargs
+        self.lambda_disc = lambda_disc
+        self.lambda_mel_loss = lambda_mel_loss
+        self.lambda_adv = lambda_adv
+        self.lambda_feat_match_loss = lambda_feat_match_loss
+        self.lambda_semantic = lambda_semantic
+        self.init_method = init_method
+        self.patch_size = patch_size
+        super().__init__(**kwargs)
+
+
+def _qwen2_config(backbone):
+    config = Qwen2Config.from_dict(config_dict=backbone)
+    if (getattr(config, "_attn_implementation", None) or getattr(config, "attn_implementation", None)) not in (
+        None,
+        "flash_attention_2",
+    ):
+        return config
+    if is_flash_attn_2_available():
+        config._attn_implementation_autoset = True
+        config._attn_implementation = "flash_attention_2"
+    else:
+        config._attn_implementation = "sdpa"
+    return config
 
 
 class StreamingLinearUpsample(nn.Module):
@@ -61,3 +121,259 @@ def forward(self, x, state=None, is_last=False):
 
         final_out = torch.cat(output_chunks, dim=1) if output_chunks else None
         return final_out, state
+
+
+class Encoder(nn.Module):
+    def __init__(self, encoder_args, input_dim=320, hop_size=320, latent_dim=64, patch_size=-1):
+        super().__init__()
+        config = _qwen2_config(encoder_args)
+        logger.info("AudioVAE Encoder: using attn_implementation=%r", config._attn_implementation)
+        self.encoder = Qwen2Model(config)
+        self.input_dim = input_dim
+        self.hop_size = hop_size
+        self.latent_dim = latent_dim
+        self.fc1 = nn.Linear(input_dim, config.hidden_size, bias=False)
+        self.fc2 = nn.Linear(config.hidden_size, config.hidden_size)
+        self.fc3 = nn.Linear(config.hidden_size, latent_dim * 2)
+        self.norm = nn.LayerNorm(config.hidden_size)
+        self.patch_size = patch_size
+        if patch_size != -1:
+            aggregator_config = _qwen2_config({**encoder_args, "num_hidden_layers": 4})
+            self.aggregator = Qwen2Model(aggregator_config)
+            self.cls_embed = nn.Parameter(torch.rand(1, 1, config.hidden_size))
+            self.cls_embed.data.normal_(0, 0.02)
+
+    def get_frames(self, x):
+        num_frames_total = (x.size(-1) + self.hop_size - 1) // self.hop_size
+        expected_len = (num_frames_total - 1) * self.hop_size + self.input_dim
+        padding_needed = expected_len - x.size(-1)
+        waveform = F.pad(x, (0, padding_needed), value=0.0)
+        frames = waveform.unfold(dimension=-1, size=self.input_dim, step=self.hop_size)
+        return frames
+
+    def pad_patch_insert_cls(self, x):
+        bsz, _, dim = x.size()
+        num_frame = x.size(1)
+        r = num_frame % self.patch_size
+        pad_num = self.patch_size - r if r else 0
+        x = F.pad(x, (0, 0, 0, pad_num), value=0.0)
+        # [Batch, Time, Dimension] -> [Batch*PatchGroups, Patch, Dimension].
+        x = x.reshape(-1, self.patch_size, dim)
+        x = torch.cat((x, self.cls_embed.expand(x.size(0), -1, -1)), dim=1)
+        # [Batch*PatchGroups, Patch+1, Dimension] -> [Batch, Time, Dimension].
+        x = x.reshape(bsz, -1, dim)
+        return x
+
+    def forward(self, waveform):
+        x = self.get_frames(waveform)
+        x = self.fc1(x)
+        x = self.fc2(x)
+        x = self.encoder(inputs_embeds=x)
+        x = x.last_hidden_state
+
+        if self.patch_size != -1:
+            x = self.pad_patch_insert_cls(x)
+            x = self.aggregator(inputs_embeds=x)
+            x = x.last_hidden_state
+            bsz, _, dim = x.size()
+            # [Batch, Time, Dimension] -> [Batch*PatchGroups, Patch+1, Dimension].
+            x = x.reshape(-1, self.patch_size + 1, dim)
+            # [Batch*PatchGroups, 1, Dimension] -> [Batch, PatchGroups, Dimension].
+            x = x[:, -1:, :].reshape(bsz, -1, dim)
+
+        x = self.fc3(x)
+        return x, waveform.unsqueeze(1)
+
+
+class Decoder(nn.Module):
+    def __init__(self, decoder_args, output_dim=320, latent_dim=64, patch_size=-1):
+        super().__init__()
+        config = _qwen2_config(decoder_args)
+        logger.info("AudioVAE Decoder: using attn_implementation=%r", config._attn_implementation)
+        self.decoder = Qwen2Model(config)
+        self.output_dim = output_dim
+        self.latent_dim = latent_dim
+        self.fc1 = nn.Linear(latent_dim, config.hidden_size)
+        self.hop_length = output_dim
+        self.head = ISTFTHead(
+            dim=config.hidden_size, n_fft=self.hop_length * 4, hop_length=self.hop_length, padding="same"
+        )
+        self.patch_size = patch_size
+        if self.patch_size != -1:
+            self.upsampling = StreamingLinearUpsample(scale_factor=patch_size)
+
+    def forward(self, x):
+        x = self.fc1(x)
+
+        if self.patch_size != -1:
+            # [Batch, Time, Dimension] -> [Batch, Dimension, Time] -> [Batch, Time, Dimension].
+            x = self.upsampling.upsampler(x.transpose(1, 2)).transpose(1, 2)
+
+        x = self.decoder(inputs_embeds=x)
+        x = x.last_hidden_state
+        x, _ = self.head(x)
+        return x, None
+
+    def low_level_reconstruct(self, x, past_key_values=None, use_cache=False, stream_state=None, last_chunk=False):
+        if stream_state is None:
+            stream_state = (None, None, None)
+        upsample_state, audio_buffer, window_buffer = stream_state
+        bsz, device, dtype = x.size(0), x.device, x.dtype
+        x = self.fc1(x)
+        if self.patch_size != -1:
+            if use_cache:
+                x, upsample_state = self.upsampling(x, state=upsample_state, is_last=last_chunk)
+                if x is None:
+                    stream_state = (upsample_state, audio_buffer, window_buffer)
+                    return torch.empty(bsz, 1, 0, device=device, dtype=dtype), stream_state, past_key_values
+            else:
+                # [Batch, Time, Dimension] -> [Batch, Dimension, Time] -> [Batch, Time, Dimension].
+                x = self.upsampling.upsampler(x.transpose(1, 2)).transpose(1, 2)
+
+        hidden_states_list = []
+
+        if use_cache and getattr(self.decoder.config, "sliding_window", None) is not None:
+            sw_size = self.decoder.config.sliding_window
+            target_len = sw_size - 1
+            if past_key_values is None:
+                past_len = 0
+            elif hasattr(past_key_values, "get_seq_length"):
+                past_len = past_key_values.get_seq_length()
+            elif isinstance(past_key_values, tuple) and len(past_key_values) > 0:
+                past_len = past_key_values[0][0].shape[-2]
+            else:
+                past_len = 0
+
+            curr_len = x.shape[1]
+            if past_len < target_len and (past_len + curr_len) >= sw_size:
+                fill_len = target_len - past_len
+                x_fill = x[:, :fill_len, :]
+                outputs = self.decoder(inputs_embeds=x_fill, past_key_values=past_key_values, use_cache=use_cache)
+                hidden_states_list.append(outputs.last_hidden_state)
+                past_key_values = outputs.past_key_values
+                x = x[:, fill_len:, :]
+
+        outputs = self.decoder(inputs_embeds=x, past_key_values=past_key_values, use_cache=use_cache)
+        hidden_states_list.append(outputs.last_hidden_state)
+        past_key_values = outputs.past_key_values
+
+        if len(hidden_states_list) > 1:
+            full_hidden_state = torch.cat(hidden_states_list, dim=1)
+        else:
+            full_hidden_state = hidden_states_list[0]
+
+        x_out, _, audio_buffer, window_buffer = self.head(
+            full_hidden_state,
+            streaming=use_cache,
+            audio_buffer=audio_buffer,
+            window_buffer=window_buffer,
+            last_chunk=last_chunk,
+        )
+
+        stream_state = (upsample_state, audio_buffer, window_buffer)
+        return x_out, stream_state, past_key_values
+
+
+class AudioVAE(PreTrainedModel):
+    config_class = AudioVAEConfig
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+        enc_kwargs = config.enc_kwargs
+        dec_kwargs = config.dec_kwargs
+        for key in ("backbone", "input_dim", "latent_dim"):
+            if key not in enc_kwargs:
+                raise ValueError(f"AudioVAE.enc_kwargs missing required key: {key}")
+        for key in ("backbone", "output_dim", "latent_dim"):
+            if key not in dec_kwargs:
+                raise ValueError(f"AudioVAE.dec_kwargs missing required key: {key}")
+
+        hop_size = enc_kwargs.get("hop_size", enc_kwargs["input_dim"])
+        if enc_kwargs["input_dim"] != hop_size:
+            raise ValueError(f"AudioVAE encoder input_dim ({enc_kwargs['input_dim']}) != hop_size ({hop_size}).")
+        if hop_size != dec_kwargs["output_dim"]:
+            raise ValueError(
+                f"AudioVAE encoder hop_size ({hop_size}) != decoder output_dim ({dec_kwargs['output_dim']})."
+            )
+        self.encoder = Encoder(
+            encoder_args=enc_kwargs["backbone"],
+            input_dim=enc_kwargs["input_dim"],
+            hop_size=hop_size,
+            latent_dim=enc_kwargs["latent_dim"],
+            patch_size=config.patch_size,
+        )
+        self.decoder = Decoder(
+            decoder_args=dec_kwargs["backbone"],
+            output_dim=dec_kwargs["output_dim"],
+            latent_dim=dec_kwargs["latent_dim"],
+            patch_size=config.patch_size,
+        )
+        self.post_init()
+
+    def _init_weights(self, module):
+        std = 0.02
+        if isinstance(module, nn.Linear):
+            if self.config.init_method == "kaiming":
+                nn.init.kaiming_normal_(module.weight, mode="fan_in", nonlinearity="relu")
+            else:
+                module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+    @torch.inference_mode()
+    def encode_latent(self, waveform, waveform_length):
+        if waveform.ndim != 2:
+            raise ValueError(f"Expected waveform rank-2 [Batch, Time], got {tuple(waveform.shape)}")
+        if waveform_length.ndim != 1:
+            raise ValueError(f"Expected waveform_length rank-1 [Batch], got {tuple(waveform_length.shape)}")
+        if waveform.shape[0] != waveform_length.shape[0]:
+            raise ValueError(
+                "Batch mismatch: "
+                f"waveform batch={waveform.shape[0]} vs "
+                f"waveform_length batch={waveform_length.shape[0]}"
+            )
+        if torch.any(waveform_length <= 0):
+            raise ValueError("waveform_length must be strictly positive.")
+
+        frame_num = torch.ceil(waveform_length / self.config.enc_kwargs["input_dim"]).to(torch.int32)
+        if self.config.patch_size != -1:
+            frame_num = torch.ceil(frame_num / self.config.patch_size)
+        h, _ = self.encoder(waveform)
+        h = h.transpose(1, 2)
+
+        mean, logvar = torch.chunk(h, 2, dim=1)
+        logvar = torch.clamp(logvar, -30.0, 20.0)
+        std = torch.exp(0.5 * logvar)
+        latent = mean + std * torch.randn_like(mean)
+        latent = latent.transpose(1, 2)
+        return latent, frame_num
+
+    @torch.inference_mode()
+    def decode(self, latent, past_key_values=None, use_cache=False, stream_state=(None, None, None), last_chunk=False):
+        if latent.dim() != 3:
+            raise ValueError(f"Expected latent rank-3 [B,T,D], got shape={tuple(latent.shape)}")
+        if latent.shape[0] <= 0:
+            raise ValueError("latent batch size must be positive.")
+
+        target_dtype = next(self.decoder.parameters()).dtype
+        target_device = next(self.decoder.parameters()).device
+        if latent.dtype != target_dtype or latent.device != target_device:
+            latent = latent.to(device=target_device, dtype=target_dtype)
+
+        expected_latent_dim = self.config.dec_kwargs["latent_dim"]
+        if latent.shape[-1] != expected_latent_dim:
+            raise ValueError(f"Latent dim mismatch in decode(): got {latent.shape[-1]}, expected {expected_latent_dim}")
+
+        waveform, stream_state, past_key_values = self.decoder.low_level_reconstruct(
+            latent,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            stream_state=stream_state,
+            last_chunk=last_chunk,
+        )
+        return waveform, stream_state, past_key_values
diff --git a/vllm_omni/model_executor/models/ming_utils/fm.py b/vllm_omni/model_executor/models/ming_utils/fm.py
new file mode 100644
index 00000000000..14fd2c35729
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_utils/fm.py
@@ -0,0 +1,68 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import torch
+
+from vllm_omni.model_executor.models.ming_utils.dit import get_epss_timesteps
+
+
+def apply_sway_sampling(t, sway_sampling_coef):
+    if sway_sampling_coef is None:
+        return t
+    return t + sway_sampling_coef * (torch.cos(torch.pi / 2 * t) - 1 + t)
+
+
+def build_timesteps(steps, device, dtype, use_epss=True, sway_sampling_coef=-1.0):
+    if steps <= 0:
+        raise ValueError(f"steps must be positive, got {steps}")
+    if use_epss:
+        t = get_epss_timesteps(steps, device=device, dtype=dtype)
+    else:
+        t = torch.linspace(0, 1, steps + 1, device=device, dtype=dtype)
+    return apply_sway_sampling(t, sway_sampling_coef)
+
+
+class Solver:
+    def __init__(self, func, y0, sigma=0.25, temperature=1.5) -> None:
+        self.func = func
+        self.y0 = y0
+        self.sigma = sigma
+        self.temperature = temperature
+
+    def integrate(self, t):
+        solution = torch.empty(len(t), *self.y0.shape, dtype=self.y0.dtype, device=self.y0.device)
+        solution[0] = self.y0
+
+        j = 1
+        y0 = self.y0
+        for t0, t1 in zip(t[:-1], t[1:]):
+            dt = t1 - t0
+            f0 = self.func(t0, y0)
+            dy = dt * f0
+            y1 = y0 + dy
+
+            while j < len(t) and t1 >= t[j]:
+                solution[j] = self._linear_interp(t0, t1, y0, y1, t[j])
+                j += 1
+
+            noise = torch.randn_like(y0)
+            shift = self.sigma * (self.temperature**0.5) * (abs(dt) ** 0.5) * noise
+            y0 = y1 + shift
+
+        return solution
+
+    def _linear_interp(self, t0, t1, y0, y1, t):
+        if t == t0:
+            return y0
+        if t == t1:
+            return y1
+        slope = (t - t0) / (t1 - t0)
+        return y0 + slope * (y1 - y0)
+
+
+def integrate_cfm_steps(fn, y0, t, sde_args, sde_rnd, steps):
+    for step in range(steps):
+        dt = t[step + 1] - t[step]
+        y0 = y0 + fn(t[step], y0) * dt
+        y0 = y0 + sde_args[1] * (sde_args[2] ** 0.5) * (dt.abs() ** 0.5) * sde_rnd[step]
+    return y0

From 98364f10f34ed86522686a72a6366816630f4ba6 Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Sat, 30 May 2026 23:05:40 +0530
Subject: [PATCH 51/54] Simplify Ming TTS documentation links

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 .../examples/offline_inference/ming_tts.md    | 131 +-----
 .../examples/online_serving/ming_tts.md       | 165 +-------
 .../text_to_speech/ming_tts/README.md         | 382 ++----------------
 recipes/inclusionAI/Ming-omni-tts-0.5B.md     | 225 +----------
 4 files changed, 60 insertions(+), 843 deletions(-)

diff --git a/docs/user_guide/examples/offline_inference/ming_tts.md b/docs/user_guide/examples/offline_inference/ming_tts.md
index 6ff31258cba..7c641272473 100644
--- a/docs/user_guide/examples/offline_inference/ming_tts.md
+++ b/docs/user_guide/examples/offline_inference/ming_tts.md
@@ -2,131 +2,8 @@
 
 Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/text_to_speech/ming_tts>.
 
-This directory contains an offline Ming example that uses the in-repo Ming prompt builder directly. It covers the broader upstream dense 0.5B surface: style, IP, music-only generation, TTA, emotion, dialect, zero-shot clone, podcast, speech+bgm, and speech+sound.
-
-## Quick Start
-
-Run a zero-speaker style case:
-
-```bash
-python examples/offline_inference/text_to_speech/ming_tts/end2end.py \
-    --case style \
-    --deploy-config vllm_omni/deploy/ming_tts.yaml \
-    --enforce-eager
-```
-
-Run emotion-controlled speech:
-
-```bash
-python examples/offline_inference/text_to_speech/ming_tts/end2end.py \
-    --case emotion \
-    --ref-audio /path/to/emotion_prompt.wav \
-    --deploy-config vllm_omni/deploy/ming_tts.yaml \
-    --enforce-eager
-```
-
-Run zero-shot cloning with a transcript:
-
-```bash
-python examples/offline_inference/text_to_speech/ming_tts/end2end.py \
-    --case zero_shot \
-    --ref-audio /path/to/reference.wav \
-    --ref-text "在此奉劝大家别乱打美白针。" \
-    --deploy-config vllm_omni/deploy/ming_tts.yaml \
-    --enforce-eager
-```
-
-Run podcast generation:
-
-```bash
-python examples/offline_inference/text_to_speech/ming_tts/end2end.py \
-    --case podcast \
-    --ref-audio-paths /path/to/CTS-CN-F2F-2019-11-11-423-012-A.wav /path/to/CTS-CN-F2F-2019-11-11-423-012-B.wav \
-    --deploy-config vllm_omni/deploy/ming_tts.yaml \
-    --enforce-eager
-```
-
-Run text-to-audio event generation:
-
-```bash
-python examples/offline_inference/text_to_speech/ming_tts/end2end.py \
-    --case tta \
-    --deploy-config vllm_omni/deploy/ming_tts.yaml \
-    --enforce-eager
-```
-
-Run with stats and a manifest:
-
-```bash
-python examples/offline_inference/text_to_speech/ming_tts/end2end.py \
-    --case style \
-    --deploy-config vllm_omni/deploy/ming_tts.yaml \
-    --enforce-eager \
-    --enable-stats \
-    --stats-log-file output_audio/ming_style_pipeline.log \
-    --metadata-json output_audio/ming_style_manifest.json
-```
-
-## Built-in Cases
-
-- `style`: zero-speaker style-conditioned speech
-- `ip`: zero-speaker IP voice generation
-- `bgm`: music generation
-- `tta`: text-to-audio event generation with FlowLoss controls
-- `emotion`: reference-audio speech with emotion control
-- `basic`: reference-audio cloning with speed / pitch / volume control
-- `dialect`: reference-audio cloning with dialect control
-- `zero_shot`: reference-audio cloning with explicit transcript
-- `podcast`: multi-reference dialogue generation with automatic speaker embedding extraction
-- `speech_bgm`: speech with background music conditioning
-- `speech_sound`: speech with environment sound conditioning
-
-## Streaming
-
-Use async_chunk streaming with `AsyncOmni`:
-
-```bash
-python examples/offline_inference/text_to_speech/ming_tts/end2end.py \
-    --case basic \
-    --ref-audio /path/to/10002287-00000095.wav \
-    --streaming \
-    --deploy-config vllm_omni/deploy/ming_tts.yaml \
-    --enforce-eager
-```
-
-`--streaming` currently supports one prompt per process invocation. Use
-blocking mode for `--num-prompts > 1`.
-
-## Validation matrix
-
-The example is intended to cover the dense TTS workflows used by the Ming
-validation helper:
-
-| Case | Blocking | Async chunk | Extra inputs |
-|---|---:|---:|---|
-| `style` | Yes | Optional smoke test | none |
-| `ip` | Yes | Optional smoke test | none |
-| `bgm` | Yes | Optional smoke test | none |
-| `tta` | Yes | Optional smoke test | none |
-| `emotion` | Yes | Yes | reference WAV |
-| `basic` | Yes | Yes | reference WAV |
-| `dialect` | Yes | Yes | reference WAV |
-| `zero_shot` | Yes | Yes | reference WAV and transcript |
-| `podcast` | Yes | Yes | two reference WAVs |
-| `speech_bgm` | Yes | Yes | reference WAV |
-| `speech_sound` | Yes | Yes | reference WAV |
-
-The offline example also exposes vLLM-Omni runtime/reporting controls such as:
-
-- `--num-prompts`
-- `--enable-stats`
-- `--stats-log-file`
-- `--metadata-json`
-- `--stage-init-timeout`
-- `--init-timeout`
-- `--batch-timeout`
-- `--worker-backend`
-- `--ray-address`
+For the TTS model hub, see
+[`examples/offline_inference/text_to_speech/README.md`](https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/text_to_speech/README.md).
 
 ## Example materials
 
@@ -138,3 +15,7 @@ The offline example also exposes vLLM-Omni runtime/reporting controls such as:
     ``````py
     --8<-- "examples/offline_inference/text_to_speech/ming_tts/end2end.py"
     ``````
+??? abstract "cases.yaml"
+    ``````yaml
+    --8<-- "examples/offline_inference/text_to_speech/ming_tts/cases.yaml"
+    ``````
diff --git a/docs/user_guide/examples/online_serving/ming_tts.md b/docs/user_guide/examples/online_serving/ming_tts.md
index 119b5a4e7ea..d59d578c38b 100644
--- a/docs/user_guide/examples/online_serving/ming_tts.md
+++ b/docs/user_guide/examples/online_serving/ming_tts.md
@@ -2,169 +2,8 @@
 
 Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/online_serving/text_to_speech/ming_tts>.
 
-This example shows how to serve Ming through the OpenAI-compatible `/v1/audio/speech` endpoint. The server builds Ming prompts directly with the in-repo prompt builder, so online requests support Ming-specific structured controls instead of the Qwen placeholder path.
-
-## Installation
-
-Please refer to [README.md](https://github.com/vllm-project/vllm-omni/tree/main/README.md)
-
-## Launch the Server
-
-```bash
-vllm-omni serve inclusionAI/Ming-omni-tts-0.5B \
-    --deploy-config vllm_omni/deploy/ming_tts.yaml \
-    --omni \
-    --port 8091 \
-    --enforce-eager
-```
-
-Or:
-
-```bash
-cd examples/online_serving/text_to_speech/ming_tts
-./run_server.sh
-```
-
-The canonical Ming online client is `openai_speech_client.py`. It targets the
-local vLLM-Omni server, not OpenAI's cloud API, so `api_key=EMPTY` is enough
-for local testing.
-
-## Example Requests
-
-Basic TTS:
-
-```bash
-python openai_speech_client.py \
-    --text "你好，这是 Ming 在线语音合成测试。"
-```
-
-Style-conditioned speech:
-
-```bash
-python openai_speech_client.py \
-    --text "我会一直在这里陪着你。" \
-    --instructions "轻柔的ASMR耳语，慢速，贴近麦克风"
-```
-
-Structured Ming control:
-
-```bash
-python openai_speech_client.py \
-    --text "我觉得社会企业同个人都有责任" \
-    --instruction-json '{"方言":"广粤话"}'
-```
-
-IP voice generation:
-
-```bash
-python openai_speech_client.py \
-    --text "这款产品的名字，叫变态坑爹牛肉丸。" \
-    --voice 灵小甄
-```
-
-Reference-audio cloning:
-
-Use `ref_audio` by itself for Ming prompt-waveform conditioning. Add
-`ref_text` when the request is transcript cloning, such as zero-shot or
-podcast-style prompts.
-
-```bash
-python openai_speech_client.py \
-    --task-type Base \
-    --text "我们的愿景是构建未来服务业的数字化基础设施。" \
-    --ref-audio /path/to/reference.wav \
-    --ref-text "在此奉劝大家别乱打美白针。"
-```
-
-Speaker-embedding cloning:
-
-```bash
-python openai_speech_client.py \
-    --task-type Base \
-    --text "你好，这是一段使用说话人向量的合成语音。" \
-    --speaker-embedding /path/to/ming_speaker_embedding.json
-```
-
-Streaming PCM:
-
-```bash
-python openai_speech_client.py \
-    --text "你好，这是流式输出测试。" \
-    --instructions "平静，普通话" \
-    --stream \
-    --output ming_output.pcm
-```
-
-## Curl Helper
-
-Use the bundled helper for common request types:
-
-```bash
-./run_curl.sh basic
-./run_curl.sh style
-./run_curl.sh ip
-REF_AUDIO=/path/to/emotion_prompt.wav ./run_curl.sh emotion
-REF_AUDIO=/path/to/yue_prompt.wav ./run_curl.sh dialect
-REF_AUDIO=/path/to/reference.wav REF_TEXT="在此奉劝大家别乱打美白针。" ./run_curl.sh zero_shot
-REF_AUDIO=/path/to/speaker_1.wav REF_AUDIO_2=/path/to/speaker_2.wav REF_TEXT="speaker_1:你好。 speaker_2:你好。" ./run_curl.sh podcast
-REF_AUDIO=/path/to/00000309-00000300.wav ./run_curl.sh speech_bgm
-REF_AUDIO=/path/to/00000309-00000300.wav ./run_curl.sh speech_sound
-REF_AUDIO=/path/to/reference.wav REF_TEXT="在此奉劝大家别乱打美白针。" ./run_curl.sh clone_ref_audio
-SPEAKER_EMBEDDING=/path/to/ming_speaker_embedding.json ./run_curl.sh clone_embedding
-./run_curl.sh stream
-```
-
-## Audio Inputs
-
-- `ref_audio` accepts a local path, remote URL, or `data:` URL
-- The Python client converts local files into a base64 `data:` URL
-- `speaker_embedding` must be a JSON file with exactly 192 numeric values
-- Ming prompt-waveform cases can use `ref_audio` without `ref_text`
-- Zero-shot and podcast-style transcript cloning should include `ref_text`
-
-The bundled `run_curl.sh basic` mode is plain/default TTS and does not require
-`REF_AUDIO`. The upstream cookbook-style `basic` case uses `ref_audio` plus
-structured speed / pitch / volume instructions.
-
-## Request Types
-
-Ming online serving supports these request families through `/v1/audio/speech`:
-
-| Case | Online support | Required fields |
-|------|----------------|-----------------|
-| default TTS | Supported | `input`, `max_new_tokens=200` |
-| `style` | Supported | `input`, `instructions`, `max_new_tokens=200` |
-| `ip` | Supported | `input`, `voice`, `max_new_tokens=200` |
-| `basic` helper | Supported | `input`, `max_new_tokens=200` |
-| upstream `basic` case | Supported | `input`, `ref_audio`, structured speed / pitch / volume `instructions`, `max_new_tokens=200` |
-| `emotion` | Supported | `input`, `ref_audio`, structured emotion `instructions`, `max_new_tokens=200` |
-| `dialect` | Supported | `input`, `language` or structured `instructions`, `ref_audio`, `max_new_tokens=200` |
-| `zero_shot` | Supported | `input`, `ref_audio`, `ref_text`, `max_new_tokens=200` |
-| `podcast` | Supported | `input`, repeated/list `ref_audio`, `ref_text`, `max_new_tokens=200` |
-| `speech_bgm` | Supported | `input`, `ref_audio`, structured `instructions` with `{"BGM": ...}`, `max_new_tokens=200` |
-| `speech_sound` | Supported | `input`, `ref_audio`, structured `instructions` with `{"BGM": {"ENV": ...}}`, `max_new_tokens=200` |
-| `bgm` | Not supported online | Requires a future `prompt_mode=music` API extension |
-| `tta` | Not supported online | Requires a future `prompt_mode=tta` API extension |
-
-The online endpoint is speech-shaped today. Music-only `bgm` and text-to-audio
-`tta` remain offline workflows.
-
-## Field Mapping
-
-For Ming, the generic OpenAI request fields map to Ming controls like this:
-
-- `input` -> target text
-- `instructions` -> Ming instruction string, or a JSON string for the structured Ming control object
-- `voice` -> Ming `IP`
-- `language` -> Ming `方言`
-- `ref_audio` -> Ming prompt waveform
-- `ref_text` -> optional transcript for zero-shot and podcast-style cloning
-- `speaker_embedding` -> 192-d Ming speaker embedding
-
-## Voice Listing
-
-- `/v1/audio/voices` lists uploaded voices for Ming.
-- Built-in Ming IP labels can still be used as `voice`, but they are not enumerated by the API.
+For the online TTS serving hub, see
+[`examples/online_serving/text_to_speech/README.md`](https://github.com/vllm-project/vllm-omni/tree/main/examples/online_serving/text_to_speech/README.md).
 
 ## Example materials
 
diff --git a/examples/online_serving/text_to_speech/ming_tts/README.md b/examples/online_serving/text_to_speech/ming_tts/README.md
index f75d737eda2..da553dabb94 100644
--- a/examples/online_serving/text_to_speech/ming_tts/README.md
+++ b/examples/online_serving/text_to_speech/ming_tts/README.md
@@ -1,16 +1,9 @@
-# Ming-omni-tts
+# Ming-omni-tts Online Serving
 
-## Installation
+Serve the dense `inclusionAI/Ming-omni-tts-0.5B` two-stage TTS model through
+the OpenAI-compatible `/v1/audio/speech` endpoint.
 
-Please refer to [README.md](../../../README.md)
-
-## Ming Model
-
-| Model | Description |
-|-------|-------------|
-| `inclusionAI/Ming-omni-tts-0.5B` | Dense 0.5B Ming two-stage TTS model for speech generation with dialect, style, IP voice, and cloning controls |
-
-## Launch the Server
+## Start Server
 
 ```bash
 vllm-omni serve inclusionAI/Ming-omni-tts-0.5B \
@@ -20,33 +13,17 @@ vllm-omni serve inclusionAI/Ming-omni-tts-0.5B \
     --enforce-eager
 ```
 
-Or use the convenience script:
+Or:
 
 ```bash
 cd examples/online_serving/text_to_speech/ming_tts
 ./run_server.sh
 ```
 
-The recommended online-serving path is eager async-chunk mode through
-`/v1/audio/speech`. `run_server.sh` defaults to:
-
-- model: `inclusionAI/Ming-omni-tts-0.5B`
-- deploy config: `vllm_omni/deploy/ming_tts.yaml`
-- auth: local testing only, no real OpenAI key required
-
 ## Send Requests
 
-The canonical Ming online client is:
-
-```bash
-cd examples/online_serving/text_to_speech/ming_tts
-python openai_speech_client.py --text "你好，世界"
-```
-
-This talks to the local vLLM-Omni server at `http://localhost:8091/v1` and
-uses `api_key=EMPTY`. It does not call OpenAI's cloud API.
-
-### Basic TTS
+The Python client targets `http://localhost:8091/v1` with `api_key=EMPTY`; it
+does not call OpenAI's hosted API.
 
 ```bash
 python openai_speech_client.py \
@@ -54,16 +31,7 @@ python openai_speech_client.py \
     --max-new-tokens 200
 ```
 
-### Style-conditioned speech without a reference clip
-
-```bash
-python openai_speech_client.py \
-    --text "我会一直在这里陪着你。" \
-    --instructions "轻柔的ASMR耳语，慢速，贴近麦克风" \
-    --max-new-tokens 200
-```
-
-### Structured Ming control via JSON
+Style or dialect controls can be plain text or Ming JSON:
 
 ```bash
 python openai_speech_client.py \
@@ -72,33 +40,7 @@ python openai_speech_client.py \
     --max-new-tokens 200
 ```
 
-### IP voice generation
-
-```bash
-python openai_speech_client.py \
-    --text "这款产品的名字，叫变态坑爹牛肉丸。" \
-    --voice 灵小甄 \
-    --max-new-tokens 200
-```
-
-### Reference-audio cloning
-
-Ming has two reference-audio paths:
-
-- prompt-waveform conditioning, where `ref_audio` steers the voice/style and
-  `ref_text` is not required
-- transcript cloning, where `ref_audio` and `ref_text` are paired
-
-```bash
-python openai_speech_client.py \
-    --task-type Base \
-    --text "我们的愿景是构建未来服务业的数字化基础设施。" \
-    --ref-audio /path/to/reference.wav \
-    --max-new-tokens 200
-```
-
-Pass `--ref-text` when the prompt case needs a transcript, such as zero-shot
-voice cloning:
+Reference-audio cloning:
 
 ```bash
 python openai_speech_client.py \
@@ -109,29 +51,26 @@ python openai_speech_client.py \
     --max-new-tokens 200
 ```
 
-### Podcast-style multi-speaker prompt
+Podcast-style multi-speaker prompt:
 
 ```bash
 python openai_speech_client.py \
     --text "speaker_1:你可以说一下。 speaker_2:我也不知道。" \
     --ref-audio /path/to/speaker_1.wav \
     --ref-audio /path/to/speaker_2.wav \
-    --ref-text "在此奉劝大家别乱打美白针。"
+    --ref-text "speaker_1:你好。 speaker_2:你好。"
 ```
 
-### x-vector style cloning with a precomputed embedding
+Streaming PCM:
 
 ```bash
 python openai_speech_client.py \
-    --task-type Base \
-    --text "你好，这是一段使用说话人向量的合成语音。" \
-    --speaker-embedding /path/to/ming_speaker_embedding.json \
-    --max-new-tokens 200
+    --text "你好，这是流式输出测试。" \
+    --stream \
+    --output ming_output.pcm
 ```
 
-### Curl examples
-
-`run_curl.sh` is intentionally small now. It keeps only three sanity checks:
+`run_curl.sh` provides a few quick smoke checks:
 
 ```bash
 ./run_curl.sh basic
@@ -139,278 +78,23 @@ REF_AUDIO=/path/to/reference.wav REF_TEXT="在此奉劝大家别乱打美白针
 ./run_curl.sh stream
 ```
 
-For the broader request cookbook, use direct `curl` payloads in this README.
-
-Basic speech:
-
-```bash
-curl -X POST http://localhost:8091/v1/audio/speech \
-    -H "Content-Type: application/json" \
-    -H "Authorization: Bearer EMPTY" \
-    -d '{
-        "model": "inclusionAI/Ming-omni-tts-0.5B",
-        "input": "你好，这是 Ming 在线语音合成测试。",
-        "response_format": "wav"
-    }' \
-    --output ming_output.wav
-```
-
-Style-conditioned speech:
-
-```bash
-curl -X POST http://localhost:8091/v1/audio/speech \
-    -H "Content-Type: application/json" \
-    -H "Authorization: Bearer EMPTY" \
-    -d '{
-        "model": "inclusionAI/Ming-omni-tts-0.5B",
-        "input": "我会一直在这里陪着你。",
-        "instructions": "轻柔的ASMR耳语，慢速，贴近麦克风",
-        "response_format": "wav"
-    }' \
-    --output ming_style.wav
-```
-
-IP voice generation:
-
-```bash
-curl -X POST http://localhost:8091/v1/audio/speech \
-    -H "Content-Type: application/json" \
-    -H "Authorization: Bearer EMPTY" \
-    -d '{
-        "model": "inclusionAI/Ming-omni-tts-0.5B",
-        "input": "这款产品的名字，叫变态坑爹牛肉丸。",
-        "voice": "灵小甄",
-        "response_format": "wav"
-    }' \
-    --output ming_ip.wav
-```
-
-Dialect control with structured instructions:
-
-```bash
-curl -X POST http://localhost:8091/v1/audio/speech \
-    -H "Content-Type: application/json" \
-    -H "Authorization: Bearer EMPTY" \
-    -d '{
-        "model": "inclusionAI/Ming-omni-tts-0.5B",
-        "input": "我觉得社会企业同个人都有责任",
-        "instructions": "{\"方言\":\"广粤话\"}",
-        "ref_audio": "data:audio/wav;base64,<BASE64_WAV>",
-        "response_format": "wav"
-    }' \
-    --output ming_dialect.wav
-```
-
-Zero-shot cloning with transcript:
-
-```bash
-curl -X POST http://localhost:8091/v1/audio/speech \
-    -H "Content-Type: application/json" \
-    -H "Authorization: Bearer EMPTY" \
-    -d '{
-        "model": "inclusionAI/Ming-omni-tts-0.5B",
-        "input": "我们的愿景是构建未来服务业的数字化基础设施。",
-        "ref_audio": "data:audio/wav;base64,<BASE64_WAV>",
-        "ref_text": "在此奉劝大家别乱打美白针。",
-        "response_format": "wav"
-    }' \
-    --output ming_zero_shot.wav
-```
-
-Podcast-style multi-speaker prompt:
-
-```bash
-curl -X POST http://localhost:8091/v1/audio/speech \
-    -H "Content-Type: application/json" \
-    -H "Authorization: Bearer EMPTY" \
-    -d '{
-        "model": "inclusionAI/Ming-omni-tts-0.5B",
-        "input": "speaker_1:你可以说一下。 speaker_2:我也不知道。",
-        "ref_audio": [
-            "data:audio/wav;base64,<BASE64_SPK1>",
-            "data:audio/wav;base64,<BASE64_SPK2>"
-        ],
-        "ref_text": "speaker_1:你好。 speaker_2:你好。",
-        "response_format": "wav"
-    }' \
-    --output ming_podcast.wav
-```
-
-Speaker-embedding cloning:
-
-```bash
-curl -X POST http://localhost:8091/v1/audio/speech \
-    -H "Content-Type: application/json" \
-    -H "Authorization: Bearer EMPTY" \
-    -d '{
-        "model": "inclusionAI/Ming-omni-tts-0.5B",
-        "input": "你好，这是一段使用说话人向量的合成语音。",
-        "speaker_embedding": [0.0, 0.0, 0.0],
-        "response_format": "wav"
-    }' \
-    --output ming_embedding.wav
-```
-
-Streaming PCM response:
-
-```bash
-curl -N -X POST http://localhost:8091/v1/audio/speech \
-    -H "Content-Type: application/json" \
-    -H "Authorization: Bearer EMPTY" \
-    -d '{
-        "model": "inclusionAI/Ming-omni-tts-0.5B",
-        "input": "你好，这是流式测试。",
-        "stream": true,
-        "response_format": "pcm"
-    }' \
-    --output ming_stream.pcm
-```
-
-## Request Types
-
-Ming online serving supports these main request families through
-`/v1/audio/speech`:
-
-| Case | Online support | Required fields |
-|------|----------------|-----------------|
-| default TTS | Supported | `input`, `max_new_tokens=200` |
-| `style` | Supported | `input`, `instructions`, `max_new_tokens=200` |
-| `ip` | Supported | `input`, `voice`, `max_new_tokens=200` |
-| `basic` helper | Supported | `input`, `max_new_tokens=200` |
-| upstream `basic` case | Supported | `input`, `ref_audio`, structured speed / pitch / volume `instructions`, `max_new_tokens=200` |
-| `emotion` | Supported | `input`, `ref_audio`, structured emotion `instructions`, `max_new_tokens=200` |
-| `dialect` | Supported | `input`, `language` or structured `instructions`, `ref_audio`, `max_new_tokens=200` |
-| `zero_shot` | Supported | `input`, `ref_audio`, `ref_text`, `max_new_tokens=200` |
-| `podcast` | Supported | `input`, repeated/list `ref_audio`, `ref_text`, `max_new_tokens=200` |
-| `speech_bgm` | Supported | `input`, `ref_audio`, structured `instructions` with `{"BGM": ...}`, `max_new_tokens=200` |
-| `speech_sound` | Supported | `input`, `ref_audio`, structured `instructions` with `{"BGM": {"ENV": ...}}`, `max_new_tokens=200` |
-| `bgm` | Not supported online | Requires a future `prompt_mode=music` API extension |
-
-This matrix intentionally mirrors the local online validation flow. The
-music-only `bgm` case remains offline-only because `/v1/audio/speech` always
-uses Ming's speech prompt path today.
-
-## Output
-
-- Non-streaming requests return full audio bytes, usually written to `.wav`
-- WAV outputs are expected to be readable at 44.1kHz
-- Streaming requests return progressive PCM bytes; wrap or convert them to WAV
-  before browser playback
-- The default Python client outputs:
-  - `ming_output.wav` for non-streaming
-  - `ming_output.pcm` for streaming
-
-## Validated Outputs
-
-Validation on an L4 GPU passed the online async_chunk `/v1/audio/speech` flow
-for every speech-mode case in the local validation script:
-
-| Case | Output | Size bytes | Sample rate | Frames |
-|------|--------|-----------:|------------:|-------:|
-| `style` | WAV | 790316 | 44100 | 395136 |
-| `ip` | WAV | 366956 | 44100 | 183456 |
-| `basic` | WAV | 536300 | 44100 | 268128 |
-| `emotion` | WAV | 649196 | 44100 | 324576 |
-| `dialect` | WAV | 395180 | 44100 | 197568 |
-| `zero_shot` | WAV | 931436 | 44100 | 465696 |
-| `podcast` | WAV | 846764 | 44100 | 423360 |
-| `speech_bgm` | WAV | 677420 | 44100 | 338688 |
-| `speech_sound` | WAV | 649196 | 44100 | 324576 |
-| `streaming` | PCM | 338688 | N/A | N/A |
-
-`bgm` is intentionally not included in the online pass list. It is a
-music-prompt workflow, while `/v1/audio/speech` currently routes Ming through
-the speech prompt path.
-
-## Performance
-
-Benchmark via `/v1/audio/speech`, `inclusionAI/Ming-omni-tts-0.5B`,
-10 prompts, concurrency 1, eager mode:
-
-| Config | Mean TTFP | Mean E2E | Mean RTF |
-|--------|----------:|---------:|---------:|
-| Sequential eager | 3354.83ms | 3357.01ms | 0.561 |
-| Async chunk eager | 3450.28ms | 3452.35ms | 0.577 |
-
-## Audio Inputs
-
-- `ref_audio` accepts:
-  - a local file path
-  - a remote `http://` or `https://` URL
-  - a `data:` URL
-  - repeated values for podcast-style multi-speaker prompts
-- `openai_speech_client.py` converts local reference audio files into a base64
-  `data:` URL before sending them to the server
-- `speaker_embedding` must be a JSON file containing exactly 192 numeric values
-- Ming prompt-waveform cases can use `ref_audio` without `ref_text`
-- Zero-shot and podcast-style transcript cloning should include `ref_text`
-
-## API Field Mapping
-
-The OpenAI-compatible `/v1/audio/speech` endpoint stays generic. Ming-specific controls are mapped like this:
-
-- `input` -> target text
-- `instructions` -> Ming instruction string, or a JSON string that becomes the structured Ming control object
-- `voice` -> Ming `IP` field when using built-in character voices
-- `language` -> Ming `方言` field
-- `ref_audio` -> Ming `prompt_waveform`
-- `ref_text` -> Ming `prompt_text`
-- `speaker_embedding` -> 192-d Ming speaker embedding
-- `max_new_tokens` -> Ming `max_decode_steps`
-
-## Voice Listing
-
-- `/v1/audio/voices` reflects uploaded voices for Ming.
-- Built-in Ming IP labels like `灵小甄` are passed through as `voice` values, but they are not enumerated by the API.
-
-## Streaming
-
-Use `stream=true` to get progressive PCM output:
-
-```bash
-python openai_speech_client.py \
-    --text "你好，这是流式输出测试。" \
-    --instructions "平静，普通话" \
-    --stream \
-    --output ming_output.pcm
-```
-
-## Not Supported Online Yet
-
-`bgm` music-prompt generation is not exposed through `/v1/audio/speech` today.
-It needs a future `prompt_mode=music` API extension so the server can select
-Ming's music system prompt instead of the speech system prompt.
-
-## Troubleshooting
-
-### No real OpenAI key
-
-The example targets a local vLLM-Omni server. `api_key=EMPTY` is expected and
-is sufficient for local testing.
-
-### `--ref-audio` fails
-
-- Confirm the local file exists
-- If using zero-shot or podcast transcript cloning, also provide `--ref-text`
-- If passing a URL, make sure the server can fetch it
-
-### `--speaker-embedding` fails
-
-- Make sure the JSON file contains exactly 192 numeric values
-- Do not wrap the list in another object
-
-### Connection refused
-
-- Check that the server is running on `localhost:8091`
-- Confirm the stage config path is correct
-
-### No audio or wrong output file
+## Request Fields
 
-- Use non-streaming for `.wav`
-- Use `--stream` for `.pcm`
+| Field | Ming meaning |
+|-------|--------------|
+| `input` | target text |
+| `instructions` | plain style text, or JSON object for structured Ming controls |
+| `voice` | Ming IP voice label unless it resolves to an uploaded speaker |
+| `language` | Ming `方言` control |
+| `ref_audio` | prompt waveform; repeat/list values for podcast prompts |
+| `ref_text` | transcript for zero-shot or podcast cloning |
+| `speaker_embedding` | 192-d Ming speaker embedding |
+| `max_new_tokens` | Ming `max_decode_steps` |
 
-### `bgm` is missing online
+## Notes
 
-Use the offline example for music-only `bgm`. Online support needs an explicit
-Ming prompt-mode API extension so the server can select the music prompt
-instead of the speech prompt.
+- `ref_audio` accepts remote URLs, `file://` URLs, or `data:` URLs; the Python
+  client also takes local paths and sends them as base64 `data:` URLs.
+- Non-streaming responses return WAV bytes; streaming responses return PCM.
+- Music-only `bgm` generation is offline-only until the API exposes Ming
+  prompt-mode selection.
diff --git a/recipes/inclusionAI/Ming-omni-tts-0.5B.md b/recipes/inclusionAI/Ming-omni-tts-0.5B.md
index 1c1422bd245..c5dc92a7529 100644
--- a/recipes/inclusionAI/Ming-omni-tts-0.5B.md
+++ b/recipes/inclusionAI/Ming-omni-tts-0.5B.md
@@ -1,47 +1,22 @@
 # Ming-omni-tts 0.5B
 
-> Offline and online TTS/audio generation with the dense Ming two-stage AR + Flow/VAE pipeline
-
 ## Summary
 
 - Vendor: inclusionAI
 - Model: `inclusionAI/Ming-omni-tts-0.5B`
-- Task: Text-to-speech, voice/style control, zero-shot cloning, podcast-style multi-speaker generation, and text-to-audio/music cases
-- Mode: Offline `Omni` / `AsyncOmni` and online OpenAI-compatible `/v1/audio/speech`
-- Maintainer: Community
-
-## When to use this recipe
-
-Use this recipe when you want to run the dense 0.5B Ming TTS model through
-vLLM-Omni's two-stage pipeline:
-
-- Stage 0: Qwen2-based autoregressive backbone with inline Ming flow controls
-- Stage 1: audio VAE decode to mono 44.1 kHz waveform
-
-The verified flow covers blocking offline generation, async-chunk offline
-generation, and online serving for speech cases. Music-only `bgm` and `tta`
-are covered by offline inference; the online `/v1/audio/speech` endpoint does
-not yet expose the corresponding `prompt_mode` fields.
-
-## References
+- Deploy config: `vllm_omni/deploy/ming_tts.yaml`
+- Pipeline: dense two-stage AR + Flow/VAE Ming TTS
+- Output: mono 44.1 kHz audio
 
-- Hugging Face model:
-  [`inclusionAI/Ming-omni-tts-0.5B`](https://huggingface.co/inclusionAI/Ming-omni-tts-0.5B)
-- Offline example:
-  [`examples/offline_inference/text_to_speech/ming_tts/`](../../examples/offline_inference/text_to_speech/ming_tts/)
-- Online example:
-  [`examples/online_serving/text_to_speech/ming_tts/`](../../examples/online_serving/text_to_speech/ming_tts/)
-- Deploy config:
-  [`vllm_omni/deploy/ming_tts.yaml`](../../vllm_omni/deploy/ming_tts.yaml)
+## Examples
 
-## Installing vLLM-Omni
+- Offline: [`examples/offline_inference/text_to_speech/ming_tts/`](../../examples/offline_inference/text_to_speech/ming_tts/)
+- Online: [`examples/online_serving/text_to_speech/ming_tts/`](../../examples/online_serving/text_to_speech/ming_tts/)
 
-Use a fresh Python environment. The verified run used vLLM `0.21.0` with the
-CUDA 13 PyTorch stack.
+## Install
 
 ```bash
 export VLLM_VERSION="0.21.0"
-
 uv venv
 source .venv/bin/activate
 uv pip install vllm==$VLLM_VERSION --torch-backend=cu130
@@ -49,27 +24,7 @@ uv pip install -e .
 uv pip install soundfile pyyaml openai aiohttp huggingface_hub
 ```
 
-## Hardware Support
-
-## GPU
-
-### 1x A100 40GB
-
-#### Environment
-
-- OS: Linux
-- Python: 3.12.13
-- GPU: NVIDIA A100-SXM4-40GB, 40960 MiB
-- Driver: 580.82.07
-- PyTorch: `2.11.0+cu130`
-- CUDA runtime reported by PyTorch: 13.0
-- vLLM version: 0.21.0
-- vLLM-Omni branch / commit: `feat/ming-omni-tts-dense` / `4d923c708099939178e932ff153c63749b430fd1`
-- Deploy config: `vllm_omni/deploy/ming_tts.yaml`
-
-#### Offline Command
-
-Run a single blocking case:
+## Offline
 
 ```bash
 python examples/offline_inference/text_to_speech/ming_tts/end2end.py \
@@ -79,168 +34,26 @@ python examples/offline_inference/text_to_speech/ming_tts/end2end.py \
   --enforce-eager
 ```
 
-Run a streaming async-chunk case:
-
-```bash
-python examples/offline_inference/text_to_speech/ming_tts/end2end.py \
-  --model inclusionAI/Ming-omni-tts-0.5B \
-  --case basic \
-  --ref-audio /path/to/10002287-00000095.wav \
-  --streaming \
-  --deploy-config vllm_omni/deploy/ming_tts.yaml \
-  --enforce-eager
-```
-
-The offline example includes 11 built-in cases: `style`, `ip`, `bgm`, `tta`,
-`emotion`, `basic`, `dialect`, `zero_shot`, `podcast`, `speech_bgm`, and
-`speech_sound`.
+The offline example owns the full case list, including speech, zero-shot,
+podcast, text-to-audio, and music-style workflows.
 
-#### Online Command
-
-Start the OpenAI-compatible speech server:
+## Online
 
 ```bash
 vllm-omni serve inclusionAI/Ming-omni-tts-0.5B \
   --deploy-config vllm_omni/deploy/ming_tts.yaml \
-  --host 127.0.0.1 \
-  --port 8091 \
-  --enforce-eager \
   --omni \
-  --stage-init-timeout 600 \
-  --init-timeout 900 \
-  --log-stats
-```
-
-Or use the bundled helper:
-
-```bash
-cd examples/online_serving/text_to_speech/ming_tts
-./run_server.sh
-```
-
-#### Verification
-
-Basic speech:
-
-```bash
-curl -X POST http://127.0.0.1:8091/v1/audio/speech \
-  -H "Content-Type: application/json" \
-  -H "Authorization: Bearer EMPTY" \
-  -d '{
-    "model": "inclusionAI/Ming-omni-tts-0.5B",
-    "input": "你好，这是 Ming 在线语音合成测试。",
-    "response_format": "wav",
-    "max_new_tokens": 200
-  }' \
-  --output ming_basic.wav
-```
-
-Style-conditioned speech:
-
-```bash
-curl -X POST http://127.0.0.1:8091/v1/audio/speech \
-  -H "Content-Type: application/json" \
-  -H "Authorization: Bearer EMPTY" \
-  -d '{
-    "model": "inclusionAI/Ming-omni-tts-0.5B",
-    "input": "我会一直在这里陪着你，直到你慢慢、慢慢地沉入那个最温柔的梦里……好吗？",
-    "instructions": "{\"风格\":\"ASMR耳语，轻柔普通话，音量极低，语速极慢\"}",
-    "response_format": "wav",
-    "max_new_tokens": 200
-  }' \
-  --output ming_style.wav
-```
-
-Zero-shot cloning with reference audio and transcript:
-
-```bash
-curl -X POST http://127.0.0.1:8091/v1/audio/speech \
-  -H "Content-Type: application/json" \
-  -H "Authorization: Bearer EMPTY" \
-  -d '{
-    "model": "inclusionAI/Ming-omni-tts-0.5B",
-    "input": "我们的愿景是构建未来服务业的数字化基础设施。",
-    "task_type": "Base",
-    "ref_audio": "data:audio/wav;base64,<BASE64_WAV>",
-    "ref_text": "在此奉劝大家别乱打美白针。",
-    "response_format": "wav",
-    "max_new_tokens": 200
-  }' \
-  --output ming_zero_shot.wav
+  --port 8091 \
+  --enforce-eager
 ```
 
-Streaming PCM:
-
 ```bash
-curl -X POST http://127.0.0.1:8091/v1/audio/speech \
-  -H "Content-Type: application/json" \
-  -H "Authorization: Bearer EMPTY" \
-  -d '{
-    "model": "inclusionAI/Ming-omni-tts-0.5B",
-    "input": "你好，这是 Ming 在线流式语音合成测试。",
-    "instructions": "平静，普通话",
-    "response_format": "pcm",
-    "stream": true,
-    "max_new_tokens": 200
-  }' \
-  --output ming_streaming.pcm
+python examples/online_serving/text_to_speech/ming_tts/openai_speech_client.py \
+  --text "你好，这是 Ming 在线语音合成测试。" \
+  --max-new-tokens 200
 ```
 
-## Key Parameters
-
-| Parameter | Scope | Description |
-|---|---|---|
-| `--deploy-config` | Offline / online | Use `vllm_omni/deploy/ming_tts.yaml` for the two-stage Ming pipeline |
-| `--enforce-eager` | Offline / online | Recommended and used by the verified run |
-| `--case` | Offline | Built-in case name from `cases.yaml` |
-| `--streaming` | Offline | Uses `AsyncOmni` and async-chunk transfer |
-| `voice` | Online | Selects a built-in IP voice such as `灵小甄` |
-| `instructions` | Online | Free-form text or JSON-encoded Ming controls such as style, emotion, dialect, BGM, or environmental sound |
-| `ref_audio` | Online | Reference audio, usually sent as a data URL for HTTP requests |
-| `ref_text` | Online | Transcript paired with `ref_audio` for zero-shot cloning |
-| `task_type` | Online | Use `Base` for reference-audio cloning requests |
-| `response_format` | Online | `wav` for complete audio or `pcm` for streaming |
-| `stream` | Online | Set `true` with `response_format="pcm"` for streaming output |
-| `max_new_tokens` | Online | Upper bound for speech token generation |
-
-## Verified Results
-
-The following measurements came from the result summaries in
-`/home/aja/Music/mingE2E27may` for commit
-`4d923c708099939178e932ff153c63749b430fd1`. Each case used one warmup run and
-one measured run on 1x A100 40GB. Memory peak was not available in the captured
-stats.
-
-### Offline
-
-| Mode | Cases | E2E RTF | Elapsed range | TTFP |
-|---|---:|---:|---:|---:|
-| Blocking | 11 / 11 | 0.5011 - 0.6090, avg 0.5568 | 2.3541s - 15.0980s | N/A |
-| Async chunk streaming | 11 / 11 | 0.4936 - 0.6079, avg 0.5468 | 2.2731s - 14.8571s | 2.2692s - 4.7519s, avg 4.0078s |
-
-Offline blocking and async-chunk streaming both completed all 11 cases:
-`style`, `ip`, `bgm`, `tta`, `emotion`, `basic`, `dialect`, `zero_shot`,
-`podcast`, `speech_bgm`, and `speech_sound`.
-
-### Online
-
-Server startup was 110.01s. The `/v1/audio/speech` endpoint returned HTTP 200
-for the warmup request, 9 WAV speech cases, and one streaming PCM request.
-
-| Request group | Cases | E2E RTF / latency |
-|---|---:|---:|
-| WAV speech cases | 9 | RTF 0.5208 - 1.6646, avg 0.7622; elapsed 2.38s - 15.98s |
-| Streaming PCM smoke test | 1 | elapsed 2.43s; TTFP 2.423s |
-
-Online WAV cases verified: `style`, `ip`, `basic`, `emotion`, `dialect`,
-`zero_shot`, `podcast`, `speech_bgm`, and `speech_sound`.
-
-## Notes
+## Hardware
 
-- The deploy config sets `async_chunk: true`, `dtype: bfloat16`, and
-  `trust_remote_code: false`.
-- Stage 0 and Stage 1 both run on logical device `0` in the bundled config.
-- The verified online route skips `bgm` and `tta` because `/v1/audio/speech`
-  does not yet expose `prompt_mode=music` or `prompt_mode=tta`.
-- Reference-audio fixtures used by the validation come from
-  `inclusionAI/Ming-omni-tts/data/wavs`.
+Validated on NVIDIA A100 40GB and L4-class GPUs. Local CPU-only environments
+are suitable for static checks, but functional Ming generation requires CUDA.

From 1eabd8767dbf8b7c735ca5560772e76bb366f14a Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Sat, 30 May 2026 23:12:15 +0530
Subject: [PATCH 52/54] Remove redundant Ming DIT checks

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 .../model_executor/models/ming_utils/dit.py   | 30 ++-----------------
 1 file changed, 2 insertions(+), 28 deletions(-)

diff --git a/vllm_omni/model_executor/models/ming_utils/dit.py b/vllm_omni/model_executor/models/ming_utils/dit.py
index 0210e160568..3b4ab112230 100644
--- a/vllm_omni/model_executor/models/ming_utils/dit.py
+++ b/vllm_omni/model_executor/models/ming_utils/dit.py
@@ -11,19 +11,12 @@ def __init__(self, dim, eps=1e-6):
         super().__init__()
         self.eps = eps
         self.weight = nn.Parameter(torch.ones(dim))
-        self.native_rms_norm = float(torch.__version__[:3]) >= 2.4
 
     def forward(self, x):
-        if self.native_rms_norm:
-            if self.weight.dtype in [torch.float16, torch.bfloat16]:
-                x = x.to(self.weight.dtype)
-            return F.rms_norm(x, normalized_shape=(x.shape[-1],), weight=self.weight, eps=self.eps)
-
-        variance = x.to(torch.float32).pow(2).mean(-1, keepdim=True)
-        x = x * torch.rsqrt(variance + self.eps)
         if self.weight.dtype in [torch.float16, torch.bfloat16]:
             x = x.to(self.weight.dtype)
-        return x * self.weight
+        x = F.rms_norm(x, normalized_shape=(x.shape[-1],), weight=self.weight, eps=self.eps)
+        return x
 
 
 class FeedForward(nn.Module):
@@ -52,9 +45,6 @@ def __init__(
         attn_mask_enabled=True,
     ):
         super().__init__()
-        if not hasattr(F, "scaled_dot_product_attention"):
-            raise ImportError("SDPA requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
-
         self.dim = dim
         self.heads = heads
         self.inner_dim = dim_head * heads
@@ -78,16 +68,6 @@ def __init__(
         self.attn_mask_enabled = attn_mask_enabled
 
     def forward(self, x, mask=None, rope=None):
-        if x.ndim != 3:
-            raise ValueError(f"Expected x rank-3 [Batch, Time, Dimension], got {tuple(x.shape)}")
-        if x.shape[-1] != self.dim:
-            raise ValueError(f"x feature dim mismatch: got {x.shape[-1]}, expected {self.dim}")
-        if mask is not None:
-            if mask.ndim != 2:
-                raise ValueError(f"Expected mask rank-2 [Batch, Time], got {tuple(mask.shape)}")
-            if mask.shape[0] != x.shape[0] or mask.shape[1] != x.shape[1]:
-                raise ValueError(f"Mask shape mismatch: got {tuple(mask.shape)}, expected {tuple(x.shape[:2])}")
-
         batch_size = x.shape[0]
         query = self.to_q(x)
         key = self.to_k(x)
@@ -95,11 +75,8 @@ def forward(self, x, mask=None, rope=None):
 
         inner_dim = key.shape[-1]
         head_dim = inner_dim // self.heads
-        # [Batch, Time, Dimension] -> [Batch, Heads, Time, HeadDimension].
         query = query.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
-        # [Batch, Time, Dimension] -> [Batch, Heads, Time, HeadDimension].
         key = key.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
-        # [Batch, Time, Dimension] -> [Batch, Heads, Time, HeadDimension].
         value = value.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
 
         if self.q_norm is not None:
@@ -135,7 +112,6 @@ def forward(self, x, mask=None, rope=None):
             final_output[valid_sample_indices] = x
             x = final_output
 
-        # [Batch, Heads, Time, HeadDimension] -> [Batch, Time, Dimension].
         x = x.transpose(1, 2).reshape(batch_size, -1, self.heads * head_dim)
         x = x.to(query.dtype)
         x = self.to_out[0](x)
@@ -198,8 +174,6 @@ def __init__(self, input_feature_size, hidden_size):
         self.cond_embedder = nn.Linear(input_feature_size, hidden_size)
 
     def forward(self, llm_cond):
-        if llm_cond.ndim != 3:
-            raise ValueError(f"Expected conditioning rank-3 [Batch, Time, Dimension], got {tuple(llm_cond.shape)}")
         return self.cond_embedder(llm_cond)
 
 

From 149b0c091c5c6161bd3661c114bb958d5752d5a2 Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Mon, 1 Jun 2026 00:21:04 +0530
Subject: [PATCH 53/54] Fix Ming Flash Omni transformer compatibility

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 .../ming_flash_omni_thinker.py                | 18 +++++---
 .../modeling_bailing_moe_v2.py                |  2 +-
 .../configs/ming_flash_omni.py                |  2 +
 .../transformers_utils/processors/ming.py     | 41 +++++++++++++++----
 4 files changed, 48 insertions(+), 15 deletions(-)

diff --git a/vllm_omni/model_executor/models/ming_flash_omni/ming_flash_omni_thinker.py b/vllm_omni/model_executor/models/ming_flash_omni/ming_flash_omni_thinker.py
index bde7477b945..ab5439c18b1 100644
--- a/vllm_omni/model_executor/models/ming_flash_omni/ming_flash_omni_thinker.py
+++ b/vllm_omni/model_executor/models/ming_flash_omni/ming_flash_omni_thinker.py
@@ -418,18 +418,24 @@ def _call_hf_processor(
         if images is not None:
             image_outputs = hf_processor.image_processor(
                 images=images,
-                videos=None,
                 return_tensors="pt",
             )
             data.update(image_outputs)
 
         videos = mm_data.get("videos", None)
         if videos is not None:
-            video_outputs = hf_processor.image_processor(
-                images=None,
-                videos=videos,
-                return_tensors="pt",
-            )
+            video_processor = getattr(hf_processor, "video_processor", None)
+            if video_processor is not None:
+                video_outputs = video_processor(
+                    videos=videos,
+                    return_tensors="pt",
+                )
+            else:
+                video_outputs = hf_processor.image_processor(
+                    images=None,
+                    videos=videos,
+                    return_tensors="pt",
+                )
             # Rename keys to distinguish from images
             if "pixel_values" in video_outputs:
                 video_outputs["pixel_values_videos"] = video_outputs.pop("pixel_values")
diff --git a/vllm_omni/model_executor/models/ming_flash_omni/modeling_bailing_moe_v2.py b/vllm_omni/model_executor/models/ming_flash_omni/modeling_bailing_moe_v2.py
index ca7d00f5032..50c03866b0d 100644
--- a/vllm_omni/model_executor/models/ming_flash_omni/modeling_bailing_moe_v2.py
+++ b/vllm_omni/model_executor/models/ming_flash_omni/modeling_bailing_moe_v2.py
@@ -818,7 +818,7 @@ def compute_logits(
         hidden_states: torch.Tensor,
         sampling_metadata,
     ) -> torch.Tensor | None:
-        logits = self.logits_processor(self.lm_head, hidden_states, sampling_metadata)
+        logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
     def sample(
diff --git a/vllm_omni/transformers_utils/configs/ming_flash_omni.py b/vllm_omni/transformers_utils/configs/ming_flash_omni.py
index 408b208682f..ff5ce3ad2ff 100644
--- a/vllm_omni/transformers_utils/configs/ming_flash_omni.py
+++ b/vllm_omni/transformers_utils/configs/ming_flash_omni.py
@@ -27,6 +27,7 @@
 
 class BailingMoeV2Config(PretrainedConfig):
     model_type = "bailing_moe_v2"
+    ignore_keys_at_rope_validation = {"mrope_section"}
 
     def __init__(
         self,
@@ -237,6 +238,7 @@ def __init__(
 
 class BailingMM2Config(PretrainedConfig):
     model_type = "bailingmm_moe_v2_lite"
+    ignore_keys_at_rope_validation = {"mrope_section"}
     is_composition = True
     sub_configs: ClassVar = {"llm_config": AutoConfig}
 
diff --git a/vllm_omni/transformers_utils/processors/ming.py b/vllm_omni/transformers_utils/processors/ming.py
index 7f414b7268c..1a24cf266e4 100644
--- a/vllm_omni/transformers_utils/processors/ming.py
+++ b/vllm_omni/transformers_utils/processors/ming.py
@@ -23,6 +23,13 @@
 from transformers.processing_utils import ProcessorMixin
 from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
 
+try:
+    from transformers import AutoVideoProcessor
+except ImportError:
+    AutoVideoProcessor = None
+
+_HAS_VIDEO_PROCESSOR = AutoVideoProcessor is not None
+
 DEFAULT_IMAGE_PATCH_TOKEN = "<imagePatch>"
 DEFAULT_IM_START_TOKEN = "<image>"
 DEFAULT_IM_END_TOKEN = "</image>"
@@ -155,13 +162,18 @@ class MingFlashOmniProcessor(ProcessorMixin):
     """
 
     attributes = ["image_processor", "audio_processor", "tokenizer"]
+    if _HAS_VIDEO_PROCESSOR:
+        attributes = ["image_processor", "video_processor", "audio_processor", "tokenizer"]
     image_processor_class = "AutoImageProcessor"
+    if _HAS_VIDEO_PROCESSOR:
+        video_processor_class = "AutoVideoProcessor"
     audio_processor_class = "AutoFeatureExtractor"
     tokenizer_class = "AutoTokenizer"
 
     def __init__(
         self,
         image_processor=None,
+        video_processor=None,
         audio_processor=None,
         tokenizer=None,
         merge_size: int = 2,
@@ -180,11 +192,14 @@ def __init__(
         self.image_token = PLACEHOLDER_IMAGE_TOKEN_IN_TEXT
         self.video_token = PLACEHOLDER_VIDEO_TOKEN_IN_TEXT
         self.audio_token = PLACEHOLDER_AUDIO_TOKEN_IN_TEXT
-        super().__init__(
+        processor_kwargs = dict(
             image_processor=image_processor,
             audio_processor=audio_processor,
             tokenizer=tokenizer,
         )
+        if _HAS_VIDEO_PROCESSOR:
+            processor_kwargs["video_processor"] = video_processor
+        super().__init__(**processor_kwargs)
 
         # Fall back to the tokenizer's own chat_template.
         if self.chat_template is None:
@@ -211,7 +226,6 @@ def __call__(
         if images is not None:
             image_outputs = self.image_processor(
                 images=images,
-                videos=None,
                 return_tensors="pt",
                 **kwargs.get("images_kwargs", {}),
             )
@@ -220,12 +234,20 @@ def __call__(
                 text = self._expand_image_tokens(text, image_outputs["image_grid_thw"])
 
         if videos is not None:
-            video_outputs = self.image_processor(
-                images=None,
-                videos=videos,
-                return_tensors="pt",
-                **kwargs.get("videos_kwargs", {}),
-            )
+            video_processor = getattr(self, "video_processor", None)
+            if video_processor is not None:
+                video_outputs = video_processor(
+                    videos=videos,
+                    return_tensors="pt",
+                    **kwargs.get("videos_kwargs", {}),
+                )
+            else:
+                video_outputs = self.image_processor(
+                    images=None,
+                    videos=videos,
+                    return_tensors="pt",
+                    **kwargs.get("videos_kwargs", {}),
+                )
             if "pixel_values" in video_outputs:
                 video_outputs["pixel_values_videos"] = video_outputs.pop("pixel_values")
             if "image_grid_thw" in video_outputs:
@@ -423,6 +445,9 @@ def model_input_names(self):
             + self.image_processor.model_input_names
             + self.audio_processor.model_input_names
         )
+        video_processor = getattr(self, "video_processor", None)
+        if video_processor is not None:
+            names += video_processor.model_input_names
         return list(dict.fromkeys(names))
 
 

From 55c1b1249086011150caa95ef3cb14611c91ef08 Mon Sep 17 00:00:00 2001
From: akshatvishu <akshatnayak197@gmail.com>
Date: Mon, 1 Jun 2026 00:42:05 +0530
Subject: [PATCH 54/54] Fix Ming Flash Omni talker input bridge

Signed-off-by: akshatvishu <akshatnayak197@gmail.com>
---
 .../test_qwen3_omni_streaming_helpers.py      | 31 +++++++++++
 .../stage_input_processors/ming_flash_omni.py | 55 +++++++++----------
 2 files changed, 56 insertions(+), 30 deletions(-)

diff --git a/tests/model_executor/stage_input_processors/test_qwen3_omni_streaming_helpers.py b/tests/model_executor/stage_input_processors/test_qwen3_omni_streaming_helpers.py
index f11a4654ec2..6b8c5348b78 100644
--- a/tests/model_executor/stage_input_processors/test_qwen3_omni_streaming_helpers.py
+++ b/tests/model_executor/stage_input_processors/test_qwen3_omni_streaming_helpers.py
@@ -187,3 +187,34 @@ def test_thinker2talker_full_payload_packs_complete_tensors() -> None:
     assert payload["embed"]["prefill"].device.type == "cpu"
     assert payload["hidden_states"]["output"].device.type == "cpu"
     assert payload["next_stage_prompt_len"] > 0
+
+
+def test_ming_flash_omni_thinker2talker_smoke() -> None:
+    from vllm_omni.model_executor.stage_input_processors.ming_flash_omni import (
+        thinker2talker,
+        thinker2talker_token_only,
+    )
+
+    class _Out:
+        def __init__(self, text):
+            self.text = text
+
+    class _Wrap:
+        def __init__(self, text):
+            self.outputs = [_Out(text)]
+
+    class _Prompt:
+        def __init__(self, info):
+            self.additional_information = info
+
+    src = [_Wrap("hello world")]
+    prompt = _Prompt({"voice_name": "ZH_FEMALE", "prompt_text": "ref text"})
+    for func in (thinker2talker, thinker2talker_token_only):
+        out = func(src, prompt=prompt)
+        assert len(out) == 1
+        assert out[0]["prompt_token_ids"] == [0]
+        info = out[0]["additional_information"]
+        assert info["text"] == "hello world"
+        assert info["voice_name"] == "ZH_FEMALE"
+        assert info["prompt_text"] == "ref text"
+        assert info["ming_task"] == "omni"
diff --git a/vllm_omni/model_executor/stage_input_processors/ming_flash_omni.py b/vllm_omni/model_executor/stage_input_processors/ming_flash_omni.py
index dddca3a9e2d..8d9e86d655e 100644
--- a/vllm_omni/model_executor/stage_input_processors/ming_flash_omni.py
+++ b/vllm_omni/model_executor/stage_input_processors/ming_flash_omni.py
@@ -6,41 +6,16 @@
 
 from typing import Any
 
+import torch
 from vllm.inputs import TextPrompt
 
 from vllm_omni.inputs.data import OmniTokensPrompt
 
 
-def _validate_stage_inputs(stage_list, engine_input_source):
-    """Validate stage inputs and return the source engine outputs."""
-    if not engine_input_source:
-        raise ValueError("engine_input_source cannot be empty")
-
-    stage_id = engine_input_source[0]
-    if stage_id >= len(stage_list):
-        raise IndexError(f"Invalid stage_id: {stage_id}")
-
-    stage = stage_list[stage_id]
-    if stage.engine_outputs is None:
-        raise RuntimeError(f"Stage {stage_id} has no outputs yet")
-
-    return stage.engine_outputs
-
-
-def thinker2talker(
-    stage_list: list[Any],
-    engine_input_source: list[int],
+def _build_talker_inputs(
+    source_outputs: list[Any],
     prompt: OmniTokensPrompt | TextPrompt | None = None,
-    requires_multimodal_data: bool = False,
 ) -> list[OmniTokensPrompt]:
-    """Build talker stage inputs from thinker stage outputs.
-
-    Extracts the generated text from thinker output and constructs
-    a talker input prompt with the text and any speaker/instruction info
-    from the original request.
-    """
-    source_outputs = _validate_stage_inputs(stage_list, engine_input_source)
-
     if not isinstance(prompt, list):
         prompt = [prompt]
 
@@ -61,8 +36,6 @@ def thinker2talker(
         # the talker's spk_head wants a torch tensor.
         spk_emb = additional_info.get("spk_emb", None)
         if isinstance(spk_emb, list) and spk_emb and not hasattr(spk_emb[0], "device"):
-            import torch
-
             spk_emb = torch.tensor(spk_emb, dtype=torch.float32).unsqueeze(0)
 
         # Omni speech path mirrors upstream `omni_audio_generation`:
@@ -102,3 +75,25 @@ def thinker2talker(
         )
 
     return talker_inputs
+
+
+def thinker2talker(
+    source_outputs: list[Any],
+    prompt: OmniTokensPrompt | TextPrompt | None = None,
+    _requires_multimodal_data: bool = False,
+    _streaming_context: Any | None = None,
+) -> list[OmniTokensPrompt]:
+    """Build talker stage inputs from thinker stage outputs."""
+    return _build_talker_inputs(source_outputs, prompt)
+
+
+def thinker2talker_token_only(
+    source_outputs: list[Any],
+    prompt: OmniTokensPrompt | TextPrompt | None = None,
+    _requires_multimodal_data: bool = False,
+) -> list[OmniTokensPrompt]:
+    """Sync-side builder for the non-async-chunk thinker→talker path."""
+    return _build_talker_inputs(source_outputs, prompt)
+
+
+thinker2talker_token_only._is_sync_input = True