MaciejBalaNV · MaciejBalaNV · Jun 1, 2026 · Jun 2, 2026 · Jun 2, 2026 · Jun 2, 2026
diff --git a/.buildkite/pipeline-intel.yaml b/.buildkite/pipeline-intel.yaml
@@ -10,7 +10,7 @@ steps:
           DOCKER_BUILDKIT: "1"
           # Buildkite will automatically replace this with the actual commit hash
           VLLM_IMAGE_TAG: "${BUILDKITE_COMMIT}"
-          VLLM_VERSION: "v0.21.0"
+          VLLM_VERSION: "v0.22.0"
         priority: 100
         timeout_in_minutes: 60
         soft_fail: false
diff --git a/benchmarks/build_dataset/sound_effect_smoke/en/meta.lst b/benchmarks/build_dataset/sound_effect_smoke/en/meta.lst
@@ -0,0 +1,10 @@
+se001|ocean waves crashing on a rocky beach|10.0
+se002|heavy rain on a metal roof at night|8.0
+se003|a dog barking in the distance|6.0
+se004|a crackling campfire in a quiet forest|10.0
+se005|footsteps on dry leaves|6.0
+se006|thunder rolling across a wide valley|8.0
+se007|a busy coffee shop with chatter and espresso machines|10.0
+se008|wind howling through tall pine trees|8.0
+se009|a small stream flowing over pebbles|10.0
+se010|a creaking wooden door slowly opening|6.0
diff --git a/benchmarks/build_dataset/ttsd_smoke/en/meta.lst b/benchmarks/build_dataset/ttsd_smoke/en/meta.lst
@@ -0,0 +1,10 @@
+ttsd001|prompt-wavs/spk1.wav|prompt-wavs/spk2.wav|[S1] Hello and welcome to the show today. [S2] Thanks for having me, it's a pleasure to be here. [S1] Let's dive right in.
+ttsd002|prompt-wavs/spk1.wav|prompt-wavs/spk2.wav|[S1] Did you finish the report? [S2] Almost — I'm just polishing the conclusion. [S1] Great, send it over when you're done.
+ttsd003|prompt-wavs/spk1.wav|prompt-wavs/spk2.wav|[S1] Have you tried that new restaurant downtown? [S2] Yes, the pasta is amazing. [S1] I'll book a table for Friday.
+ttsd004|prompt-wavs/spk1.wav|prompt-wavs/spk2.wav|[S1] What did you think of yesterday's meeting? [S2] Productive, but a bit long. [S1] Agreed, we should set an hour cap next time.
+ttsd005|prompt-wavs/spk1.wav|prompt-wavs/spk2.wav|[S1] The weather is beautiful today. [S2] Perfect for a hike. [S1] Want to drive out to the trails this afternoon?
+ttsd006|prompt-wavs/spk1.wav|prompt-wavs/spk2.wav|[S1] How did the demo go? [S2] Better than expected — the client signed off on the prototype. [S1] Fantastic news!
+ttsd007|prompt-wavs/spk1.wav|prompt-wavs/spk2.wav|[S1] Coffee or tea this morning? [S2] Coffee please, with a splash of milk. [S1] Coming right up.
+ttsd008|prompt-wavs/spk1.wav|prompt-wavs/spk2.wav|[S1] Did you read the email about the deadline change? [S2] I did, we now have until next Friday. [S1] That should give us enough buffer.
+ttsd009|prompt-wavs/spk1.wav|prompt-wavs/spk2.wav|[S1] Are you joining the team dinner tonight? [S2] I'll be there around seven. [S1] See you then.
+ttsd010|prompt-wavs/spk1.wav|prompt-wavs/spk2.wav|[S1] Have we backed up the database? [S2] Last backup ran two hours ago. [S1] Good, I'll trigger a fresh snapshot before the migration.
diff --git a/benchmarks/tts/bench_tts.py b/benchmarks/tts/bench_tts.py
@@ -53,6 +53,8 @@ def _vllm_omni_bin() -> str:
     "voice_clone": "seed-tts",
     "default_voice": "seed-tts-text",
     "voice_design": "seed-tts-design",
+    "dialogue": "ttsd",
+    "sound_effect": "sound-effect",
 }
 
 # Default design dataset path (bundled with the repo)
@@ -81,6 +83,7 @@ def build_bench_args(
     output_dir: str | None,
     result_filename: str | None,
     extra_cli_args: list[str],
+    output_len: int | None = None,
 ) -> list[str]:
     """Build the ``vllm bench serve --omni`` command for one (task, concurrency) run."""
     dataset_name = _TASK_TO_DATASET[task]
@@ -139,6 +142,9 @@ def build_bench_args(
     if wer_eval:
         cmd.append("--seed-tts-wer-eval")
 
+    if output_len is not None:
+        cmd += ["--hf-output-len", str(output_len)]
+
     if output_dir or result_filename:
         out_dir = output_dir or "."
         os.makedirs(out_dir, exist_ok=True)
@@ -238,6 +244,15 @@ def main() -> None:
     parser.add_argument("--host", default="localhost")
     parser.add_argument("--port", type=int, default=8000)
     parser.add_argument("--model-configs", default=str(_DEFAULT_MODEL_CONFIGS), help="Path to model_configs.yaml")
+    parser.add_argument(
+        "--output-len",
+        type=int,
+        default=None,
+        help="Override per-request max_new_tokens forwarded to the server "
+        "(seed-tts datasets default to 2048; the MOSS-TTS full talker takes "
+        "this as 'max_new_frames' so smaller values cap audio length per "
+        "request and keep bench runtime tractable).",
+    )
     parser.add_argument("extra", nargs=argparse.REMAINDER, help="Extra args passed directly to vllm bench serve")
     args = parser.parse_args()
 
@@ -292,6 +307,7 @@ def main() -> None:
                 output_dir=args.output_dir,
                 result_filename=result_filename,
                 extra_cli_args=args.extra or [],
+                output_len=args.output_len,
             )
             result = run_one_benchmark(cmd)
             if result is not None:

diff --git a/benchmarks/tts/model_configs.yaml b/benchmarks/tts/model_configs.yaml
@@ -43,3 +43,61 @@ models:
       voice_clone: {}
       default_voice:
         voice: default
+
+  # MOSS-TTS family. After the serving_speech.py variant-detection refactor
+  # all five variants now go through OpenAI /v1/audio/speech with the
+  # variant-specific request fields (ambient_sound, ref_audio_2, instructions).
+  OpenMOSS-Team/MOSS-TTS:
+    supported_tasks: [voice_clone]
+    backend: openai-audio-speech
+    endpoint: /v1/audio/speech
+    task_extra_body:
+      voice_clone: {}
+
+  # MOSS-TTS-v1.5: continued-training upgrade of MOSS-TTS 1.0 — same
+  # MossTTSDelay-8B architecture and API, so identical wiring. Adds 31-language
+  # synthesis via the `language` field and inline `[pause Xs]` markers.
+  OpenMOSS-Team/MOSS-TTS-v1.5:
+    supported_tasks: [voice_clone]
+    backend: openai-audio-speech
+    endpoint: /v1/audio/speech
+    task_extra_body:
+      voice_clone: {}
+
+  OpenMOSS-Team/MOSS-TTS-Realtime:
+    supported_tasks: [voice_clone]
+    backend: openai-audio-speech
+    endpoint: /v1/audio/speech
+    task_extra_body:
+      voice_clone: {}
+
+  # MOSS-TTSD synthesizes [S1]/[S2] dialogue. The seed-tts voice_clone dataset
+  # has single-speaker rows; the bench piggybacks on it (both speakers share the
+  # same ref_audio). For real dialogue evaluation, use the ttsd dataset.
+  OpenMOSS-Team/MOSS-TTSD-v1.0:
+    supported_tasks: [voice_clone, dialogue]
+    backend: openai-audio-speech
+    endpoint: /v1/audio/speech
+    task_extra_body:
+      voice_clone: {}
+      dialogue: {}
+
+  # MOSS-SoundEffect ignores text/ref_audio; expects ambient_sound +
+  # duration_seconds. The sound_effect dataset module emits those.
+  OpenMOSS-Team/MOSS-SoundEffect:
+    supported_tasks: [sound_effect]
+    backend: openai-audio-speech
+    endpoint: /v1/audio/speech
+    task_extra_body:
+      sound_effect:
+        duration_seconds: 10
+
+  # MOSS-VoiceGenerator: instructions-driven (no ref_audio). seed-tts-design
+  # dataset already emits {instructions, target_text}, so voice_design works
+  # out of the box once the serving validator accepts instructions.
+  OpenMOSS-Team/MOSS-VoiceGenerator:
+    supported_tasks: [voice_design]
+    backend: openai-audio-speech
+    endpoint: /v1/audio/speech
+    task_extra_body:
+      voice_design: {}
diff --git a/docs/design/qwen3_omni_tts_performance_optimization.md b/docs/design/qwen3_omni_tts_performance_optimization.md
@@ -411,6 +411,8 @@ Notes:
 - `runtime.max_batch_size` controls stage-level batching.
 - Thinker/Talker commonly use `enforce_eager: false` for CUDA Graph paths.
 - Code2Wav often remains eager (`enforce_eager: true`) depending on runtime behavior.
+- Qwen3-Omni defaults to `VLLM_USE_FLASHINFER_MOE_FP16=0`. The Triton unquantized MoE path has been more stable and faster
+ than the FlashInfer CUTLASS unquantized MoE backend on recent vLLM rebases.
 
 #### 2) Enable async chunk
 

diff --git a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py
@@ -119,9 +119,10 @@ def add_parser(self, name, **kwargs):
                     exec_globals = {
                         "_FlexibleArgumentParser": _FlexibleArgumentParser,
                         "FlexibleArgumentParser": _FlexibleArgumentParser,
+                        "TrackingArgumentParser": _FlexibleArgumentParser,  # only needs add_argument, not tracking
+                        "TrackingNamespace": None,  # type annotation only
                         "make_arg_parser": lambda parser: parser,  # no-op for doc
                         "_ensure_vllm_platform": lambda: None,  # no-op for doc
-                        "nullify_stage_engine_defaults": lambda parser: None,  # no-op for doc
                         "VLLM_SUBCMD_PARSER_EPILOG": "",
                         "logger": logger,
                         "DummySubparsers": DummySubparsers,

diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
@@ -33,6 +33,7 @@ th {
 | `ZImagePipeline` | Z-Image | `Tongyi-MAI/Z-Image-Turbo` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
 | `WanPipeline` | Wan2.1-T2V, Wan2.2-T2V, Wan2.2-TI2V | `Wan-AI/Wan2.1-T2V-1.3B-Diffusers`, `Wan-AI/Wan2.1-T2V-14B-Diffusers`, `Wan-AI/Wan2.2-T2V-A14B-Diffusers`, `Wan-AI/Wan2.2-TI2V-5B-Diffusers` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
 | `WanImageToVideoPipeline` | Wan2.2-I2V | `Wan-AI/Wan2.2-I2V-A14B-Diffusers` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
+| `Cosmos3OmniDiffusersPipeline` | Cosmos3 T2I, T2V, I2V, T2V with sound | `nvidia/Cosmos3-Nano` | ✅︎ | | | |
 | `WanSpeechToVideoPipeline` | Wan2.2-S2V | `Wan-AI/Wan2.2-S2V-14B` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
 | `Wan22VACEPipeline` | Wan2.1-VACE | `Wan-AI/Wan2.1-VACE-1.3B-diffusers`, `Wan-AI/Wan2.1-VACE-14B-diffusers` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
 | `LTX2Pipeline` | LTX-2-T2V | `Lightricks/LTX-2` | ✅︎ | ✅︎ | | |

diff --git a/docs/user_guide/diffusion_features.md b/docs/user_guide/diffusion_features.md
@@ -133,10 +133,12 @@ The following tables show which models support each feature:
 | **Stable-Diffusion3.5**  |     ❌     |     ✅      |           ❌           |       ✅        |         ✅         |          ❌          |   ❌    |             ✅             |      ✅ (decode)      |       ❌        |        ❌         |
 | **Z-Image**              |     ✅     |     ✅      |           ✅           |       ❓        |   ✅ (TP=2 only)   |          ❌          |   ✅    |             ❌             |      ✅ (decode)      |       ✅        |        ❌         |
 | **ERNIE-Image**          |     ❌     |     ✅      |           ✅           |       ❓        |         ✅         |          ❌          |   ✅    |             ✅             |          ❌           |       ❌        |        ❌         |
+| **Cosmos3**              |     ❌     |     ✅      |           ✅           |       ✅        |         ✅         |          ❌          |   ✅    |             ✅             |      ✅ (decode)      |       ✅        |        ❌         |
 
 > Notes:
 > 1. Nextstep_1(T2I) does not support cache acceleration methods such as TeaCache or Cache-DiT.
 > 2. `Tongyi-MAI/Z-Image-Turbo` and `SII-GAIR/daVinci-MagiHuman-Base-1080p` are distilled models with minimal NFEs; CFG-Parallel is not necessary.
+> 3. Cosmos3 T2I uses `Cosmos3OmniDiffusersPipeline` with `modalities=["image"]`. Model-level CPU offload is not supported; use layerwise offload.
 
 ### VideoGen
 
@@ -149,6 +151,8 @@ The following tables show which models support each feature:
 | **Helios**                   |     ❌     |     ✅      |           ✅           |       ✅        |         ✅         |         ❌         |   ✅    |             ✅             |          ❌           |       ❌        |        ❌         |
 | **HunyuanVideo-1.5 T2V I2V** |     ❌     |     ✅      |           ✅           |       ✅        |         ✅         |         ❌         |   ✅    |             ✅             |  ✅ (encode/decode)   |       ✅        |        ❌         |
 | **DreamID-Omni**             |     ❌     |     ❌      |           ❌           |       ✅        |         ❌         |         ❌         |   ✅    |             ✅             |          ❌           |       ❌        |        ❌         |
+| **Cosmos3**                  |     ❌     |     ✅      |           ✅           |       ✅        |         ✅         |         ❌         |   ✅    |             ✅             |  ✅ (encode/decode)   |       ✅        |        ❌         |
+
 
 **Frame Interpolation Support**
 

diff --git a/docs/user_guide/examples/online_serving/qwen3_omni.md b/docs/user_guide/examples/online_serving/qwen3_omni.md
@@ -17,6 +17,9 @@ vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091
 
 The default deployment configuration situated at `vllm_omni/deploy/qwen3_omni_moe.yaml` is resolved and loaded
 automatically via the model registry, obviating the necessity for the `--deploy-config` flag in standard deployment topologies.
+The bundled Qwen3-Omni setup defaults to `VLLM_USE_FLASHINFER_MOE_FP16=0`. This keeps the Thinker and Talker on vLLM's
+Triton unquantized MoE path and avoids the performance regression observed with the FlashInfer CUTLASS unquantized MoE
+backend.
 Asynchronous chunk streaming is **enabled by default** within the bundled configuration.
 
 To explicitly utilize a custom deployment YAML, specify the configuration path:
@@ -72,6 +75,12 @@ vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 \
     --stage-overrides '{"1": {"gpu_memory_utilization": 0.5}}'
 ```
 
+To experiment with the FlashInfer FP16 MoE path, set `VLLM_USE_FLASHINFER_MOE_FP16=1` before launching the server:
+```bash
+VLLM_USE_FLASHINFER_MOE_FP16=1 \
+vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091
+```
+
 For the stage-based CLI, you usually do **not** need `--stage-overrides` for
 that kind of change. Since each command launches one stage, just pass the knob
 directly on that stage command:

diff --git a/examples/offline_inference/bagel/README.md b/examples/offline_inference/bagel/README.md
@@ -195,6 +195,101 @@ stages:
     devices: "0,1"
 ```
 
+### VAE Patch Parallelism
+
+[VAE Patch Parallelism](https://docs.vllm.ai/projects/vllm-omni/en/latest/user_guide/diffusion/parallelism/vae_patch_parallel.html) splits Bagel VAE **decode/encode** tiles across multiple GPUs on the **DiT stage**, reducing **per-GPU peak memory during VAE decode**. Use it when high-resolution `text2img` or `img2img` hits VAE OOM or large decode spikes.
+
+**Bagel-specific notes:**
+
+- Implemented in `BagelPipeline` via `DistributedAutoEncoder` (DiT stage only).
+- **Single-stage** is the simplest path: one DiT process with TP + VAE patch parallel.
+- **Two-stage**: enable on **stage 1 (DiT)** only; stage 0 (Thinker) keeps encoder-only `VAEEncoder` and does not use VAE patch parallel.
+- You need a DiT `world_size` ≥ `vae_patch_parallel_size` (typically `tensor_parallel_size=2` on that stage). VAE PP reuses the DiT process group; it is not a standalone second-GPU VAE worker.
+
+**Single-stage via deploy YAML** (recommended for `end2end.py`):
+
+```yaml
+pipeline: bagel_single_stage
+async_chunk: false
+
+stages:
+  - stage_id: 0
+    max_num_batched_tokens: 32768
+    max_num_seqs: 1
+    enforce_eager: true
+    trust_remote_code: true
+    enable_prefix_caching: false
+    devices: "0,1"
+    vae_use_tiling: true
+    parallel_config:
+      tensor_parallel_size: 2
+      vae_patch_parallel_size: 2
+    default_sampling_params:
+      seed: 52
+```
+
+```bash
+cd examples/offline_inference/bagel
+
+CUDA_VISIBLE_DEVICES=0,1 python end2end.py \
+    --model /path/to/BAGEL-7B-MoT \
+    --deploy-config /path/to/bagel_single_stage_vae_pp.yaml \
+    --modality text2img \
+    --prompts "A cute cat" \
+    --steps 10 \
+    --output ./out_vae_pp
+```
+
+**Single-stage via `Omni` kwargs** (same flags as online serving):
+
+```python
+from vllm_omni.entrypoints.omni import Omni
+
+omni = Omni(
+    model="ByteDance-Seed/BAGEL-7B-MoT",
+    deploy_config="vllm_omni/deploy/bagel_single_stage.yaml",
+    tensor_parallel_size=2,
+    vae_patch_parallel_size=2,
+    vae_use_tiling=True,
+)
+# Then call omni.generate(...) as in end2end.py
+```
+
+**Two-stage (VAE PP on DiT only):**
+
+```yaml
+stages:
+  - stage_id: 0
+    devices: "0"
+    # AR Thinker — no vae_patch_parallel here
+
+  - stage_id: 1
+    devices: "0,1"
+    vae_use_tiling: true
+    parallel_config:
+      tensor_parallel_size: 2
+      vae_patch_parallel_size: 2
+```
+
+```bash
+python end2end.py --model ByteDance-Seed/BAGEL-7B-MoT \
+    --deploy-config /path/to/bagel_vae_pp.yaml \
+    --modality text2img \
+    --prompts "A cute cat"
+```
+
+**Startup log checks:**
+
+```text
+INFO ... vae_patch_parallel_size=2 requires vae_use_tiling; automatically enabling it.
+```
+
+| Setting | Role |
+| :------ | :--- |
+| `parallel_config.tensor_parallel_size` | DiT world size / TP (must be ≥ `vae_patch_parallel_size`) |
+| `parallel_config.vae_patch_parallel_size` | Number of ranks for distributed VAE tiles (`1` = off) |
+| `vae_use_tiling` | Enable spatial tiling (auto-enabled when `vae_patch_parallel_size > 1`) |
+
 #### Hybrid Sharded Data Parallel (HSDP)
 
 For larger Bagel deployments on multiple GPUs, you can enable HSDP (Hybrid Sharded Data Parallel) by modifying the stage configuration (for example, [`bagel.yaml`](../../../vllm_omni/deploy/bagel.yaml)). HSDP shards transformer weights across GPUs to reduce per-GPU memory usage.

diff --git a/examples/offline_inference/bagel/end2end.py b/examples/offline_inference/bagel/end2end.py
@@ -121,9 +121,6 @@ def parse_args():
         help="Temperature for text generation sampling (default: 0.3).",
     )
 
-    from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults
-
-    nullify_stage_engine_defaults(parser)
     args = parser.parse_args()
     return args
 
@@ -154,6 +151,7 @@ def main():
     from vllm_omni.entrypoints.omni import Omni
 
     omni_kwargs = vars(args).copy()
+    omni_kwargs.pop("model", None)
     deploy_config = args.deploy_config
     if args.think and deploy_config is None:
         deploy_config = "vllm_omni/deploy/bagel_think.yaml"
@@ -164,9 +162,7 @@ def main():
     if args.quantization:
         omni_kwargs["quantization_config"] = args.quantization
 
-    # Override CLI --model with the derived model_name.
-    omni_kwargs["model"] = model_name
-    omni = Omni(**omni_kwargs)
+    omni = Omni(model=model_name, **omni_kwargs)
 
     formatted_prompts = []
     for p in prompts:

diff --git a/examples/offline_inference/dynin_omni/end2end.py b/examples/offline_inference/dynin_omni/end2end.py
@@ -18,8 +18,6 @@
 import torch
 from PIL import Image
 
-from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults
-
 TASK_CHOICES = ("t2t", "t2i", "t2s", "i2i", "i2t", "s2t", "v2t")
 
 TASK_DEFAULT_RUNTIME = {
@@ -972,8 +970,6 @@ def parse_args(repo_root: Path) -> argparse.Namespace:
     parser.add_argument("--vq-model-audio-local-files-only", action=argparse.BooleanOptionalAction, default=None)
 
     parser.add_argument("--disable-hf-xet", action=argparse.BooleanOptionalAction, default=True)
-
-    nullify_stage_engine_defaults(parser)
     return parser.parse_args()
 
 

diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py
@@ -107,9 +107,6 @@ def parse_args():
         ),
     )
 
-    from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults
-
-    nullify_stage_engine_defaults(parser)
     return parser.parse_args()
 
 

diff --git a/examples/offline_inference/mammothmodal2_preview/run_mammothmoda2_image_summarize.py b/examples/offline_inference/mammothmodal2_preview/run_mammothmoda2_image_summarize.py
@@ -19,7 +19,6 @@
 from vllm.multimodal.image import convert_image_mode
 
 from vllm_omni import Omni
-from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults
 
 DEFAULT_SYSTEM = "You are a helpful assistant."
 DEFAULT_QUESTION = "Please summarize the content of this image."
@@ -49,7 +48,6 @@ def parse_args() -> argparse.Namespace:
         action="store_true",
         help="Enable diffusion pipeline profiler to display stage durations.",
     )
-    nullify_stage_engine_defaults(parser)
     return parser.parse_args()
-Original file line number
+Diff line change
@@ Expand Up / @@ -107,9 +107,6 @@ def parse_args(): @@
             ),
         )
-        from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults
-        nullify_stage_engine_defaults(parser)
         return parser.parse_args()
@@ Expand Down @@