From 04c070659053af452c7b33b50402c4a2fb444fef Mon Sep 17 00:00:00 2001
From: rein yang <ruiruyang2@gmail.com>
Date: Wed, 13 May 2026 09:16:43 +0000
Subject: [PATCH 1/2] Set VLLM_USE_FLASHINFER_MOE_FP16=0 for Qwen3-Omni to
 avoid performance regression

Signed-off-by: rein yang <ruiruyang2@gmail.com>
---
 .../qwen3_omni_tts_performance_optimization.md       |  2 ++
 .../user_guide/examples/online_serving/qwen3_omni.md |  9 +++++++++
 vllm_omni/engine/stage_init_utils.py                 | 12 ++++++++++++
 3 files changed, 23 insertions(+)

diff --git a/docs/design/qwen3_omni_tts_performance_optimization.md b/docs/design/qwen3_omni_tts_performance_optimization.md
index 2f18a1b1bc0..e4548a17c69 100644
--- a/docs/design/qwen3_omni_tts_performance_optimization.md
+++ b/docs/design/qwen3_omni_tts_performance_optimization.md
@@ -411,6 +411,8 @@ Notes:
 - `runtime.max_batch_size` controls stage-level batching.
 - Thinker/Talker commonly use `enforce_eager: false` for CUDA Graph paths.
 - Code2Wav often remains eager (`enforce_eager: true`) depending on runtime behavior.
+- Qwen3-Omni defaults to `VLLM_USE_FLASHINFER_MOE_FP16=0`: the Triton unquantized MoE path has been more stable
+  and faster than the FlashInfer CUTLASS unquantized MoE backend on recent vLLM rebases.
 
 #### 2) Enable async chunk
 
diff --git a/docs/user_guide/examples/online_serving/qwen3_omni.md b/docs/user_guide/examples/online_serving/qwen3_omni.md
index 413c80b7e50..1973b198bb4 100644
--- a/docs/user_guide/examples/online_serving/qwen3_omni.md
+++ b/docs/user_guide/examples/online_serving/qwen3_omni.md
@@ -17,6 +17,9 @@ vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091
 
 The default deployment configuration situated at `vllm_omni/deploy/qwen3_omni_moe.yaml` is resolved and loaded
 automatically via the model registry, obviating the necessity for the `--deploy-config` flag in standard deployment topologies.
+The bundled Qwen3-Omni setup defaults to `VLLM_USE_FLASHINFER_MOE_FP16=0` when the variable is not already set. This
+keeps the Thinker and Talker on vLLM's Triton unquantized MoE path and avoids the performance regression observed with
+the FlashInfer CUTLASS unquantized MoE backend.
 Asynchronous chunk streaming is **enabled by default** within the bundled configuration.
 
 To explicitly utilize a custom deployment YAML, specify the configuration path:
@@ -72,6 +75,12 @@ vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 \
     --stage-overrides '{"1": {"gpu_memory_utilization": 0.5}}'
 ```
 
+To experiment with the FlashInfer FP16 MoE path, set `VLLM_USE_FLASHINFER_MOE_FP16=1` before launching the server:
+```bash
+VLLM_USE_FLASHINFER_MOE_FP16=1 \
+vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091
+```
+
 For the stage-based CLI, you usually do **not** need `--stage-overrides` for
 that kind of change. Since each command launches one stage, just pass the knob
 directly on that stage command:
diff --git a/vllm_omni/engine/stage_init_utils.py b/vllm_omni/engine/stage_init_utils.py
index ce68a23daa4..70b4ffe9f8f 100644
--- a/vllm_omni/engine/stage_init_utils.py
+++ b/vllm_omni/engine/stage_init_utils.py
@@ -411,6 +411,15 @@ def prepare_engine_environment() -> None:
         pass
 
 
+def _maybe_set_qwen3_omni_moe_env(engine_args_dict: dict[str, Any]) -> None:
+    if (
+        engine_args_dict.get("model_arch") == "Qwen3OmniMoeForConditionalGeneration"
+        and "VLLM_USE_FLASHINFER_MOE_FP16" not in os.environ
+    ):
+        os.environ["VLLM_USE_FLASHINFER_MOE_FP16"] = "0"
+        logger.info("[stage_init] Set VLLM_USE_FLASHINFER_MOE_FP16=0 for Qwen3-Omni stage")
+
+
 def split_devices_for_replicas(
     devices_str: str | None,
     num_replicas: int,
@@ -596,6 +605,9 @@ def build_engine_args_dict(
     default_sp = _to_dict(getattr(stage_config, "default_sampling_params", {}))
     engine_args_dict["has_sampling_extra_args"] = bool(default_sp.get("extra_args"))
 
+    # Set VLLM_USE_FLASHINFER_MOE_FP16=0 for Qwen3-Omni to avoid performance regression
+    _maybe_set_qwen3_omni_moe_env(engine_args_dict)
+
     return engine_args_dict
 
 

From 3d27e2bc2a97e7bbbb0e5056e9710de2ad5ded3a Mon Sep 17 00:00:00 2001
From: rein yang <ruiruyang2@gmail.com>
Date: Thu, 14 May 2026 09:40:54 +0000
Subject: [PATCH 2/2] add TODO

Signed-off-by: rein yang <ruiruyang2@gmail.com>
---
 vllm_omni/engine/stage_init_utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm_omni/engine/stage_init_utils.py b/vllm_omni/engine/stage_init_utils.py
index 70b4ffe9f8f..df8484328f7 100644
--- a/vllm_omni/engine/stage_init_utils.py
+++ b/vllm_omni/engine/stage_init_utils.py
@@ -605,6 +605,7 @@ def build_engine_args_dict(
     default_sp = _to_dict(getattr(stage_config, "default_sampling_params", {}))
     engine_args_dict["has_sampling_extra_args"] = bool(default_sp.get("extra_args"))
 
+    # TODO: Remove this after the performance regression is fixed
     # Set VLLM_USE_FLASHINFER_MOE_FP16=0 for Qwen3-Omni to avoid performance regression
     _maybe_set_qwen3_omni_moe_env(engine_args_dict)