From 04c070659053af452c7b33b50402c4a2fb444fef Mon Sep 17 00:00:00 2001 From: rein yang Date: Wed, 13 May 2026 09:16:43 +0000 Subject: [PATCH 1/2] Set VLLM_USE_FLASHINFER_MOE_FP16=0 for Qwen3-Omni to avoid performance regression Signed-off-by: rein yang --- .../qwen3_omni_tts_performance_optimization.md | 2 ++ .../user_guide/examples/online_serving/qwen3_omni.md | 9 +++++++++ vllm_omni/engine/stage_init_utils.py | 12 ++++++++++++ 3 files changed, 23 insertions(+) diff --git a/docs/design/qwen3_omni_tts_performance_optimization.md b/docs/design/qwen3_omni_tts_performance_optimization.md index 2f18a1b1bc0..e4548a17c69 100644 --- a/docs/design/qwen3_omni_tts_performance_optimization.md +++ b/docs/design/qwen3_omni_tts_performance_optimization.md @@ -411,6 +411,8 @@ Notes: - `runtime.max_batch_size` controls stage-level batching. - Thinker/Talker commonly use `enforce_eager: false` for CUDA Graph paths. - Code2Wav often remains eager (`enforce_eager: true`) depending on runtime behavior. +- Qwen3-Omni defaults to `VLLM_USE_FLASHINFER_MOE_FP16=0`. The Triton MoE path has been more stable and faster + than the FlashInfer CUTLASS unquantized MoE backend on recent vLLM rebases. #### 2) Enable async chunk diff --git a/docs/user_guide/examples/online_serving/qwen3_omni.md b/docs/user_guide/examples/online_serving/qwen3_omni.md index 413c80b7e50..1973b198bb4 100644 --- a/docs/user_guide/examples/online_serving/qwen3_omni.md +++ b/docs/user_guide/examples/online_serving/qwen3_omni.md @@ -17,6 +17,9 @@ vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 The default deployment configuration situated at `vllm_omni/deploy/qwen3_omni_moe.yaml` is resolved and loaded automatically via the model registry, obviating the necessity for the `--deploy-config` flag in standard deployment topologies. +The bundled Qwen3-Omni setup defaults to `VLLM_USE_FLASHINFER_MOE_FP16=0`. 
This keeps the Thinker & Talker on vLLM's +Triton unquantized MoE path and avoids the performance regression observed with the FlashInfer CUTLASS unquantized MoE +backend. Asynchronous chunk streaming is **enabled by default** within the bundled configuration. To explicitly utilize a custom deployment YAML, specify the configuration path: @@ -72,6 +75,12 @@ vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 \ --stage-overrides '{"1": {"gpu_memory_utilization": 0.5}}' ``` +To experiment with the FlashInfer FP16 MoE path, set `VLLM_USE_FLASHINFER_MOE_FP16=1` before launching the server: +```bash +VLLM_USE_FLASHINFER_MOE_FP16=1 \ +vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 +``` + For the stage-based CLI, you usually do **not** need `--stage-overrides` for that kind of change. Since each command launches one stage, just pass the knob directly on that stage command: diff --git a/vllm_omni/engine/stage_init_utils.py b/vllm_omni/engine/stage_init_utils.py index ce68a23daa4..70b4ffe9f8f 100644 --- a/vllm_omni/engine/stage_init_utils.py +++ b/vllm_omni/engine/stage_init_utils.py @@ -411,6 +411,15 @@ def prepare_engine_environment() -> None: pass +def _maybe_set_qwen3_omni_moe_env(engine_args_dict: dict[str, Any]) -> None: + if ( + engine_args_dict.get("model_arch") == "Qwen3OmniMoeForConditionalGeneration" + and "VLLM_USE_FLASHINFER_MOE_FP16" not in os.environ + ): + os.environ["VLLM_USE_FLASHINFER_MOE_FP16"] = "0" + logger.info("[stage_init] Set VLLM_USE_FLASHINFER_MOE_FP16=0 for Qwen3-Omni stage") + + def split_devices_for_replicas( devices_str: str | None, num_replicas: int, @@ -596,6 +605,9 @@ def build_engine_args_dict( default_sp = _to_dict(getattr(stage_config, "default_sampling_params", {})) engine_args_dict["has_sampling_extra_args"] = bool(default_sp.get("extra_args")) + # Set VLLM_USE_FLASHINFER_MOE_FP16=0 for Qwen3-Omni to avoid performance regression + _maybe_set_qwen3_omni_moe_env(engine_args_dict) + return engine_args_dict From 
3d27e2bc2a97e7bbbb0e5056e9710de2ad5ded3a Mon Sep 17 00:00:00 2001 From: rein yang Date: Thu, 14 May 2026 09:40:54 +0000 Subject: [PATCH 2/2] add TODO Signed-off-by: rein yang --- vllm_omni/engine/stage_init_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm_omni/engine/stage_init_utils.py b/vllm_omni/engine/stage_init_utils.py index 70b4ffe9f8f..df8484328f7 100644 --- a/vllm_omni/engine/stage_init_utils.py +++ b/vllm_omni/engine/stage_init_utils.py @@ -605,6 +605,7 @@ def build_engine_args_dict( default_sp = _to_dict(getattr(stage_config, "default_sampling_params", {})) engine_args_dict["has_sampling_extra_args"] = bool(default_sp.get("extra_args")) + # TODO: Remove this override once the FlashInfer CUTLASS unquantized MoE performance regression is fixed # Set VLLM_USE_FLASHINFER_MOE_FP16=0 for Qwen3-Omni to avoid performance regression _maybe_set_qwen3_omni_moe_env(engine_args_dict)