From 54af13c3e14e2602ca95448db0cac121f49ef590 Mon Sep 17 00:00:00 2001
From: amy-why-3459 <wuhaiyan17@huawei.com>
Date: Mon, 20 Apr 2026 11:38:54 +0800
Subject: [PATCH] Fixed the issue where --no-async-chunk was not working.

Signed-off-by: amy-why-3459 <wuhaiyan17@huawei.com>
---
 tests/dfx/perf/tests/test_qwen_omni.json      |  3 ++-
 .../test_qwen3_omni_expansion.py              |  5 ++--
 tests/entrypoints/test_serve.py               | 24 ++++++++++++++++++-
 vllm_omni/entrypoints/omni_base.py            | 17 ++++++++++---
 4 files changed, 42 insertions(+), 7 deletions(-)

diff --git a/tests/dfx/perf/tests/test_qwen_omni.json b/tests/dfx/perf/tests/test_qwen_omni.json
index 39fd266544a..ca3eb555708 100644
--- a/tests/dfx/perf/tests/test_qwen_omni.json
+++ b/tests/dfx/perf/tests/test_qwen_omni.json
@@ -2,7 +2,8 @@
     {
         "test_name": "test_qwen3_omni",
         "server_params": {
-            "model": "Qwen/Qwen3-Omni-30B-A3B-Instruct"
+            "model": "Qwen/Qwen3-Omni-30B-A3B-Instruct",
+            "extra_cli_args": ["--no-async-chunk"]
         },
         "benchmark_params": [
             {
diff --git a/tests/e2e/online_serving/test_qwen3_omni_expansion.py b/tests/e2e/online_serving/test_qwen3_omni_expansion.py
index acec0efde2e..3152a8f982d 100644
--- a/tests/e2e/online_serving/test_qwen3_omni_expansion.py
+++ b/tests/e2e/online_serving/test_qwen3_omni_expansion.py
@@ -64,7 +64,6 @@ def get_async_chunk_config(default_path):
     return modify_stage_config(
         default_path,
         updates={
-            "async_chunk": True,
             "stages": {0: {"default_sampling_params.max_tokens": 2048}},
         },
     )
@@ -78,7 +77,9 @@ def get_async_chunk_config(default_path):
 
 test_params = [
     pytest.param(
-        OmniServerParams(model=model, stage_config_path=default_path, use_stage_cli=True),
+        OmniServerParams(
+            model=model, stage_config_path=default_path, use_stage_cli=True, server_args=["--no-async-chunk"]
+        ),
         id="default",
     ),
     pytest.param(
diff --git a/tests/entrypoints/test_serve.py b/tests/entrypoints/test_serve.py
index afa7fa82e4b..e60afc9cd7b 100644
--- a/tests/entrypoints/test_serve.py
+++ b/tests/entrypoints/test_serve.py
@@ -7,11 +7,33 @@
 import pytest
 from pytest_mock import MockerFixture
 
-from vllm_omni.entrypoints.cli.serve import run_headless
+from vllm_omni.entrypoints.cli.serve import OmniServeCommand, run_headless
+from vllm_omni.entrypoints.utils import detect_explicit_cli_keys
 
 pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
 
 
+def test_serve_parser_accepts_no_async_chunk_and_marks_it_explicit() -> None:
+    """Check that ``--no-async-chunk`` parses to ``async_chunk=False`` and that
+    ``detect_explicit_cli_keys`` reports ``async_chunk`` as explicitly passed."""
+    try:
+        from vllm.utils.argparse_utils import FlexibleArgumentParser
+    except Exception as exc:  # any failure importing vLLM's parser skips the test
+        pytest.skip(f"Cannot build parser in this environment: {exc}")
+
+    root = FlexibleArgumentParser()
+    subparsers = root.add_subparsers(dest="subcommand")
+    cmd = OmniServeCommand()
+    serve_parser = cmd.subparser_init(subparsers)
+
+    argv = ["serve", "fake-model", "--omni", "--no-async-chunk"]  # reused below for explicit-key detection
+    args = root.parse_args(argv)
+
+    assert args.async_chunk is False
+    explicit = detect_explicit_cli_keys(argv, serve_parser)
+    assert "async_chunk" in explicit
+
+
 def _make_headless_args() -> argparse.Namespace:
     return argparse.Namespace(
         model="fake-model",
diff --git a/vllm_omni/entrypoints/omni_base.py b/vllm_omni/entrypoints/omni_base.py
index 023a2e16cfb..dca494efe72 100644
--- a/vllm_omni/entrypoints/omni_base.py
+++ b/vllm_omni/entrypoints/omni_base.py
@@ -122,7 +122,11 @@ def __init__(
         stage_init_timeout = kwargs.pop("stage_init_timeout", 300)
         init_timeout = kwargs.pop("init_timeout", 600)
         log_stats = kwargs.pop("log_stats", False)
-        async_chunk = kwargs.pop("async_chunk", False)
+        # NOTE: read-only lookup — must NOT pop. Popping here drops the key
+        # before it reaches ``StageConfigFactory._create_from_registry``, so
+        # ``--no-async-chunk`` (``async_chunk=False``) silently fails to
+        # override the deploy YAML's ``async_chunk: true`` default.
+        async_chunk = kwargs.get("async_chunk")
         output_modalities = kwargs.pop("output_modalities", None)
         diffusion_batch_size: int = kwargs.pop("diffusion_batch_size", 1)
 
@@ -132,7 +136,10 @@ def __init__(
         self._name = self.__class__.__name__
         self.model = model
         self.log_stats = log_stats
-        self.async_chunk = async_chunk
+        # Provisional value (mirrors the CLI/caller kwarg); the engine resolves
+        # pipeline + deploy YAML + CLI precedence below and the final value is
+        # re-assigned from ``self.engine.async_chunk`` after init.
+        self.async_chunk = bool(async_chunk) if async_chunk is not None else False
         self.output_modalities = output_modalities or []
         self.tts_batch_max_items: int = kwargs.pop("tts_batch_max_items", 32)
 
@@ -150,7 +157,11 @@ def __init__(
         self._weak_finalizer = weakref.finalize(self, _weak_shutdown_engine, self.engine)
         et = time.time()
         logger.info("[%s] AsyncOmniEngine initialized in %.2f seconds", self.__class__.__name__, et - st)
-        self.async_chunk = bool(self.async_chunk or getattr(self.engine, "async_chunk", False))
+        # Authoritative: ``AsyncOmniEngine`` resolves (pipeline + deploy YAML +
+        # CLI overrides) through ``StageConfigFactory`` and stores the final
+        # value on ``engine.async_chunk``; mirror it here so an explicit ``False``
+        # from ``--no-async-chunk`` is not overridden by an ``or`` fallback.
+        self.async_chunk = bool(getattr(self.engine, "async_chunk", False))
 
         self.request_states: dict[str, ClientRequestState] = {}