vllm-project · lishunyang12 · Apr 20, 2026 · Apr 20, 2026 · lishunyang12 · Apr 20, 2026
@@ -2,7 +2,8 @@
     {
         "test_name": "test_qwen3_omni",
         "server_params": {
-            "model": "Qwen/Qwen3-Omni-30B-A3B-Instruct"
+            "model": "Qwen/Qwen3-Omni-30B-A3B-Instruct",
+            "extra_cli_args": ["--no-async-chunk"]
         },
         "benchmark_params": [
             {

@@ -64,7 +64,6 @@ def get_async_chunk_config(default_path):
     return modify_stage_config(
         default_path,
         updates={
-            "async_chunk": True,
             "stages": {0: {"default_sampling_params.max_tokens": 2048}},
         },
     )
@@ -78,7 +77,9 @@ def get_async_chunk_config(default_path):
 
 test_params = [
     pytest.param(
-        OmniServerParams(model=model, stage_config_path=default_path, use_stage_cli=True),
+        OmniServerParams(
+            model=model, stage_config_path=default_path, use_stage_cli=True, server_args=["--no-async-chunk"]
+        ),
         id="default",
     ),
     pytest.param(

@@ -7,11 +7,33 @@
 import pytest
 from pytest_mock import MockerFixture
 
-from vllm_omni.entrypoints.cli.serve import run_headless
+from vllm_omni.entrypoints.cli.serve import OmniServeCommand, run_headless
+from vllm_omni.entrypoints.utils import detect_explicit_cli_keys
 
 pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
 
 
+def test_serve_parser_accepts_no_async_chunk_and_marks_it_explicit() -> None:
+    """``--no-async-chunk`` should parse to ``async_chunk=False`` and mark the
+    shared deploy-level dest as explicitly provided by the user."""
+    try:  # the parser class lives in vLLM; skip (rather than fail) if it cannot be imported
+        from vllm.utils.argparse_utils import FlexibleArgumentParser
+    except Exception as exc:  # any import-time exception results in a skip
+        pytest.skip(f"Cannot build parser in this environment: {exc}")
+
+    root = FlexibleArgumentParser()  # top-level parser that owns the subcommands
+    subparsers = root.add_subparsers(dest="subcommand")
+    cmd = OmniServeCommand()
+    serve_parser = cmd.subparser_init(subparsers)  # presumably the ``serve`` subparser -- confirm
+
+    argv = ["serve", "fake-model", "--omni", "--no-async-chunk"]  # includes the subcommand token
+    args = root.parse_args(argv)
+
+    assert args.async_chunk is False  # identity check: exactly ``False``, not merely falsy
+    explicit = detect_explicit_cli_keys(argv, serve_parser)  # same argv, checked against the subparser
+    assert "async_chunk" in explicit
+
+
 def _make_headless_args() -> argparse.Namespace:
     return argparse.Namespace(
         model="fake-model",

@@ -122,7 +122,11 @@ def __init__(
         stage_init_timeout = kwargs.pop("stage_init_timeout", 300)
         init_timeout = kwargs.pop("init_timeout", 600)
         log_stats = kwargs.pop("log_stats", False)
-        async_chunk = kwargs.pop("async_chunk", False)
+        # NOTE: read-only lookup — must NOT pop. Popping here drops the key
+        # before it reaches ``StageConfigFactory._create_from_registry``, so
+        # ``--no-async-chunk`` (``async_chunk=False``) silently fails to
+        # override the deploy YAML's ``async_chunk: true`` default.
+        async_chunk = kwargs.get("async_chunk")
         output_modalities = kwargs.pop("output_modalities", None)
         diffusion_batch_size: int = kwargs.pop("diffusion_batch_size", 1)
 
@@ -132,7 +136,10 @@ def __init__(
         self._name = self.__class__.__name__
         self.model = model
         self.log_stats = log_stats
-        self.async_chunk = async_chunk
+        # Provisional value (mirrors the CLI/caller kwarg); the engine resolves
+        # pipeline + deploy YAML + CLI precedence below and the final value is
+        # re-assigned from ``self.engine.async_chunk`` after init.
+        self.async_chunk = bool(async_chunk) if async_chunk is not None else False
         self.output_modalities = output_modalities or []
         self.tts_batch_max_items: int = kwargs.pop("tts_batch_max_items", 32)
 
@@ -150,7 +157,11 @@ def __init__(
         self._weak_finalizer = weakref.finalize(self, _weak_shutdown_engine, self.engine)
         et = time.time()
         logger.info("[%s] AsyncOmniEngine initialized in %.2f seconds", self.__class__.__name__, et - st)
-        self.async_chunk = bool(self.async_chunk or getattr(self.engine, "async_chunk", False))
+        # Authoritative: ``AsyncOmniEngine`` resolves (pipeline + deploy YAML +
+        # CLI overrides) through ``StageConfigFactory`` and stores the final
+        # value on ``engine.async_chunk``; mirror it here instead of combining with
+        # ``or``, which treats an explicit ``False`` (``--no-async-chunk``) as unset.
+        self.async_chunk = bool(getattr(self.engine, "async_chunk", False))
 
         self.request_states: dict[str, ClientRequestState] = {}