From 54af13c3e14e2602ca95448db0cac121f49ef590 Mon Sep 17 00:00:00 2001 From: amy-why-3459 Date: Mon, 20 Apr 2026 11:38:54 +0800 Subject: [PATCH] Fixed the issue where --no-async-chunk was not working. Signed-off-by: amy-why-3459 --- tests/dfx/perf/tests/test_qwen_omni.json | 3 ++- .../test_qwen3_omni_expansion.py | 5 ++-- tests/entrypoints/test_serve.py | 24 ++++++++++++++++++- vllm_omni/entrypoints/omni_base.py | 17 ++++++++++--- 4 files changed, 42 insertions(+), 7 deletions(-) diff --git a/tests/dfx/perf/tests/test_qwen_omni.json b/tests/dfx/perf/tests/test_qwen_omni.json index 39fd266544a..ca3eb555708 100644 --- a/tests/dfx/perf/tests/test_qwen_omni.json +++ b/tests/dfx/perf/tests/test_qwen_omni.json @@ -2,7 +2,8 @@ { "test_name": "test_qwen3_omni", "server_params": { - "model": "Qwen/Qwen3-Omni-30B-A3B-Instruct" + "model": "Qwen/Qwen3-Omni-30B-A3B-Instruct", + "extra_cli_args": ["--no-async-chunk"] }, "benchmark_params": [ { diff --git a/tests/e2e/online_serving/test_qwen3_omni_expansion.py b/tests/e2e/online_serving/test_qwen3_omni_expansion.py index acec0efde2e..3152a8f982d 100644 --- a/tests/e2e/online_serving/test_qwen3_omni_expansion.py +++ b/tests/e2e/online_serving/test_qwen3_omni_expansion.py @@ -64,7 +64,6 @@ def get_async_chunk_config(default_path): return modify_stage_config( default_path, updates={ - "async_chunk": True, "stages": {0: {"default_sampling_params.max_tokens": 2048}}, }, ) @@ -78,7 +77,9 @@ def get_async_chunk_config(default_path): test_params = [ pytest.param( - OmniServerParams(model=model, stage_config_path=default_path, use_stage_cli=True), + OmniServerParams( + model=model, stage_config_path=default_path, use_stage_cli=True, server_args=["--no-async-chunk"] + ), id="default", ), pytest.param( diff --git a/tests/entrypoints/test_serve.py b/tests/entrypoints/test_serve.py index afa7fa82e4b..e60afc9cd7b 100644 --- a/tests/entrypoints/test_serve.py +++ b/tests/entrypoints/test_serve.py @@ -7,11 +7,33 @@ import pytest from pytest_mock 
import MockerFixture -from vllm_omni.entrypoints.cli.serve import run_headless +from vllm_omni.entrypoints.cli.serve import OmniServeCommand, run_headless +from vllm_omni.entrypoints.utils import detect_explicit_cli_keys pytestmark = [pytest.mark.core_model, pytest.mark.cpu] +def test_serve_parser_accepts_no_async_chunk_and_marks_it_explicit() -> None: + """``--no-async-chunk`` should parse to ``async_chunk=False`` and mark the + shared deploy-level dest as explicitly provided by the user.""" + try: + from vllm.utils.argparse_utils import FlexibleArgumentParser + except Exception as exc: + pytest.skip(f"Cannot build parser in this environment: {exc}") + + root = FlexibleArgumentParser() + subparsers = root.add_subparsers(dest="subcommand") + cmd = OmniServeCommand() + serve_parser = cmd.subparser_init(subparsers) + + argv = ["serve", "fake-model", "--omni", "--no-async-chunk"] + args = root.parse_args(argv) + + assert args.async_chunk is False + explicit = detect_explicit_cli_keys(argv, serve_parser) + assert "async_chunk" in explicit + + def _make_headless_args() -> argparse.Namespace: return argparse.Namespace( model="fake-model", diff --git a/vllm_omni/entrypoints/omni_base.py b/vllm_omni/entrypoints/omni_base.py index 023a2e16cfb..dca494efe72 100644 --- a/vllm_omni/entrypoints/omni_base.py +++ b/vllm_omni/entrypoints/omni_base.py @@ -122,7 +122,11 @@ def __init__( stage_init_timeout = kwargs.pop("stage_init_timeout", 300) init_timeout = kwargs.pop("init_timeout", 600) log_stats = kwargs.pop("log_stats", False) - async_chunk = kwargs.pop("async_chunk", False) + # NOTE: read-only lookup — must NOT pop. Popping here drops the key + # before it reaches ``StageConfigFactory._create_from_registry``, so + # ``--no-async-chunk`` (``async_chunk=False``) silently fails to + # override the deploy YAML's ``async_chunk: true`` default. 
+ async_chunk = kwargs.get("async_chunk") output_modalities = kwargs.pop("output_modalities", None) diffusion_batch_size: int = kwargs.pop("diffusion_batch_size", 1) @@ -132,7 +136,10 @@ def __init__( self._name = self.__class__.__name__ self.model = model self.log_stats = log_stats - self.async_chunk = async_chunk + # Provisional value (mirrors the CLI/caller kwarg); the engine resolves + # pipeline + deploy YAML + CLI precedence below and the final value is + # re-assigned from ``self.engine.async_chunk`` after init. + self.async_chunk = bool(async_chunk) if async_chunk is not None else False self.output_modalities = output_modalities or [] self.tts_batch_max_items: int = kwargs.pop("tts_batch_max_items", 32) @@ -150,7 +157,11 @@ def __init__( self._weak_finalizer = weakref.finalize(self, _weak_shutdown_engine, self.engine) et = time.time() logger.info("[%s] AsyncOmniEngine initialized in %.2f seconds", self.__class__.__name__, et - st) - self.async_chunk = bool(self.async_chunk or getattr(self.engine, "async_chunk", False)) + # Authoritative: ``AsyncOmniEngine`` resolves (pipeline + deploy YAML + + # CLI overrides) through ``StageConfigFactory`` and stores the final + # value on ``engine.async_chunk``; mirror it here so ``--no-async-chunk`` + # (explicit ``False``) is not discarded by an ``or`` fallback. + self.async_chunk = bool(getattr(self.engine, "async_chunk", False)) self.request_states: dict[str, ClientRequestState] = {}