From fc68909ed8f4ab2680805db143f3f943f362223d Mon Sep 17 00:00:00 2001 From: xiaohajiayou <923390377@qq.com> Date: Sat, 25 Apr 2026 17:22:17 +0800 Subject: [PATCH 1/9] Use direct Omni construction in examples Signed-off-by: xiaohajiayou <923390377@qq.com> --- examples/offline_inference/bagel/end2end.py | 17 +++-------------- .../offline_inference/qwen2_5_omni/end2end.py | 6 +++++- .../offline_inference/qwen3_omni/end2end.py | 4 +++- .../qwen3_omni/end2end_async_chunk.py | 10 +++------- examples/offline_inference/qwen3_tts/end2end.py | 10 ++++++++-- 5 files changed, 22 insertions(+), 25 deletions(-) diff --git a/examples/offline_inference/bagel/end2end.py b/examples/offline_inference/bagel/end2end.py index a6ce1f1314f..f196cbaa95b 100644 --- a/examples/offline_inference/bagel/end2end.py +++ b/examples/offline_inference/bagel/end2end.py @@ -153,7 +153,7 @@ def main(): from vllm_omni.entrypoints.omni import Omni - omni_kwargs = {} + omni_kwargs = vars(args).copy() deploy_config = args.deploy_config if args.think and deploy_config is None: deploy_config = "vllm_omni/deploy/bagel_think.yaml" @@ -161,22 +161,11 @@ def main(): if deploy_config: omni_kwargs["deploy_config"] = deploy_config - omni_kwargs.update( - { - "log_stats": args.log_stats, - "init_sleep_seconds": args.init_sleep_seconds, - "batch_timeout": args.batch_timeout, - "init_timeout": args.init_timeout, - "shm_threshold_bytes": args.shm_threshold_bytes, - "worker_backend": args.worker_backend, - "ray_address": args.ray_address, - "enable_diffusion_pipeline_profiler": args.enable_diffusion_pipeline_profiler, - } - ) if args.quantization: omni_kwargs["quantization_config"] = args.quantization - omni = Omni.from_cli_args(args, model=model_name, **omni_kwargs) + omni_kwargs["model"] = model_name + omni = Omni(**omni_kwargs) formatted_prompts = [] for p in prompts: diff --git a/examples/offline_inference/qwen2_5_omni/end2end.py b/examples/offline_inference/qwen2_5_omni/end2end.py index a65c554a9b0..ec98fd3add4 100644 --- a/examples/offline_inference/qwen2_5_omni/end2end.py +++ b/examples/offline_inference/qwen2_5_omni/end2end.py @@ -21,6 +21,7 @@ from vllm.sampling_params import SamplingParams from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults from vllm_omni.entrypoints.omni import Omni SEED = 42 @@ -325,7 +326,9 @@ def main(args): else: query_result = query_func() args.quantization_config = quantization_config - omni = Omni.from_cli_args(args, model=model_name) + omni_kwargs = vars(args).copy() + omni_kwargs["model"] = model_name + omni = Omni(**omni_kwargs) thinker_sampling_params = SamplingParams( temperature=0.0, # Deterministic - no randomness top_p=1.0, # Disable nucleus sampling @@ -550,6 +553,7 @@ def parse_args(): default=False, help="Use py_generator mode. The returned type of Omni.generate() is a Python Generator object.", ) + nullify_stage_engine_defaults(parser) return parser.parse_args() diff --git a/examples/offline_inference/qwen3_omni/end2end.py b/examples/offline_inference/qwen3_omni/end2end.py index 04aa7914db1..59c7352c547 100644 --- a/examples/offline_inference/qwen3_omni/end2end.py +++ b/examples/offline_inference/qwen3_omni/end2end.py @@ -295,7 +295,9 @@ def main(args): else: query_result = query_func() - omni = Omni.from_cli_args(args, model=model_name) + omni_kwargs = vars(args).copy() + omni_kwargs["model"] = model_name + omni = Omni(**omni_kwargs) thinker_sampling_params = SamplingParams( temperature=0.9, diff --git a/examples/offline_inference/qwen3_omni/end2end_async_chunk.py b/examples/offline_inference/qwen3_omni/end2end_async_chunk.py index 85c2da20b04..a5bb6c33df8 100644 --- a/examples/offline_inference/qwen3_omni/end2end_async_chunk.py +++ b/examples/offline_inference/qwen3_omni/end2end_async_chunk.py @@ -41,6 +41,7 @@ from vllm.multimodal.media.audio import load_audio from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults from vllm_omni.entrypoints.async_omni import AsyncOmni logger = logging.getLogger(__name__) @@ -382,13 +383,7 @@ async def run_all(args): print(f"[Info] Creating AsyncOmni with deploy_config={args.deploy_config}") async_omni = None try: - # ``from_cli_args`` expands vars(args) into kwargs and auto-captures - # ``_cli_explicit_keys`` from ``sys.argv[1:]`` so argparse defaults - # do not silently override deploy YAML values. Mirrors the - # ``EngineArgs.from_cli_args`` pattern used throughout vllm / - # vllm-omni. ``deploy_config=None`` (the default) falls through to - # the bundled ``vllm_omni/deploy/qwen3_omni_moe.yaml``. - async_omni = AsyncOmni.from_cli_args(args) + async_omni = AsyncOmni(**vars(args)) # Use default sampling params from stage config (they are pre-configured # in the YAML for each stage). @@ -594,6 +589,7 @@ def parse_args(): default=16000, help="Sampling rate for audio loading.", ) + nullify_stage_engine_defaults(parser) return parser.parse_args() diff --git a/examples/offline_inference/qwen3_tts/end2end.py b/examples/offline_inference/qwen3_tts/end2end.py index a042f7e4658..130e919fd82 100644 --- a/examples/offline_inference/qwen3_tts/end2end.py +++ b/examples/offline_inference/qwen3_tts/end2end.py @@ -18,6 +18,7 @@ from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm_omni import AsyncOmni, Omni +from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults logger = logging.getLogger(__name__) @@ -383,7 +384,9 @@ def main(args): output_dir = args.output_dir os.makedirs(output_dir, exist_ok=True) - omni = Omni.from_cli_args(args, model=model_name) + omni_kwargs = vars(args).copy() + omni_kwargs["model"] = model_name + omni = Omni(**omni_kwargs) batch_size = args.batch_size for batch_start in range(0, len(inputs), batch_size): @@ -399,7 +402,9 @@ async def main_streaming(args): output_dir = args.output_dir os.makedirs(output_dir, exist_ok=True) - omni = AsyncOmni.from_cli_args(args, model=model_name) + omni_kwargs = vars(args).copy() + omni_kwargs["model"] = model_name + omni = AsyncOmni(**omni_kwargs) for i, prompt in enumerate(inputs): request_id = str(i) @@ -541,6 +546,7 @@ def parse_args(): help="Number of prompts per batch (default: 1, sequential).", ) + nullify_stage_engine_defaults(parser) return parser.parse_args() From 3e58af1ba9825b98711fdc3848eedeb975ca79f5 Mon Sep 17 00:00:00 2001 From: xiaohajiayou <923390377@qq.com> Date: Sat, 25 Apr 2026 17:55:03 +0800 Subject: [PATCH 2/9] Remove legacy Omni CLI arg helper Signed-off-by: xiaohajiayou <923390377@qq.com> --- tests/entrypoints/test_omni_entrypoints.py | 9 ++-- vllm_omni/entrypoints/omni_base.py | 29 ----------- vllm_omni/entrypoints/utils.py | 60 ---------------------- 3 files changed, 6 insertions(+), 92 deletions(-) diff --git a/tests/entrypoints/test_omni_entrypoints.py b/tests/entrypoints/test_omni_entrypoints.py index 36a0839e773..cc573b89372 100644 --- a/tests/entrypoints/test_omni_entrypoints.py +++ b/tests/entrypoints/test_omni_entrypoints.py @@ -167,7 +167,10 @@ def _patch_engine(monkeypatch: pytest.MonkeyPatch, engine: FakeAsyncOmniEngine) monkeypatch.setattr("vllm_omni.entrypoints.omni_base.omni_snapshot_download", lambda model: model) -def test_from_cli_args_only_nulls_untyped_override_fields(monkeypatch: pytest.MonkeyPatch): +def test_direct_omni_with_nullified_parser_only_nulls_untyped_override_fields( + monkeypatch: pytest.MonkeyPatch, +): + from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults from vllm_omni.entrypoints.omni import Omni captured: dict[str, Any] = {} @@ -178,15 +181,15 @@ def fake_engine(*args: Any, **kwargs: Any) -> FakeAsyncOmniEngine: monkeypatch.setattr("vllm_omni.entrypoints.omni_base.AsyncOmniEngine", fake_engine) monkeypatch.setattr("vllm_omni.entrypoints.omni_base.omni_snapshot_download", lambda model: model) - monkeypatch.setattr("sys.argv", ["prog"]) parser = argparse.ArgumentParser() parser.add_argument("--gpu-memory-utilization", type=float, default=0.9) parser.add_argument("--hsdp-shard-size", type=int, default=-1) + nullify_stage_engine_defaults(parser) args = parser.parse_args([]) args.model = "fake-model" - Omni.from_cli_args(args, parser=parser) + Omni(**vars(args)) assert captured["gpu_memory_utilization"] is None assert captured["hsdp_shard_size"] == -1 diff --git a/vllm_omni/entrypoints/omni_base.py b/vllm_omni/entrypoints/omni_base.py index 4147c802765..43e3306b05e 100644 --- a/vllm_omni/entrypoints/omni_base.py +++ b/vllm_omni/entrypoints/omni_base.py @@ -1,8 +1,6 @@ from __future__ import annotations -import argparse import os -import sys import time import types import weakref @@ -87,33 +85,6 @@ def omni_snapshot_download(model_id: str) -> str: class OmniBase(PDDisaggregationMixin): """Shared runtime foundation for AsyncOmni and Omni.""" - @classmethod - def from_cli_args( - cls, - args: argparse.Namespace, - *, - parser: argparse.ArgumentParser | None = None, - **overrides: Any, - ) -> OmniBase: - """Build from argparse. If ``parser`` is passed and not yet nullified, - un-typed engine fields are reset to ``None``.""" - kwargs: dict[str, Any] = {k: v for k, v in vars(args).items() if not k.startswith("_")} - - if parser is not None and not getattr(parser, "_omni_nullified", False): - from vllm_omni.engine.arg_utils import ( - deploy_override_field_names, - ) - from vllm_omni.entrypoints.utils import detect_explicit_cli_keys - - explicit = detect_explicit_cli_keys(sys.argv[1:], parser) or set() - override_dests = deploy_override_field_names() - for key in list(kwargs): - if key in override_dests and key not in explicit: - kwargs[key] = None - - kwargs.update(overrides) - return cls(**kwargs) - def __init__( self, model: str, diff --git a/vllm_omni/entrypoints/utils.py b/vllm_omni/entrypoints/utils.py index d728e76417c..02e37c32813 100644 --- a/vllm_omni/entrypoints/utils.py +++ b/vllm_omni/entrypoints/utils.py @@ -1,4 +1,3 @@ -import argparse import os import types from collections import Counter @@ -40,65 +39,6 @@ def _warn_deprecated_explicit_keys(kwargs: dict[str, Any]) -> None: } -def detect_explicit_cli_keys( - argv: list[str], - parser: argparse.ArgumentParser | None = None, -) -> set[str]: - """Walk ``argv`` and return the set of ``dest`` attribute names the user - explicitly provided (e.g. ``--max-num-seqs 64`` → ``max_num_seqs``). - - Used to distinguish user-typed CLI args from argparse default values so - deploy YAMLs are not silently overridden by parser defaults. Shared - across online (``vllm serve``) and offline (scripts, examples, tests, - CI) entry points — offline callers that parse CLI args via argparse - should invoke this on ``sys.argv[1:]`` and pass the result through to - ``AsyncOmni`` / ``Omni`` via the ``_cli_explicit_keys`` kwarg. - - When ``parser`` is provided, each token is looked up in the parser's - action table to find its real ``dest``. This correctly handles flags - with ``dest=`` overrides, alias flags (e.g. ``--usp`` / - ``--ulysses-degree`` both mapping to ``ulysses_degree``), and - ``--disable-foo`` / ``store_false`` patterns that map to a differently - named dest. Callers with access to an ``argparse.ArgumentParser`` should - always pass it. - - When ``parser`` is ``None``, a name-based heuristic is used as a - fallback (hyphens → underscores, plus a ``no_`` prefix strip for - ``argparse.BooleanOptionalAction``). This is correct for simple flags - but silently misidentifies ``--disable-X``-style flags and explicit - ``dest=`` overrides, so prefer the parser-aware form. - """ - if parser is not None: - dest_map: dict[str, str] = {} - for action in parser._actions: - for opt in action.option_strings: - dest_map[opt] = action.dest - explicit: set[str] = set() - for tok in argv: - if not tok.startswith("--"): - continue - flag = tok.split("=", 1)[0] - dest = dest_map.get(flag) - if dest is not None: - explicit.add(dest) - return explicit - - # Fallback: name-based heuristic (legacy path for callers without a parser). - explicit = set() - for tok in argv: - if not tok.startswith("--"): - continue - name = tok[2:].split("=", 1)[0] - if not name: - continue - attr = name.replace("-", "_") - explicit.add(attr) - # BooleanOptionalAction: --no-foo records as dest `foo`, not `no_foo`. - if attr.startswith("no_"): - explicit.add(attr[3:]) - return explicit - - def inject_omni_kv_config(stage: Any, omni_conn_cfg: dict[str, Any], omni_from: str, omni_to: str) -> None: """Inject connector configuration into stage engine arguments.""" # Prepare omni_kv_config dict From 9666c90f593930b9f9bd1cda49c65177ca487613 Mon Sep 17 00:00:00 2001 From: xiaohajiayou <923390377@qq.com> Date: Sat, 25 Apr 2026 17:56:12 +0800 Subject: [PATCH 3/9] Update serve test for nullified defaults Signed-off-by: xiaohajiayou <923390377@qq.com> --- tests/entrypoints/test_serve.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tests/entrypoints/test_serve.py b/tests/entrypoints/test_serve.py index 999c0a4ea0e..6d8ab96bae1 100644 --- a/tests/entrypoints/test_serve.py +++ b/tests/entrypoints/test_serve.py @@ -8,14 +8,13 @@ from pytest_mock import MockerFixture from vllm_omni.entrypoints.cli.serve import OmniServeCommand, run_headless -from vllm_omni.entrypoints.utils import detect_explicit_cli_keys pytestmark = [pytest.mark.core_model, pytest.mark.cpu] -def test_serve_parser_accepts_no_async_chunk_and_marks_it_explicit() -> None: - """``--no-async-chunk`` should parse to ``async_chunk=False`` and mark the - shared deploy-level dest as explicitly provided by the user.""" +def test_serve_parser_accepts_no_async_chunk() -> None: + """``--no-async-chunk`` should parse after deploy-overriding parser + defaults are nullified.""" try: from vllm.utils.argparse_utils import FlexibleArgumentParser except Exception as exc: @@ -24,14 +23,12 @@ def test_serve_parser_accepts_no_async_chunk_and_marks_it_explicit() -> None: root = FlexibleArgumentParser() subparsers = root.add_subparsers(dest="subcommand") cmd = OmniServeCommand() - serve_parser = cmd.subparser_init(subparsers) + cmd.subparser_init(subparsers) argv = ["serve", "fake-model", "--omni", "--no-async-chunk"] args = root.parse_args(argv) assert args.async_chunk is False - explicit = detect_explicit_cli_keys(argv, serve_parser) - assert "async_chunk" in explicit def _make_headless_args() -> argparse.Namespace: From 679854f267974e1cae9d9c4d242d3efbc0ed2303 Mon Sep 17 00:00:00 2001 From: xiaohajiayou <923390377@qq.com> Date: Sun, 26 Apr 2026 12:15:40 +0800 Subject: [PATCH 4/9] Add nullified defaults to remaining deploy-config examples Signed-off-by: xiaohajiayou <923390377@qq.com> --- examples/offline_inference/cosyvoice3/verify_e2e_cosyvoice.py | 2 ++ examples/offline_inference/mimo_audio/end2end.py | 2 ++ examples/offline_inference/voxtral_tts/end2end.py | 2 ++ 3 files changed, 6 insertions(+) diff --git a/examples/offline_inference/cosyvoice3/verify_e2e_cosyvoice.py b/examples/offline_inference/cosyvoice3/verify_e2e_cosyvoice.py index a5dc564ec3b..57778ece12f 100644 --- a/examples/offline_inference/cosyvoice3/verify_e2e_cosyvoice.py +++ b/examples/offline_inference/cosyvoice3/verify_e2e_cosyvoice.py @@ -9,6 +9,7 @@ from vllm.assets.audio import AudioAsset from vllm.multimodal.media.audio import load_audio +from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults from vllm_omni.entrypoints.omni import Omni from vllm_omni.model_executor.models.cosyvoice3.config import CosyVoice3Config from vllm_omni.model_executor.models.cosyvoice3.tokenizer import get_qwen_tokenizer @@ -44,6 +45,7 @@ def run_e2e(): required=True, help="Path to tokenizer directory (e.g., /CosyVoice-BlankEN).", ) + nullify_stage_engine_defaults(parser) args = parser.parse_args() # Ensure tokenizer directory exists if not os.path.exists(args.tokenizer): diff --git a/examples/offline_inference/mimo_audio/end2end.py b/examples/offline_inference/mimo_audio/end2end.py index 9c652fe2b05..d3728dcc55b 100644 --- a/examples/offline_inference/mimo_audio/end2end.py +++ b/examples/offline_inference/mimo_audio/end2end.py @@ -23,6 +23,7 @@ from vllm import SamplingParams from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults from vllm_omni.entrypoints.omni import Omni from vllm_omni.inputs.data import OmniTokensPrompt @@ -438,6 +439,7 @@ def parse_args(): "vllm_omni/deploy/mimo_audio.yaml based on the HF model_type.", ) + nullify_stage_engine_defaults(parser) return parser.parse_args() diff --git a/examples/offline_inference/voxtral_tts/end2end.py b/examples/offline_inference/voxtral_tts/end2end.py index 0a6f88715a9..497ba607b90 100644 --- a/examples/offline_inference/voxtral_tts/end2end.py +++ b/examples/offline_inference/voxtral_tts/end2end.py @@ -30,6 +30,7 @@ from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm_omni import AsyncOmni +from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults from vllm_omni.entrypoints.omni import Omni logger = logging.getLogger(__name__) @@ -304,6 +305,7 @@ def parse_args() -> Namespace: default=None, help="CFG alpha for flow-matching guidance (default: use value from stage config, typically 1.2).", ) + nullify_stage_engine_defaults(parser) return parser.parse_args() From 76adc0b0d16c43b0622d223c28a5b6174136e0b3 Mon Sep 17 00:00:00 2001 From: xiaohajiayou <923390377@qq.com> Date: Sun, 26 Apr 2026 13:23:56 +0800 Subject: [PATCH 5/9] Restore deprecated from_cli_args shim Signed-off-by: xiaohajiayou <923390377@qq.com> --- examples/offline_inference/bagel/end2end.py | 1 + .../offline_inference/dynin_omni/end2end.py | 3 ++- .../offline_inference/fish_speech/end2end.py | 2 ++ .../offline_inference/omnivoice/end2end.py | 2 ++ .../offline_inference/qwen2_5_omni/end2end.py | 1 + .../offline_inference/qwen3_omni/end2end.py | 1 + .../offline_inference/qwen3_tts/end2end.py | 2 ++ examples/offline_inference/voxcpm/end2end.py | 2 ++ examples/offline_inference/voxcpm2/end2end.py | 2 ++ tests/entrypoints/test_omni_entrypoints.py | 21 +++++++++++++++++++ vllm_omni/entrypoints/omni_base.py | 21 +++++++++++++++++++ 11 files changed, 57 insertions(+), 1 deletion(-) diff --git a/examples/offline_inference/bagel/end2end.py b/examples/offline_inference/bagel/end2end.py index f196cbaa95b..3ce7ba8d57e 100644 --- a/examples/offline_inference/bagel/end2end.py +++ b/examples/offline_inference/bagel/end2end.py @@ -164,6 +164,7 @@ def main(): if args.quantization: omni_kwargs["quantization_config"] = args.quantization + # Override CLI --model with the derived model_name. omni_kwargs["model"] = model_name omni = Omni(**omni_kwargs) diff --git a/examples/offline_inference/dynin_omni/end2end.py b/examples/offline_inference/dynin_omni/end2end.py index 82cff0c0015..beba6be0c29 100644 --- a/examples/offline_inference/dynin_omni/end2end.py +++ b/examples/offline_inference/dynin_omni/end2end.py @@ -18,6 +18,8 @@ import torch from PIL import Image +from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults + TASK_CHOICES = ("t2t", "t2i", "t2s", "i2i", "i2t", "s2t", "v2t") TASK_DEFAULT_RUNTIME = { @@ -970,7 +972,6 @@ def parse_args(repo_root: Path) -> argparse.Namespace: parser.add_argument("--vq-model-audio-local-files-only", action=argparse.BooleanOptionalAction, default=None) parser.add_argument("--disable-hf-xet", action=argparse.BooleanOptionalAction, default=True) - from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults nullify_stage_engine_defaults(parser) return parser.parse_args() diff --git a/examples/offline_inference/fish_speech/end2end.py b/examples/offline_inference/fish_speech/end2end.py index 60830d06b7f..94fb6d242d7 100644 --- a/examples/offline_inference/fish_speech/end2end.py +++ b/examples/offline_inference/fish_speech/end2end.py @@ -29,6 +29,7 @@ from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm_omni import AsyncOmni, Omni +from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults from vllm_omni.model_executor.models.fish_speech.dac_utils import DAC_HOP_LENGTH, DAC_SAMPLE_RATE from vllm_omni.model_executor.models.fish_speech.prompt_utils import ( build_fish_text_only_prompt_ids, @@ -265,6 +266,7 @@ def parse_args(): default=False, help="Stream audio chunks as they arrive via AsyncOmni.", ) + nullify_stage_engine_defaults(parser) return parser.parse_args() diff --git a/examples/offline_inference/omnivoice/end2end.py b/examples/offline_inference/omnivoice/end2end.py index cc6f585c50e..827fe552739 100644 --- a/examples/offline_inference/omnivoice/end2end.py +++ b/examples/offline_inference/omnivoice/end2end.py @@ -21,6 +21,7 @@ import numpy as np import soundfile as sf +from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults from vllm_omni.entrypoints.omni import Omni from vllm_omni.inputs.data import OmniDiffusionSamplingParams @@ -79,6 +80,7 @@ def run_e2e(): default=600, help="Stage initialization timeout in seconds", ) + nullify_stage_engine_defaults(parser) args = parser.parse_args() if not os.path.exists(args.stage_config): diff --git a/examples/offline_inference/qwen2_5_omni/end2end.py b/examples/offline_inference/qwen2_5_omni/end2end.py index ec98fd3add4..ad118b2b134 100644 --- a/examples/offline_inference/qwen2_5_omni/end2end.py +++ b/examples/offline_inference/qwen2_5_omni/end2end.py @@ -327,6 +327,7 @@ def main(args): query_result = query_func() args.quantization_config = quantization_config omni_kwargs = vars(args).copy() + # Override CLI --model with the derived model_name. omni_kwargs["model"] = model_name omni = Omni(**omni_kwargs) thinker_sampling_params = SamplingParams( diff --git a/examples/offline_inference/qwen3_omni/end2end.py b/examples/offline_inference/qwen3_omni/end2end.py index 59c7352c547..506c99c2276 100644 --- a/examples/offline_inference/qwen3_omni/end2end.py +++ b/examples/offline_inference/qwen3_omni/end2end.py @@ -296,6 +296,7 @@ def main(args): query_result = query_func() omni_kwargs = vars(args).copy() + # Override CLI --model with the derived model_name. omni_kwargs["model"] = model_name omni = Omni(**omni_kwargs) diff --git a/examples/offline_inference/qwen3_tts/end2end.py b/examples/offline_inference/qwen3_tts/end2end.py index 130e919fd82..6d026fb86c1 100644 --- a/examples/offline_inference/qwen3_tts/end2end.py +++ b/examples/offline_inference/qwen3_tts/end2end.py @@ -385,6 +385,7 @@ def main(args): os.makedirs(output_dir, exist_ok=True) omni_kwargs = vars(args).copy() + # Override CLI --model with the derived model_name. omni_kwargs["model"] = model_name omni = Omni(**omni_kwargs) @@ -403,6 +404,7 @@ async def main_streaming(args): os.makedirs(output_dir, exist_ok=True) omni_kwargs = vars(args).copy() + # Override CLI --model with the derived model_name. omni_kwargs["model"] = model_name omni = AsyncOmni(**omni_kwargs) diff --git a/examples/offline_inference/voxcpm/end2end.py b/examples/offline_inference/voxcpm/end2end.py index 980410feaeb..7363f798451 100644 --- a/examples/offline_inference/voxcpm/end2end.py +++ b/examples/offline_inference/voxcpm/end2end.py @@ -12,6 +12,7 @@ from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm_omni import AsyncOmni, Omni +from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults REPO_ROOT = Path(__file__).resolve().parents[3] DEFAULT_SYNC_STAGE_CONFIG = REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "voxcpm.yaml" @@ -185,6 +186,7 @@ def parse_args(): ) parser.add_argument("--stage-init-timeout", type=int, default=600, help="Stage initialization timeout in seconds.") parser.add_argument("--log-stats", action="store_true", help="Enable vLLM Omni stats logging.") + nullify_stage_engine_defaults(parser) args = parser.parse_args() if (args.ref_audio is None) != (args.ref_text is None): raise ValueError("Voice cloning requires --ref-audio and --ref-text together.") diff --git a/examples/offline_inference/voxcpm2/end2end.py b/examples/offline_inference/voxcpm2/end2end.py index 6b6bf78ddf1..b20c2ba9143 100644 --- a/examples/offline_inference/voxcpm2/end2end.py +++ b/examples/offline_inference/voxcpm2/end2end.py @@ -15,6 +15,7 @@ from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm_omni import Omni +from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults REPO_ROOT = Path(__file__).resolve().parents[3] DEFAULT_STAGE_CONFIGS_PATH = str(REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "voxcpm2.yaml") @@ -71,6 +72,7 @@ def parse_args(): default=None, help="Optional transcript of --reference-audio (enables ref_continuation mode).", ) + nullify_stage_engine_defaults(parser) return parser.parse_args() diff --git a/tests/entrypoints/test_omni_entrypoints.py b/tests/entrypoints/test_omni_entrypoints.py index cc573b89372..f1bb95ff870 100644 --- a/tests/entrypoints/test_omni_entrypoints.py +++ b/tests/entrypoints/test_omni_entrypoints.py @@ -193,6 +193,27 @@ def fake_engine(*args: Any, **kwargs: Any) -> FakeAsyncOmniEngine: assert captured["gpu_memory_utilization"] is None assert captured["hsdp_shard_size"] == -1 + assert "_cli_explicit_keys" not in captured + + +def test_from_cli_args_warns_and_forwards_without_internal_keys( + monkeypatch: pytest.MonkeyPatch, +): + captured: dict[str, Any] = {} + + def fake_engine(*args: Any, **kwargs: Any) -> FakeAsyncOmniEngine: + captured.update(kwargs) + return FakeAsyncOmniEngine() + + monkeypatch.setattr("vllm_omni.entrypoints.omni_base.AsyncOmniEngine", fake_engine) + monkeypatch.setattr("vllm_omni.entrypoints.omni_base.omni_snapshot_download", lambda model: model) + + args = argparse.Namespace(model="fake-model", gpu_memory_utilization=0.9, _cli_explicit_keys={"model"}) + with pytest.deprecated_call(match="from_cli_args"): + Omni.from_cli_args(args) + + assert captured["gpu_memory_utilization"] == 0.9 + assert "_cli_explicit_keys" not in captured def _make_base(): diff --git a/vllm_omni/entrypoints/omni_base.py b/vllm_omni/entrypoints/omni_base.py index 43e3306b05e..652ac129a85 100644 --- a/vllm_omni/entrypoints/omni_base.py +++ b/vllm_omni/entrypoints/omni_base.py @@ -1,8 +1,10 @@ from __future__ import annotations +import argparse import os import time import types +import warnings import weakref from collections.abc import Sequence from typing import TYPE_CHECKING, Any, Literal @@ -85,6 +87,25 @@ def omni_snapshot_download(model_id: str) -> str: class OmniBase(PDDisaggregationMixin): """Shared runtime foundation for AsyncOmni and Omni.""" + @classmethod + def from_cli_args( + cls, + args: argparse.Namespace, + *, + parser: argparse.ArgumentParser | None = None, + **overrides: Any, + ) -> OmniBase: + warnings.warn( + "`from_cli_args()` is deprecated. Nullify deploy-overriding parser defaults " + "with `nullify_stage_engine_defaults(parser)` and construct Omni/AsyncOmni " + "directly from `vars(args)`.", + DeprecationWarning, + stacklevel=2, + ) + kwargs: dict[str, Any] = {k: v for k, v in vars(args).items() if not k.startswith("_")} + kwargs.update(overrides) + return cls(**kwargs) + def __init__( self, model: str, From b01ea074e6f6b9f39a64777daa696c448d99346a Mon Sep 17 00:00:00 2001 From: xiaohajiayou <923390377@qq.com> Date: Mon, 27 Apr 2026 00:05:57 +0800 Subject: [PATCH 6/9] Preserve deprecated Omni from_cli_args behavior Signed-off-by: xiaohajiayou <923390377@qq.com> --- tests/entrypoints/test_omni_entrypoints.py | 28 +++++++++++++++++ vllm_omni/entrypoints/omni_base.py | 12 ++++++++ vllm_omni/entrypoints/utils.py | 35 ++++++++++++++++++++++ 3 files changed, 75 insertions(+) diff --git a/tests/entrypoints/test_omni_entrypoints.py b/tests/entrypoints/test_omni_entrypoints.py index f1bb95ff870..251e54ab83d 100644 --- a/tests/entrypoints/test_omni_entrypoints.py +++ b/tests/entrypoints/test_omni_entrypoints.py @@ -216,6 +216,34 @@ def fake_engine(*args: Any, **kwargs: Any) -> FakeAsyncOmniEngine: assert "_cli_explicit_keys" not in captured +def test_deprecated_from_cli_args_preserves_legacy_parser_nulling( + monkeypatch: pytest.MonkeyPatch, +): + from vllm_omni.entrypoints.omni import Omni + + captured: dict[str, Any] = {} + + def fake_engine(*args: Any, **kwargs: Any) -> FakeAsyncOmniEngine: + captured.update(kwargs) + return FakeAsyncOmniEngine() + + monkeypatch.setattr("vllm_omni.entrypoints.omni_base.AsyncOmniEngine", fake_engine) + monkeypatch.setattr("vllm_omni.entrypoints.omni_base.omni_snapshot_download", lambda model: model) + monkeypatch.setattr("sys.argv", ["prog"]) + + parser = argparse.ArgumentParser() + parser.add_argument("--gpu-memory-utilization", type=float, default=0.9) + parser.add_argument("--hsdp-shard-size", type=int, default=-1) + args = parser.parse_args([]) + args.model = "fake-model" + + with pytest.deprecated_call(match="from_cli_args"): + Omni.from_cli_args(args, parser=parser) + + assert captured["gpu_memory_utilization"] is None + assert captured["hsdp_shard_size"] == -1 + + def _make_base(): from vllm_omni.entrypoints.omni_base import OmniBase diff --git a/vllm_omni/entrypoints/omni_base.py b/vllm_omni/entrypoints/omni_base.py index 652ac129a85..3071df5ce9e 100644 --- a/vllm_omni/entrypoints/omni_base.py +++ b/vllm_omni/entrypoints/omni_base.py @@ -2,6 +2,7 @@ import argparse import os +import sys import time import types import warnings @@ -103,6 +104,17 @@ def from_cli_args( stacklevel=2, ) kwargs: dict[str, Any] = {k: v for k, v in vars(args).items() if not k.startswith("_")} + + if parser is not None and not getattr(parser, "_omni_nullified", False): + from vllm_omni.engine.arg_utils import deploy_override_field_names + from vllm_omni.entrypoints.utils import detect_explicit_cli_keys + + explicit = detect_explicit_cli_keys(sys.argv[1:], parser) or set() + override_dests = deploy_override_field_names() + for key in list(kwargs): + if key in override_dests and key not in explicit: + kwargs[key] = None + kwargs.update(overrides) return cls(**kwargs) diff --git a/vllm_omni/entrypoints/utils.py b/vllm_omni/entrypoints/utils.py index 02e37c32813..e91f5474778 100644 --- a/vllm_omni/entrypoints/utils.py +++ b/vllm_omni/entrypoints/utils.py @@ -1,3 +1,4 @@ +import argparse import os import types from collections import Counter @@ -39,6 +40,40 @@ def _warn_deprecated_explicit_keys(kwargs: dict[str, Any]) -> None: } +def detect_explicit_cli_keys( + argv: list[str], + parser: argparse.ArgumentParser | None = None, +) -> set[str]: + """Return argparse dest names explicitly provided on the CLI.""" + if parser is not None: + dest_map: dict[str, str] = {} + for action in parser._actions: + for opt in action.option_strings: + dest_map[opt] = action.dest + explicit: set[str] = set() + for tok in argv: + if not tok.startswith("--"): + continue + flag = tok.split("=", 1)[0] + dest = dest_map.get(flag) + if dest is not None: + explicit.add(dest) + return explicit + + explicit = set() + for tok in argv: + if not tok.startswith("--"): + continue + name = tok[2:].split("=", 1)[0] + if not name: + continue + attr = name.replace("-", "_") + explicit.add(attr) + if attr.startswith("no_"): + explicit.add(attr[3:]) + return explicit + + def inject_omni_kv_config(stage: Any, omni_conn_cfg: dict[str, Any], omni_from: str, omni_to: str) -> None: """Inject connector configuration into stage engine arguments.""" # Prepare omni_kv_config dict From c6e5835c8f8b82023d5efdd63d88310184c8ff7e Mon Sep 17 00:00:00 2001 From: xiaohajiayou <923390377@qq.com> Date: Mon, 27 Apr 2026 00:08:26 +0800 Subject: [PATCH 7/9] Restore CLI helper documentation Signed-off-by: xiaohajiayou <923390377@qq.com> --- vllm_omni/entrypoints/omni_base.py | 8 ++++++++ vllm_omni/entrypoints/utils.py | 25 ++++++++++++++++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/vllm_omni/entrypoints/omni_base.py b/vllm_omni/entrypoints/omni_base.py index 3071df5ce9e..9b8ae286963 100644 --- a/vllm_omni/entrypoints/omni_base.py +++ b/vllm_omni/entrypoints/omni_base.py @@ -96,6 +96,14 @@ def from_cli_args( parser: argparse.ArgumentParser | None = None, **overrides: Any, ) -> OmniBase: + """Deprecated argparse builder. + + Build from argparse. If ``parser`` is passed and not yet nullified, + un-typed engine fields are reset to ``None``. New callers should + nullify deploy-overriding parser defaults with + ``nullify_stage_engine_defaults(parser)`` and construct Omni/AsyncOmni + directly. + """ warnings.warn( "`from_cli_args()` is deprecated. Nullify deploy-overriding parser defaults " "with `nullify_stage_engine_defaults(parser)` and construct Omni/AsyncOmni " diff --git a/vllm_omni/entrypoints/utils.py b/vllm_omni/entrypoints/utils.py index e91f5474778..91704b10967 100644 --- a/vllm_omni/entrypoints/utils.py +++ b/vllm_omni/entrypoints/utils.py @@ -44,7 +44,28 @@ def detect_explicit_cli_keys( argv: list[str], parser: argparse.ArgumentParser | None = None, ) -> set[str]: - """Return argparse dest names explicitly provided on the CLI.""" + """Walk ``argv`` and return the set of ``dest`` attribute names the user + explicitly provided (e.g. ``--max-num-seqs 64`` -> ``max_num_seqs``). + + Used to distinguish user-typed CLI args from argparse default values so + deploy YAMLs are not silently overridden by parser defaults. Shared + across online (``vllm serve``) and offline (scripts, examples, tests, + CI) entry points. + + When ``parser`` is provided, each token is looked up in the parser's + action table to find its real ``dest``. This correctly handles flags + with ``dest=`` overrides, alias flags (e.g. ``--usp`` / + ``--ulysses-degree`` both mapping to ``ulysses_degree``), and + ``--disable-foo`` / ``store_false`` patterns that map to a differently + named dest. Callers with access to an ``argparse.ArgumentParser`` should + always pass it. + + When ``parser`` is ``None``, a name-based heuristic is used as a + fallback (hyphens -> underscores, plus a ``no_`` prefix strip for + ``argparse.BooleanOptionalAction``). This is correct for simple flags + but silently misidentifies ``--disable-X``-style flags and explicit + ``dest=`` overrides, so prefer the parser-aware form. + """ if parser is not None: dest_map: dict[str, str] = {} for action in parser._actions: @@ -60,6 +81,7 @@ def detect_explicit_cli_keys( explicit.add(dest) return explicit + # Fallback: name-based heuristic (legacy path for callers without a parser). explicit = set() for tok in argv: if not tok.startswith("--"): @@ -69,6 +91,7 @@ def detect_explicit_cli_keys( continue attr = name.replace("-", "_") explicit.add(attr) + # BooleanOptionalAction: --no-foo records as dest `foo`, not `no_foo`. if attr.startswith("no_"): explicit.add(attr[3:]) return explicit From 8bce4bf89bbd36564989186e97efc90ecdf15249 Mon Sep 17 00:00:00 2001 From: xiaohajiayou <923390377@qq.com> Date: Mon, 27 Apr 2026 00:10:43 +0800 Subject: [PATCH 8/9] Restore exact CLI helper comments Signed-off-by: xiaohajiayou <923390377@qq.com> --- vllm_omni/entrypoints/utils.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm_omni/entrypoints/utils.py b/vllm_omni/entrypoints/utils.py index 91704b10967..d728e76417c 100644 --- a/vllm_omni/entrypoints/utils.py +++ b/vllm_omni/entrypoints/utils.py @@ -45,12 +45,14 @@ def detect_explicit_cli_keys( parser: argparse.ArgumentParser | None = None, ) -> set[str]: """Walk ``argv`` and return the set of ``dest`` attribute names the user - explicitly provided (e.g. ``--max-num-seqs 64`` -> ``max_num_seqs``). + explicitly provided (e.g. ``--max-num-seqs 64`` → ``max_num_seqs``). Used to distinguish user-typed CLI args from argparse default values so deploy YAMLs are not silently overridden by parser defaults. Shared across online (``vllm serve``) and offline (scripts, examples, tests, - CI) entry points. + CI) entry points — offline callers that parse CLI args via argparse + should invoke this on ``sys.argv[1:]`` and pass the result through to + ``AsyncOmni`` / ``Omni`` via the ``_cli_explicit_keys`` kwarg. When ``parser`` is provided, each token is looked up in the parser's action table to find its real ``dest``. This correctly handles flags @@ -61,7 +63,7 @@ def detect_explicit_cli_keys( always pass it. When ``parser`` is ``None``, a name-based heuristic is used as a - fallback (hyphens -> underscores, plus a ``no_`` prefix strip for + fallback (hyphens → underscores, plus a ``no_`` prefix strip for ``argparse.BooleanOptionalAction``). This is correct for simple flags but silently misidentifies ``--disable-X``-style flags and explicit ``dest=`` overrides, so prefer the parser-aware form. From 221ddd2b332f61b5bdbbcc68b8ef1ed7d3809b9f Mon Sep 17 00:00:00 2001 From: xiaohajiayou <923390377@qq.com> Date: Mon, 4 May 2026 22:58:31 +0800 Subject: [PATCH 9/9] Migrate remaining from_cli_args calls to direct Omni construction Replace Omni.from_cli_args(args, ...) with Omni(**vars(args)) in ming_flash_omni and ming_flash_omni_tts examples. Signed-off-by: xiaohajiayou <923390377@qq.com> --- examples/offline_inference/ming_flash_omni/end2end.py | 5 ++++- examples/offline_inference/ming_flash_omni_tts/end2end.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/offline_inference/ming_flash_omni/end2end.py b/examples/offline_inference/ming_flash_omni/end2end.py index 6c8751839e2..de59d1b6690 100644 --- a/examples/offline_inference/ming_flash_omni/end2end.py +++ b/examples/offline_inference/ming_flash_omni/end2end.py @@ -303,7 +303,10 @@ def main(args): else: query_result = query_func(processor) - omni = Omni.from_cli_args(args, model=MODEL_NAME) + omni_kwargs = vars(args).copy() + # override CLI --model with derived model_name + omni_kwargs["model"] = MODEL_NAME + omni = Omni(**omni_kwargs) # Thinker sampling params thinker_sampling_params = SamplingParams( diff --git a/examples/offline_inference/ming_flash_omni_tts/end2end.py b/examples/offline_inference/ming_flash_omni_tts/end2end.py index 8a26d1b33ea..18fdc9d6159 100644 --- a/examples/offline_inference/ming_flash_omni_tts/end2end.py +++ b/examples/offline_inference/ming_flash_omni_tts/end2end.py @@ -96,7 +96,7 @@ def parse_args(): def main(): args = parse_args() - omni = Omni.from_cli_args(args, model=args.model) + omni = Omni(**vars(args)) messages = get_messages(args.case, args.text) decode_args = {