diff --git a/examples/offline_inference/bagel/end2end.py b/examples/offline_inference/bagel/end2end.py index a6ce1f1314f..3ce7ba8d57e 100644 --- a/examples/offline_inference/bagel/end2end.py +++ b/examples/offline_inference/bagel/end2end.py @@ -153,7 +153,7 @@ def main(): from vllm_omni.entrypoints.omni import Omni - omni_kwargs = {} + omni_kwargs = vars(args).copy() deploy_config = args.deploy_config if args.think and deploy_config is None: deploy_config = "vllm_omni/deploy/bagel_think.yaml" @@ -161,22 +161,12 @@ def main(): if deploy_config: omni_kwargs["deploy_config"] = deploy_config - omni_kwargs.update( - { - "log_stats": args.log_stats, - "init_sleep_seconds": args.init_sleep_seconds, - "batch_timeout": args.batch_timeout, - "init_timeout": args.init_timeout, - "shm_threshold_bytes": args.shm_threshold_bytes, - "worker_backend": args.worker_backend, - "ray_address": args.ray_address, - "enable_diffusion_pipeline_profiler": args.enable_diffusion_pipeline_profiler, - } - ) if args.quantization: omni_kwargs["quantization_config"] = args.quantization - omni = Omni.from_cli_args(args, model=model_name, **omni_kwargs) + # Override CLI --model with the derived model_name. + omni_kwargs["model"] = model_name + omni = Omni(**omni_kwargs) formatted_prompts = [] for p in prompts: diff --git a/examples/offline_inference/dynin_omni/end2end.py b/examples/offline_inference/dynin_omni/end2end.py index 82cff0c0015..beba6be0c29 100644 --- a/examples/offline_inference/dynin_omni/end2end.py +++ b/examples/offline_inference/dynin_omni/end2end.py @@ -18,6 +18,8 @@ import torch from PIL import Image +from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults + TASK_CHOICES = ("t2t", "t2i", "t2s", "i2i", "i2t", "s2t", "v2t") TASK_DEFAULT_RUNTIME = { @@ -970,7 +972,6 @@ def parse_args(repo_root: Path) -> argparse.Namespace: parser.add_argument("--vq-model-audio-local-files-only", action=argparse.BooleanOptionalAction, default=None) parser.add_argument("--disable-hf-xet", action=argparse.BooleanOptionalAction, default=True) - from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults nullify_stage_engine_defaults(parser) return parser.parse_args() diff --git a/examples/offline_inference/mimo_audio/end2end.py b/examples/offline_inference/mimo_audio/end2end.py index 9c652fe2b05..d3728dcc55b 100644 --- a/examples/offline_inference/mimo_audio/end2end.py +++ b/examples/offline_inference/mimo_audio/end2end.py @@ -23,6 +23,7 @@ from vllm import SamplingParams from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults from vllm_omni.entrypoints.omni import Omni from vllm_omni.inputs.data import OmniTokensPrompt @@ -438,6 +439,7 @@ def parse_args(): "vllm_omni/deploy/mimo_audio.yaml based on the HF model_type.", ) + nullify_stage_engine_defaults(parser) return parser.parse_args() diff --git a/examples/offline_inference/ming_flash_omni/end2end.py b/examples/offline_inference/ming_flash_omni/end2end.py index 6c8751839e2..de59d1b6690 100644 --- a/examples/offline_inference/ming_flash_omni/end2end.py +++ b/examples/offline_inference/ming_flash_omni/end2end.py @@ -303,7 +303,10 @@ def main(args): else: query_result = query_func(processor) - omni = Omni.from_cli_args(args, model=MODEL_NAME) + omni_kwargs = vars(args).copy() + # override CLI --model with derived model_name + omni_kwargs["model"] = MODEL_NAME + omni = Omni(**omni_kwargs) # Thinker sampling params thinker_sampling_params = SamplingParams( diff --git a/examples/offline_inference/qwen2_5_omni/end2end.py b/examples/offline_inference/qwen2_5_omni/end2end.py index a65c554a9b0..ad118b2b134 100644 --- a/examples/offline_inference/qwen2_5_omni/end2end.py +++ b/examples/offline_inference/qwen2_5_omni/end2end.py @@ -21,6 +21,7 @@ from vllm.sampling_params import SamplingParams from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults from vllm_omni.entrypoints.omni import Omni SEED = 42 @@ -325,7 +326,10 @@ def main(args): else: query_result = query_func() args.quantization_config = quantization_config - omni = Omni.from_cli_args(args, model=model_name) + omni_kwargs = vars(args).copy() + # Override CLI --model with the derived model_name. + omni_kwargs["model"] = model_name + omni = Omni(**omni_kwargs) thinker_sampling_params = SamplingParams( temperature=0.0, # Deterministic - no randomness top_p=1.0, # Disable nucleus sampling @@ -550,6 +554,7 @@ def parse_args(): default=False, help="Use py_generator mode. The returned type of Omni.generate() is a Python Generator object.", ) + nullify_stage_engine_defaults(parser) return parser.parse_args() diff --git a/examples/offline_inference/qwen3_omni/end2end.py b/examples/offline_inference/qwen3_omni/end2end.py index 04aa7914db1..506c99c2276 100644 --- a/examples/offline_inference/qwen3_omni/end2end.py +++ b/examples/offline_inference/qwen3_omni/end2end.py @@ -295,7 +295,10 @@ def main(args): else: query_result = query_func() - omni = Omni.from_cli_args(args, model=model_name) + omni_kwargs = vars(args).copy() + # Override CLI --model with the derived model_name. + omni_kwargs["model"] = model_name + omni = Omni(**omni_kwargs) thinker_sampling_params = SamplingParams( temperature=0.9, diff --git a/examples/offline_inference/qwen3_omni/end2end_async_chunk.py b/examples/offline_inference/qwen3_omni/end2end_async_chunk.py index 85c2da20b04..a5bb6c33df8 100644 --- a/examples/offline_inference/qwen3_omni/end2end_async_chunk.py +++ b/examples/offline_inference/qwen3_omni/end2end_async_chunk.py @@ -41,6 +41,7 @@ from vllm.multimodal.media.audio import load_audio from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults from vllm_omni.entrypoints.async_omni import AsyncOmni logger = logging.getLogger(__name__) @@ -382,13 +383,7 @@ async def run_all(args): print(f"[Info] Creating AsyncOmni with deploy_config={args.deploy_config}") async_omni = None try: - # ``from_cli_args`` expands vars(args) into kwargs and auto-captures - # ``_cli_explicit_keys`` from ``sys.argv[1:]`` so argparse defaults - # do not silently override deploy YAML values. Mirrors the - # ``EngineArgs.from_cli_args`` pattern used throughout vllm / - # vllm-omni. ``deploy_config=None`` (the default) falls through to - # the bundled ``vllm_omni/deploy/qwen3_omni_moe.yaml``. - async_omni = AsyncOmni.from_cli_args(args) + async_omni = AsyncOmni(**vars(args)) # Use default sampling params from stage config (they are pre-configured # in the YAML for each stage). @@ -594,6 +589,7 @@ def parse_args(): default=16000, help="Sampling rate for audio loading.", ) + nullify_stage_engine_defaults(parser) return parser.parse_args() diff --git a/examples/offline_inference/text_to_speech/cosyvoice3/end2end.py b/examples/offline_inference/text_to_speech/cosyvoice3/end2end.py index c3b80152fde..0332ab38236 100644 --- a/examples/offline_inference/text_to_speech/cosyvoice3/end2end.py +++ b/examples/offline_inference/text_to_speech/cosyvoice3/end2end.py @@ -9,6 +9,7 @@ from vllm.assets.audio import AudioAsset from vllm.multimodal.media.audio import load_audio +from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults from vllm_omni.entrypoints.omni import Omni from vllm_omni.model_executor.models.cosyvoice3.config import CosyVoice3Config from vllm_omni.model_executor.models.cosyvoice3.tokenizer import get_qwen_tokenizer @@ -44,6 +45,7 @@ def run_e2e(): required=True, help="Path to tokenizer directory (e.g., /CosyVoice-BlankEN).", ) + nullify_stage_engine_defaults(parser) args = parser.parse_args() # Ensure tokenizer directory exists if not os.path.exists(args.tokenizer): diff --git a/examples/offline_inference/text_to_speech/fish_speech/end2end.py b/examples/offline_inference/text_to_speech/fish_speech/end2end.py index 237aa8351b0..e63b844e76e 100644 --- a/examples/offline_inference/text_to_speech/fish_speech/end2end.py +++ b/examples/offline_inference/text_to_speech/fish_speech/end2end.py @@ -29,6 +29,7 @@ from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm_omni import AsyncOmni, Omni +from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults from vllm_omni.model_executor.models.fish_speech.dac_utils import DAC_HOP_LENGTH, DAC_SAMPLE_RATE from vllm_omni.model_executor.models.fish_speech.prompt_utils import ( build_fish_text_only_prompt_ids, @@ -266,6 +267,7 @@ def parse_args(): default=False, help="Stream audio chunks as they arrive via AsyncOmni.", ) + nullify_stage_engine_defaults(parser) return parser.parse_args() diff --git a/examples/offline_inference/text_to_speech/ming_flash_omni_tts/end2end.py b/examples/offline_inference/text_to_speech/ming_flash_omni_tts/end2end.py index 8a26d1b33ea..18fdc9d6159 100644 --- a/examples/offline_inference/text_to_speech/ming_flash_omni_tts/end2end.py +++ b/examples/offline_inference/text_to_speech/ming_flash_omni_tts/end2end.py @@ -96,7 +96,7 @@ def parse_args(): def main(): args = parse_args() - omni = Omni.from_cli_args(args, model=args.model) + omni = Omni(**vars(args)) messages = get_messages(args.case, args.text) decode_args = { diff --git a/examples/offline_inference/text_to_speech/omnivoice/end2end.py b/examples/offline_inference/text_to_speech/omnivoice/end2end.py index cc6f585c50e..827fe552739 100644 --- a/examples/offline_inference/text_to_speech/omnivoice/end2end.py +++ b/examples/offline_inference/text_to_speech/omnivoice/end2end.py @@ -21,6 +21,7 @@ import numpy as np import soundfile as sf +from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults from vllm_omni.entrypoints.omni import Omni from vllm_omni.inputs.data import OmniDiffusionSamplingParams @@ -79,6 +80,7 @@ def run_e2e(): default=600, help="Stage initialization timeout in seconds", ) + nullify_stage_engine_defaults(parser) args = parser.parse_args() if not os.path.exists(args.stage_config): diff --git a/examples/offline_inference/text_to_speech/qwen3_tts/end2end.py b/examples/offline_inference/text_to_speech/qwen3_tts/end2end.py index a042f7e4658..6d026fb86c1 100644 --- a/examples/offline_inference/text_to_speech/qwen3_tts/end2end.py +++ b/examples/offline_inference/text_to_speech/qwen3_tts/end2end.py @@ -18,6 +18,7 @@ from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm_omni import AsyncOmni, Omni +from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults logger = logging.getLogger(__name__) @@ -383,7 +384,10 @@ def main(args): output_dir = args.output_dir os.makedirs(output_dir, exist_ok=True) - omni = Omni.from_cli_args(args, model=model_name) + omni_kwargs = vars(args).copy() + # Override CLI --model with the derived model_name. + omni_kwargs["model"] = model_name + omni = Omni(**omni_kwargs) batch_size = args.batch_size for batch_start in range(0, len(inputs), batch_size): @@ -399,7 +403,10 @@ async def main_streaming(args): output_dir = args.output_dir os.makedirs(output_dir, exist_ok=True) - omni = AsyncOmni.from_cli_args(args, model=model_name) + omni_kwargs = vars(args).copy() + # Override CLI --model with the derived model_name. + omni_kwargs["model"] = model_name + omni = AsyncOmni(**omni_kwargs) for i, prompt in enumerate(inputs): request_id = str(i) @@ -541,6 +548,7 @@ def parse_args(): help="Number of prompts per batch (default: 1, sequential).", ) + nullify_stage_engine_defaults(parser) return parser.parse_args() diff --git a/examples/offline_inference/text_to_speech/voxcpm/end2end.py b/examples/offline_inference/text_to_speech/voxcpm/end2end.py index d92fbc55cd4..dce71f62107 100644 --- a/examples/offline_inference/text_to_speech/voxcpm/end2end.py +++ b/examples/offline_inference/text_to_speech/voxcpm/end2end.py @@ -12,6 +12,7 @@ from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm_omni import AsyncOmni, Omni +from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults REPO_ROOT = Path(__file__).resolve().parents[4] DEFAULT_SYNC_STAGE_CONFIG = REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "voxcpm.yaml" @@ -185,6 +186,7 @@ def parse_args(): ) parser.add_argument("--stage-init-timeout", type=int, default=600, help="Stage initialization timeout in seconds.") parser.add_argument("--log-stats", action="store_true", help="Enable vLLM Omni stats logging.") + nullify_stage_engine_defaults(parser) args = parser.parse_args() if (args.ref_audio is None) != (args.ref_text is None): raise ValueError("Voice cloning requires --ref-audio and --ref-text together.") diff --git a/examples/offline_inference/text_to_speech/voxcpm2/end2end.py b/examples/offline_inference/text_to_speech/voxcpm2/end2end.py index a08510faf9d..c9ac766fc3c 100644 --- a/examples/offline_inference/text_to_speech/voxcpm2/end2end.py +++ b/examples/offline_inference/text_to_speech/voxcpm2/end2end.py @@ -15,6 +15,7 @@ from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm_omni import Omni +from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults REPO_ROOT = Path(__file__).resolve().parents[4] DEFAULT_STAGE_CONFIGS_PATH = str(REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "voxcpm2.yaml") @@ -59,6 +60,7 @@ def parse_args(): default=None, help="Optional transcript of --ref-audio (enables continuation mode).", ) + nullify_stage_engine_defaults(parser) return parser.parse_args() diff --git a/examples/offline_inference/text_to_speech/voxtral_tts/end2end.py b/examples/offline_inference/text_to_speech/voxtral_tts/end2end.py index a193d992042..a54277725bc 100644 --- a/examples/offline_inference/text_to_speech/voxtral_tts/end2end.py +++ b/examples/offline_inference/text_to_speech/voxtral_tts/end2end.py @@ -30,6 +30,7 @@ from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm_omni import AsyncOmni +from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults from vllm_omni.entrypoints.omni import Omni logger = logging.getLogger(__name__) @@ -304,6 +305,7 @@ def parse_args() -> Namespace: default=None, help="CFG alpha for flow-matching guidance (default: use value from stage config, typically 1.2).", ) + nullify_stage_engine_defaults(parser) return parser.parse_args() diff --git a/tests/entrypoints/test_omni_entrypoints.py b/tests/entrypoints/test_omni_entrypoints.py index 36a0839e773..251e54ab83d 100644 --- a/tests/entrypoints/test_omni_entrypoints.py +++ b/tests/entrypoints/test_omni_entrypoints.py @@ -167,7 +167,58 @@ def _patch_engine(monkeypatch: pytest.MonkeyPatch, engine: FakeAsyncOmniEngine) monkeypatch.setattr("vllm_omni.entrypoints.omni_base.omni_snapshot_download", lambda model: model) -def test_from_cli_args_only_nulls_untyped_override_fields(monkeypatch: pytest.MonkeyPatch): +def test_direct_omni_with_nullified_parser_only_nulls_untyped_override_fields( + monkeypatch: pytest.MonkeyPatch, +): + from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults + from vllm_omni.entrypoints.omni import Omni + + captured: dict[str, Any] = {} + + def fake_engine(*args: Any, **kwargs: Any) -> FakeAsyncOmniEngine: + captured.update(kwargs) + return FakeAsyncOmniEngine() + + monkeypatch.setattr("vllm_omni.entrypoints.omni_base.AsyncOmniEngine", fake_engine) + monkeypatch.setattr("vllm_omni.entrypoints.omni_base.omni_snapshot_download", lambda model: model) + + parser = argparse.ArgumentParser() + parser.add_argument("--gpu-memory-utilization", type=float, default=0.9) + parser.add_argument("--hsdp-shard-size", type=int, default=-1) + nullify_stage_engine_defaults(parser) + args = parser.parse_args([]) + args.model = "fake-model" + + Omni(**vars(args)) + + assert captured["gpu_memory_utilization"] is None + assert captured["hsdp_shard_size"] == -1 + assert "_cli_explicit_keys" not in captured + + +def test_from_cli_args_warns_and_forwards_without_internal_keys( + monkeypatch: pytest.MonkeyPatch, +): + captured: dict[str, Any] = {} + + def fake_engine(*args: Any, **kwargs: Any) -> FakeAsyncOmniEngine: + captured.update(kwargs) + return FakeAsyncOmniEngine() + + monkeypatch.setattr("vllm_omni.entrypoints.omni_base.AsyncOmniEngine", fake_engine) + monkeypatch.setattr("vllm_omni.entrypoints.omni_base.omni_snapshot_download", lambda model: model) + + args = argparse.Namespace(model="fake-model", gpu_memory_utilization=0.9, _cli_explicit_keys={"model"}) + with pytest.deprecated_call(match="from_cli_args"): + Omni.from_cli_args(args) + + assert captured["gpu_memory_utilization"] == 0.9 + assert "_cli_explicit_keys" not in captured + + +def test_deprecated_from_cli_args_preserves_legacy_parser_nulling( + monkeypatch: pytest.MonkeyPatch, +): from vllm_omni.entrypoints.omni import Omni captured: dict[str, Any] = {} @@ -186,7 +237,8 @@ def fake_engine(*args: Any, **kwargs: Any) -> FakeAsyncOmniEngine: args = parser.parse_args([]) args.model = "fake-model" - Omni.from_cli_args(args, parser=parser) + with pytest.deprecated_call(match="from_cli_args"): + Omni.from_cli_args(args, parser=parser) assert captured["gpu_memory_utilization"] is None assert captured["hsdp_shard_size"] == -1 diff --git a/tests/entrypoints/test_serve.py b/tests/entrypoints/test_serve.py index 5f3415bb393..1267a189a32 100644 --- a/tests/entrypoints/test_serve.py +++ b/tests/entrypoints/test_serve.py @@ -8,14 +8,13 @@ from pytest_mock import MockerFixture from vllm_omni.entrypoints.cli.serve import OmniServeCommand, run_headless -from vllm_omni.entrypoints.utils import detect_explicit_cli_keys pytestmark = [pytest.mark.core_model, pytest.mark.cpu] -def test_serve_parser_accepts_no_async_chunk_and_marks_it_explicit() -> None: - """``--no-async-chunk`` should parse to ``async_chunk=False`` and mark the - shared deploy-level dest as explicitly provided by the user.""" +def test_serve_parser_accepts_no_async_chunk() -> None: + """``--no-async-chunk`` should parse after deploy-overriding parser + defaults are nullified.""" try: from vllm.utils.argparse_utils import FlexibleArgumentParser except Exception as exc: @@ -24,14 +23,12 @@ def test_serve_parser_accepts_no_async_chunk_and_marks_it_explicit() -> None: root = FlexibleArgumentParser() subparsers = root.add_subparsers(dest="subcommand") cmd = OmniServeCommand() - serve_parser = cmd.subparser_init(subparsers) + cmd.subparser_init(subparsers) argv = ["serve", "fake-model", "--omni", "--no-async-chunk"] args = root.parse_args(argv) assert args.async_chunk is False - explicit = detect_explicit_cli_keys(argv, serve_parser) - assert "async_chunk" in explicit def _make_headless_args() -> argparse.Namespace: diff --git a/vllm_omni/entrypoints/omni_base.py b/vllm_omni/entrypoints/omni_base.py index 86674206ee7..531c62d2635 100644 --- a/vllm_omni/entrypoints/omni_base.py +++ b/vllm_omni/entrypoints/omni_base.py @@ -5,6 +5,7 @@ import sys import time import types +import warnings import weakref from collections.abc import Sequence from typing import TYPE_CHECKING, Any, Literal @@ -101,8 +102,21 @@ def from_cli_args( parser: argparse.ArgumentParser | None = None, **overrides: Any, ) -> OmniBase: - """Build from argparse. If ``parser`` is passed and not yet nullified, - un-typed engine fields are reset to ``None``.""" + """Deprecated argparse builder. + + Build from argparse. If ``parser`` is passed and not yet nullified, + un-typed engine fields are reset to ``None``. New callers should + nullify deploy-overriding parser defaults with + ``nullify_stage_engine_defaults(parser)`` and construct Omni/AsyncOmni + directly. + """ + warnings.warn( + "`from_cli_args()` is deprecated. Nullify deploy-overriding parser defaults " + "with `nullify_stage_engine_defaults(parser)` and construct Omni/AsyncOmni " + "directly from `vars(args)`.", + DeprecationWarning, + stacklevel=2, + ) kwargs: dict[str, Any] = {k: v for k, v in vars(args).items() if not k.startswith("_")} if parser is not None and not getattr(parser, "_omni_nullified", False):