diff --git a/tests/entrypoints/test_async_omni_diffusion_config.py b/tests/entrypoints/test_async_omni_diffusion_config.py index 83b465fdb47..235add5725f 100644 --- a/tests/entrypoints/test_async_omni_diffusion_config.py +++ b/tests/entrypoints/test_async_omni_diffusion_config.py @@ -4,6 +4,7 @@ import pytest from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm_omni.config.stage_config import deploy_override_field_names from vllm_omni.engine.async_omni_engine import AsyncOmniEngine from vllm_omni.entrypoints.cli.serve import OmniServeCommand, _create_default_diffusion_stage_cfg @@ -30,6 +31,15 @@ def test_default_stage_config_includes_cache_backend(): assert engine_args["model_stage"] == "diffusion" +def test_default_stage_config_ignores_none_deploy_overrides(): + """Ensure nullified deploy override defaults do not alter diffusion defaults.""" + baseline = AsyncOmniEngine._create_default_diffusion_stage_cfg({})[0] + nullified_overrides = {name: None for name in deploy_override_field_names()} + stage_cfg = AsyncOmniEngine._create_default_diffusion_stage_cfg(nullified_overrides)[0] + + assert stage_cfg == baseline + + def test_default_cache_config_used_when_missing(): """Ensure default cache_config is synthesized when only backend is given.""" stage_cfg = AsyncOmniEngine._create_default_diffusion_stage_cfg( diff --git a/tests/helpers/stage_config.py b/tests/helpers/stage_config.py index 310af917459..23d80c82658 100644 --- a/tests/helpers/stage_config.py +++ b/tests/helpers/stage_config.py @@ -516,6 +516,7 @@ def delete_by_path(config_dict: dict, path: str) -> None: "max_num_seqs": 1, "gpu_memory_utilization": 0.9, "enforce_eager": True, + "enable_prefix_caching": False, "max_num_batched_tokens": 16384, "max_model_len": 16384, "skip_mm_profiling": True, diff --git a/tests/test_arg_utils.py b/tests/test_arg_utils.py index ae640b2d861..2fd5cf302e0 100644 --- a/tests/test_arg_utils.py +++ b/tests/test_arg_utils.py @@ -369,8 +369,8 @@ def _build_full_serve_parser(): def test_nullify_stage_engine_defaults_resets_inherited_defaults(): import argparse + from vllm_omni.config.stage_config import deploy_override_field_names from vllm_omni.engine.arg_utils import ( - deploy_override_field_names, nullify_stage_engine_defaults, ) diff --git a/tests/test_config_factory.py b/tests/test_config_factory.py index 7abe8fc8693..6799fb80acc 100644 --- a/tests/test_config_factory.py +++ b/tests/test_config_factory.py @@ -93,6 +93,7 @@ def test_to_omegaconf_basic(self): assert omega_config.engine_args.worker_type == "ar" assert omega_config.final_output is True assert omega_config.final_output_type == "text" + assert "max_num_seqs" not in omega_config.engine_args # Legacy field name for backward compatibility assert omega_config.engine_input_source == [] @@ -146,6 +147,24 @@ def test_to_omegaconf_max_num_seqs_in_engine_args(self): omega_config = config.to_omegaconf() assert omega_config.engine_args.max_num_seqs == 32 + def test_to_omegaconf_omits_none_deploy_overrides_for_engine_args(self): + """None deploy overrides must fall through to EngineArgs defaults.""" + from vllm_omni.config.stage_config import deploy_override_field_names + + config = StageConfig( + stage_id=0, + model_stage="thinker", + runtime_overrides={name: None for name in deploy_override_field_names()}, + ) + + omega_config = config.to_omegaconf() + engine_args = dict(omega_config.engine_args) + + assert "devices" not in engine_args + assert "max_batch_size" not in engine_args + for name in deploy_override_field_names() - {"devices"}: + assert name not in engine_args + class TestModelPipeline: """Tests for ModelPipeline class.""" @@ -806,21 +825,80 @@ def test_register_and_lookup(self): class TestDeployConfigLoading: - def test_load_deploy_config(self): + def test_deploy_override_fields_include_deploy_schema_fields(self): + from vllm_omni.config.stage_config import deploy_override_field_names + + expected_fields = { + "async_chunk", + "async_scheduling", + "compilation_config", + "config_format", + "data_parallel_size", + "devices", + "disable_hybrid_kv_cache_manager", + "distributed_executor_backend", + "dtype", + "enable_chunked_prefill", + "enable_flashinfer_autotune", + "enable_prefix_caching", + "enforce_eager", + "gpu_memory_utilization", + "load_format", + "max_model_len", + "max_num_batched_tokens", + "max_num_seqs", + "mm_processor_cache_gb", + "pipeline_parallel_size", + "profiler_config", + "quantization", + "skip_mm_profiling", + "subtalker_sampling_params", + "tensor_parallel_size", + "tokenizer_mode", + "trust_remote_code", + } + + actual_fields = deploy_override_field_names() + assert expected_fields == actual_fields, ( + f"added={actual_fields - expected_fields}, removed={expected_fields - actual_fields}" + ) + + def test_load_qwen3_omni_moe_deploy_config(self): from pathlib import Path from vllm_omni.config.stage_config import load_deploy_config deploy_path = Path(__file__).parent.parent / "vllm_omni" / "deploy" / "qwen3_omni_moe.yaml" - if not deploy_path.exists(): - pytest.skip("Deploy config not found") - deploy = load_deploy_config(deploy_path) assert len(deploy.stages) == 3 assert deploy.async_chunk is True assert deploy.connectors is not None assert deploy.platforms is not None + def test_load_voxtral_tts_deploy_config_schema_fields(self): + from pathlib import Path + + from vllm_omni.config.stage_config import load_deploy_config + + deploy_path = Path(__file__).parent.parent / "vllm_omni" / "deploy" / "voxtral_tts.yaml" + deploy = load_deploy_config(deploy_path) + assert deploy.stages[0].config_format == "mistral" + assert deploy.stages[0].load_format == "mistral" + assert deploy.stages[0].tokenizer_mode == "mistral" + assert not any( + name in deploy.stages[0].engine_extras for name in ("config_format", "load_format", "tokenizer_mode") + ) + + def test_load_ming_flash_omni_deploy_config_schema_fields(self): + from pathlib import Path + + from vllm_omni.config.stage_config import load_deploy_config + + deploy_path = Path(__file__).parent.parent / "vllm_omni" / "deploy" / "ming_flash_omni.yaml" + deploy = load_deploy_config(deploy_path) + assert deploy.stages[0].compilation_config == {"pass_config": {"fuse_allreduce_rms": False}} + assert "compilation_config" not in deploy.stages[0].engine_extras + def test_merge_pipeline_deploy(self): from pathlib import Path @@ -1171,7 +1249,8 @@ def test_ci_inherits_from_main(self): deploy = load_deploy_config(ci_path) assert len(deploy.stages) == 3 # CI overrides - assert deploy.stages[0].engine_extras.get("load_format") == "dummy" + assert deploy.stages[0].load_format == "dummy" + assert "load_format" not in deploy.stages[0].engine_extras assert deploy.stages[0].max_num_seqs == 5 # Inherited from base assert deploy.stages[0].gpu_memory_utilization == 0.9 @@ -1376,7 +1455,7 @@ def test_typed_kwarg_overrides_yaml(self): def test_none_value_skipped_yaml_wins(self): stages = self._stages({"max_num_seqs": None}) assert stages[2].runtime_overrides.get("max_num_seqs") is None - assert stages[2].yaml_engine_args.get("max_num_seqs") == 1 + assert "max_num_seqs" not in stages[2].yaml_engine_args def test_empty_kwargs_yaml_only(self): stages = self._stages({}) diff --git a/vllm_omni/config/stage_config.py b/vllm_omni/config/stage_config.py index d4e33667723..ad2639ab33b 100644 --- a/vllm_omni/config/stage_config.py +++ b/vllm_omni/config/stage_config.py @@ -399,19 +399,41 @@ class StageDeployConfig: the top level of ``DeployConfig`` and propagated to every stage. """ + # === Omni fields === + # Stage identity and Omni runtime placement. stage_id: int - max_num_seqs: int = 64 - gpu_memory_utilization: float = 0.9 - tensor_parallel_size: int = 1 - enforce_eager: bool = False - max_num_batched_tokens: int = 32768 - max_model_len: int | None = None - async_scheduling: bool | None = None - devices: str = "0" + devices: str | None = None + + # Inter-stage connector wiring and request defaults. output_connectors: dict[str, str] | None = None input_connectors: dict[str, str] | None = None default_sampling_params: dict[str, Any] | None = None subtalker_sampling_params: dict[str, Any] | None = None + + # === vLLM EngineArgs fields === + # Parallelism and scheduler/memory capacity. + tensor_parallel_size: int | None = None + gpu_memory_utilization: float | None = None + max_num_seqs: int | None = None + max_num_batched_tokens: int | None = None + max_model_len: int | None = None + + # Execution, scheduling, and KV/cache behavior. + enforce_eager: bool | None = None + async_scheduling: bool | None = None + disable_hybrid_kv_cache_manager: bool | None = None + mm_processor_cache_gb: float | None = None + + # Compilation, profiling, tokenizer/config parsing, and model loading. + compilation_config: dict[str, Any] | None = None + profiler_config: dict[str, Any] | None = None + skip_mm_profiling: bool | None = None + enable_flashinfer_autotune: bool | None = None + config_format: str | None = None + load_format: str | None = None + tokenizer_mode: str | None = None + + # Pass-through vLLM EngineArgs fields that are not represented above. engine_extras: dict[str, Any] = field(default_factory=dict) @@ -436,14 +458,14 @@ class DeployConfig: pipeline: str | None = None # === Pipeline-wide engine settings (applied uniformly to every stage) === - trust_remote_code: bool = True + trust_remote_code: bool | None = None distributed_executor_backend: str | None = None dtype: str | None = None quantization: str | None = None - enable_prefix_caching: bool = False + enable_prefix_caching: bool | None = None enable_chunked_prefill: bool | None = None - data_parallel_size: int = 1 - pipeline_parallel_size: int = 1 + data_parallel_size: int | None = None + pipeline_parallel_size: int | None = None _STAGE_NON_ENGINE_KEYS = frozenset( @@ -465,10 +487,10 @@ def _parse_stage_deploy(stage_data: dict[str, Any]) -> StageDeployConfig: """Parse a single stage entry from deploy YAML into StageDeployConfig.""" if "engine_args" in stage_data: engine_args = dict(stage_data["engine_args"]) - devices = stage_data.get("runtime", {}).get("devices", stage_data.get("devices", "0")) + devices = stage_data.get("runtime", {}).get("devices", stage_data.get("devices")) else: engine_args = {k: v for k, v in stage_data.items() if k not in _STAGE_NON_ENGINE_KEYS and k != "stage_id"} - devices = stage_data.get("devices", "0") + devices = stage_data.get("devices") kwargs: dict[str, Any] = {"stage_id": stage_data["stage_id"], "devices": devices} for name, f in _STAGE_DEPLOY_FIELDS.items(): @@ -687,6 +709,15 @@ def _select_processor_funcs( ) +def deploy_override_field_names() -> frozenset[str]: + """Return deploy-schema fields whose CLI defaults must not override YAML.""" + return ( + frozenset(_STAGE_DEPLOY_FIELDS) + | frozenset(_PIPELINE_WIDE_ENGINE_FIELDS) + | frozenset({"async_chunk", "devices"}) + ) + + def _build_engine_args( ps: StagePipelineConfig, ds: StageDeployConfig | None, @@ -802,7 +833,7 @@ def merge_pipeline_deploy( engine_args["async_scheduling"] = sched_cls is OmniARAsyncScheduler extras = _build_extras(ps, ds) runtime: dict[str, Any] = {"process": True} - if ds is not None: + if ds is not None and ds.devices is not None: runtime["devices"] = ds.devices result.append( @@ -865,13 +896,13 @@ def to_omegaconf(self) -> Any: # CLI overrides take precedence over YAML defaults for key, value in self.runtime_overrides.items(): - if key not in ("devices", "max_batch_size"): + if value is not None and key not in ("devices", "max_batch_size"): engine_args[key] = value # Build runtime config from YAML defaults + CLI overrides runtime: dict[str, Any] = dict(self.yaml_runtime) runtime.setdefault("process", True) - if "devices" in self.runtime_overrides: + if self.runtime_overrides.get("devices") is not None: runtime["devices"] = self.runtime_overrides["devices"] # Legacy compat: migrate runtime.max_batch_size → engine_args.max_num_seqs @@ -887,8 +918,6 @@ def to_omegaconf(self) -> Any: effective_mbs = int(cli_mbs or legacy_mbs or 1) engine_args.setdefault("max_num_seqs", effective_mbs) - engine_args.setdefault("max_num_seqs", 1) - # Build full config dict config_dict: dict[str, Any] = { "stage_id": self.stage_id, diff --git a/vllm_omni/deploy/bagel.yaml b/vllm_omni/deploy/bagel.yaml index 9d2f1f8fffa..8de6f9305ba 100644 --- a/vllm_omni/deploy/bagel.yaml +++ b/vllm_omni/deploy/bagel.yaml @@ -10,8 +10,11 @@ async_chunk: false stages: - stage_id: 0 + max_num_batched_tokens: 32768 max_num_seqs: 3 gpu_memory_utilization: 0.45 + trust_remote_code: true + enable_prefix_caching: false devices: "0" default_sampling_params: temperature: 0.4 @@ -23,8 +26,11 @@ stages: repetition_penalty: 1.05 - stage_id: 1 + max_num_batched_tokens: 32768 max_num_seqs: 1 enforce_eager: true + trust_remote_code: true + enable_prefix_caching: false devices: "0" input_connectors: from_stage_0: shared_memory_connector diff --git a/vllm_omni/deploy/bagel_single_stage.yaml b/vllm_omni/deploy/bagel_single_stage.yaml index 8470124ec78..bcfbad253a5 100644 --- a/vllm_omni/deploy/bagel_single_stage.yaml +++ b/vllm_omni/deploy/bagel_single_stage.yaml @@ -16,7 +16,11 @@ async_chunk: false stages: - stage_id: 0 + max_num_batched_tokens: 32768 max_num_seqs: 1 + enforce_eager: true + trust_remote_code: true + enable_prefix_caching: false devices: "0" default_sampling_params: seed: 52 diff --git a/vllm_omni/deploy/cosyvoice3.yaml b/vllm_omni/deploy/cosyvoice3.yaml index 53e3eb3f301..4bfd4ab859d 100644 --- a/vllm_omni/deploy/cosyvoice3.yaml +++ b/vllm_omni/deploy/cosyvoice3.yaml @@ -27,9 +27,12 @@ connectors: stages: - stage_id: 0 + max_num_batched_tokens: 32768 max_num_seqs: 1 gpu_memory_utilization: 0.4 enforce_eager: true + trust_remote_code: true + enable_prefix_caching: false devices: "0" output_connectors: to_stage_1: connector_of_shared_memory @@ -45,9 +48,12 @@ stages: skip_mm_profiling: true - stage_id: 1 + max_num_batched_tokens: 32768 max_num_seqs: 1 gpu_memory_utilization: 0.2 enforce_eager: true + trust_remote_code: true + enable_prefix_caching: false max_model_len: 32768 devices: "0" input_connectors: diff --git a/vllm_omni/deploy/fish_qwen3_omni.yaml b/vllm_omni/deploy/fish_qwen3_omni.yaml index a5bee925b68..5b0c44988a0 100644 --- a/vllm_omni/deploy/fish_qwen3_omni.yaml +++ b/vllm_omni/deploy/fish_qwen3_omni.yaml @@ -24,6 +24,8 @@ stages: max_num_seqs: 4 gpu_memory_utilization: 0.6 enforce_eager: false + trust_remote_code: true + enable_prefix_caching: false async_scheduling: false # vLLM >=0.19 requires max_num_batched_tokens >= max_model_len when # enable_chunked_prefill=false. Bumped from legacy 3072 to match @@ -46,6 +48,8 @@ stages: max_num_seqs: 1 gpu_memory_utilization: 0.1 enforce_eager: true + trust_remote_code: true + enable_prefix_caching: false async_scheduling: false max_num_batched_tokens: 16384 max_model_len: 16384 diff --git a/vllm_omni/deploy/glm_image.yaml b/vllm_omni/deploy/glm_image.yaml index 28b88fb429a..099df1b1508 100644 --- a/vllm_omni/deploy/glm_image.yaml +++ b/vllm_omni/deploy/glm_image.yaml @@ -18,6 +18,8 @@ stages: max_num_seqs: 1 gpu_memory_utilization: 0.6 enforce_eager: false + trust_remote_code: true + enable_prefix_caching: false max_num_batched_tokens: 32768 devices: "0" default_sampling_params: @@ -32,8 +34,11 @@ stages: # Stage 1: Diffusion (DiT + VAE) # Receives prior_token_ids from AR, performs denoising + VAE decode. - stage_id: 1 + max_num_batched_tokens: 32768 max_num_seqs: 1 enforce_eager: true + trust_remote_code: true + enable_prefix_caching: false devices: "1" default_sampling_params: seed: 42 diff --git a/vllm_omni/deploy/mimo_audio.yaml b/vllm_omni/deploy/mimo_audio.yaml index f5e704f9bd4..a92e905f70e 100644 --- a/vllm_omni/deploy/mimo_audio.yaml +++ b/vllm_omni/deploy/mimo_audio.yaml @@ -25,6 +25,8 @@ stages: max_num_seqs: 1 gpu_memory_utilization: 0.3 enforce_eager: true + trust_remote_code: true + enable_prefix_caching: false max_num_batched_tokens: 8192 max_model_len: 8192 devices: "0" @@ -42,6 +44,8 @@ stages: max_num_seqs: 1 gpu_memory_utilization: 0.2 enforce_eager: true + trust_remote_code: true + enable_prefix_caching: false async_scheduling: false max_num_batched_tokens: 8192 max_model_len: 8192 diff --git a/vllm_omni/deploy/moss_tts_nano.yaml b/vllm_omni/deploy/moss_tts_nano.yaml index 585e244ca4a..2c8fc54c057 100644 --- a/vllm_omni/deploy/moss_tts_nano.yaml +++ b/vllm_omni/deploy/moss_tts_nano.yaml @@ -19,6 +19,7 @@ stages: max_num_seqs: 4 gpu_memory_utilization: 0.3 enforce_eager: true + enable_prefix_caching: false max_num_batched_tokens: 4096 max_model_len: 4096 devices: "0" diff --git a/vllm_omni/deploy/qwen2_5_omni.yaml b/vllm_omni/deploy/qwen2_5_omni.yaml index 7ab87e59052..487ceefdddb 100644 --- a/vllm_omni/deploy/qwen2_5_omni.yaml +++ b/vllm_omni/deploy/qwen2_5_omni.yaml @@ -19,9 +19,12 @@ async_chunk: false stages: - stage_id: 0 + max_num_batched_tokens: 32768 max_num_seqs: 1 gpu_memory_utilization: 0.8 enforce_eager: true + trust_remote_code: true + enable_prefix_caching: false mm_processor_cache_gb: 0 devices: "0" default_sampling_params: @@ -33,9 +36,12 @@ stages: repetition_penalty: 1.1 - stage_id: 1 + max_num_batched_tokens: 32768 max_num_seqs: 1 gpu_memory_utilization: 0.8 enforce_eager: true + trust_remote_code: true + enable_prefix_caching: false devices: "1" default_sampling_params: temperature: 0.9 @@ -46,9 +52,12 @@ stages: repetition_penalty: 1.05 - stage_id: 2 + max_num_batched_tokens: 32768 max_num_seqs: 1 gpu_memory_utilization: 0.15 enforce_eager: true + trust_remote_code: true + enable_prefix_caching: false enable_flashinfer_autotune: false async_scheduling: false devices: "0" diff --git a/vllm_omni/deploy/qwen3_omni_moe.yaml b/vllm_omni/deploy/qwen3_omni_moe.yaml index bbc8e11400a..445437c0fa5 100644 --- a/vllm_omni/deploy/qwen3_omni_moe.yaml +++ b/vllm_omni/deploy/qwen3_omni_moe.yaml @@ -22,7 +22,11 @@ connectors: stages: - stage_id: 0 + max_num_batched_tokens: 32768 + max_num_seqs: 64 gpu_memory_utilization: 0.9 + trust_remote_code: true + enable_prefix_caching: false devices: "0" default_sampling_params: temperature: 0.4 @@ -33,7 +37,11 @@ stages: repetition_penalty: 1.05 - stage_id: 1 + max_num_batched_tokens: 32768 + max_num_seqs: 64 gpu_memory_utilization: 0.6 + trust_remote_code: true + enable_prefix_caching: false devices: "1" input_connectors: from_stage_0: connector_of_shared_memory @@ -45,10 +53,13 @@ stages: repetition_penalty: 1.05 - stage_id: 2 + max_num_batched_tokens: 51200 + max_num_seqs: 64 gpu_memory_utilization: 0.1 enforce_eager: true + trust_remote_code: true + enable_prefix_caching: false async_scheduling: false - max_num_batched_tokens: 51200 devices: "1" input_connectors: from_stage_1: connector_of_shared_memory diff --git a/vllm_omni/deploy/qwen3_tts.yaml b/vllm_omni/deploy/qwen3_tts.yaml index 51839cab1be..4bf13540314 100644 --- a/vllm_omni/deploy/qwen3_tts.yaml +++ b/vllm_omni/deploy/qwen3_tts.yaml @@ -31,6 +31,8 @@ stages: - stage_id: 0 max_num_seqs: 10 gpu_memory_utilization: 0.3 + trust_remote_code: true + enable_prefix_caching: false async_scheduling: true max_num_batched_tokens: 512 max_model_len: 4096 @@ -53,6 +55,8 @@ stages: max_num_seqs: 1 gpu_memory_utilization: 0.3 enforce_eager: true + trust_remote_code: true + enable_prefix_caching: false async_scheduling: true # Must be divisible by num_code_groups and cover (left_context + chunk). # Prefill length is Q * num_frames (e.g. 16 * 2148 = 34368); keep diff --git a/vllm_omni/deploy/voxcpm2.yaml b/vllm_omni/deploy/voxcpm2.yaml index b49906710df..71ef148242a 100644 --- a/vllm_omni/deploy/voxcpm2.yaml +++ b/vllm_omni/deploy/voxcpm2.yaml @@ -16,6 +16,8 @@ stages: max_num_seqs: 4 gpu_memory_utilization: 0.9 enforce_eager: true + trust_remote_code: true + enable_prefix_caching: false async_scheduling: true max_num_batched_tokens: 4096 max_model_len: 4096 diff --git a/vllm_omni/deploy/voxtral_tts.yaml b/vllm_omni/deploy/voxtral_tts.yaml index 87d999c67e0..929daddb13f 100644 --- a/vllm_omni/deploy/voxtral_tts.yaml +++ b/vllm_omni/deploy/voxtral_tts.yaml @@ -21,9 +21,12 @@ connectors: stages: - stage_id: 0 + max_num_batched_tokens: 32768 max_num_seqs: 32 gpu_memory_utilization: 0.8 enforce_eager: false + trust_remote_code: true + enable_prefix_caching: false async_scheduling: true max_model_len: 4096 devices: "0" @@ -48,6 +51,8 @@ stages: max_num_seqs: 32 gpu_memory_utilization: 0.1 enforce_eager: true + trust_remote_code: true + enable_prefix_caching: false async_scheduling: false max_num_batched_tokens: 65536 max_model_len: 65536 diff --git a/vllm_omni/engine/arg_utils.py b/vllm_omni/engine/arg_utils.py index 3f16c329e27..6c10c750053 100644 --- a/vllm_omni/engine/arg_utils.py +++ b/vllm_omni/engine/arg_utils.py @@ -456,50 +456,12 @@ class OrchestratorArgs: } ) -_DEPLOY_ENGINE_ARG_OVERRIDE_FIELDS: frozenset[str] = frozenset( - { - # Capacity / scheduling. - "async_scheduling", - "max_model_len", - "max_num_batched_tokens", - "max_num_seqs", - # Memory / parallelism. - "data_parallel_size", - "gpu_memory_utilization", - "pipeline_parallel_size", - "tensor_parallel_size", - # Execution / loading. - "enforce_eager", - "distributed_executor_backend", - "dtype", - "quantization", - "trust_remote_code", - # Caching / chunking. - "async_chunk", - "enable_prefix_caching", - "enable_chunked_prefill", - # Model-specific engine extras. - "subtalker_sampling_params", - } -) - -_DEPLOY_RUNTIME_OVERRIDE_FIELDS: frozenset[str] = frozenset( - { - "devices", - } -) - def orchestrator_field_names() -> frozenset[str]: """Return the names of every field on OrchestratorArgs.""" return frozenset(f.name for f in fields(OrchestratorArgs)) -def deploy_override_field_names() -> frozenset[str]: - """Return kwargs whose parser defaults must not override deploy YAML.""" - return _DEPLOY_ENGINE_ARG_OVERRIDE_FIELDS | _DEPLOY_RUNTIME_OVERRIDE_FIELDS - - def internal_blacklist_keys() -> frozenset[str]: """Return the set of CLI keys that must never be forwarded as per-stage engine overrides. @@ -653,6 +615,8 @@ def nullify_stage_engine_defaults(parser: argparse.ArgumentParser) -> None: """Reset stage-level engine flag defaults to ``None``; preserve real default in help text. Only deploy-YAML override fields are touched. Idempotent.""" + from vllm_omni.config.stage_config import deploy_override_field_names + override_dests = deploy_override_field_names() for action in parser._actions: diff --git a/vllm_omni/entrypoints/omni_base.py b/vllm_omni/entrypoints/omni_base.py index f1d1e90a897..86674206ee7 100644 --- a/vllm_omni/entrypoints/omni_base.py +++ b/vllm_omni/entrypoints/omni_base.py @@ -106,9 +106,7 @@ def from_cli_args( kwargs: dict[str, Any] = {k: v for k, v in vars(args).items() if not k.startswith("_")} if parser is not None and not getattr(parser, "_omni_nullified", False): - from vllm_omni.engine.arg_utils import ( - deploy_override_field_names, - ) + from vllm_omni.config.stage_config import deploy_override_field_names from vllm_omni.entrypoints.utils import detect_explicit_cli_keys explicit = detect_explicit_cli_keys(sys.argv[1:], parser) or set()