vllm-project · hsliuustc0106 · May 6, 2026 · May 1, 2026 · May 2, 2026 · May 2, 2026
@@ -4,6 +4,7 @@
 import pytest
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 
+from vllm_omni.config.stage_config import deploy_override_field_names
 from vllm_omni.engine.async_omni_engine import AsyncOmniEngine
 from vllm_omni.entrypoints.cli.serve import OmniServeCommand, _create_default_diffusion_stage_cfg
 
@@ -30,6 +31,15 @@ def test_default_stage_config_includes_cache_backend():
     assert engine_args["model_stage"] == "diffusion"
 
 
+def test_default_stage_config_ignores_none_deploy_overrides():
+    """All-None deploy overrides must give the same default diffusion stage config as no kwargs."""
+    build = AsyncOmniEngine._create_default_diffusion_stage_cfg
+    all_none = dict.fromkeys(deploy_override_field_names())  # every override name mapped to None
+    expected = build({})[0]
+
+    assert build(all_none)[0] == expected
+
+
 def test_default_cache_config_used_when_missing():
     """Ensure default cache_config is synthesized when only backend is given."""
     stage_cfg = AsyncOmniEngine._create_default_diffusion_stage_cfg(

@@ -516,6 +516,7 @@ def delete_by_path(config_dict: dict, path: str) -> None:
                 "max_num_seqs": 1,
                 "gpu_memory_utilization": 0.9,
                 "enforce_eager": True,
+                "enable_prefix_caching": False,
                 "max_num_batched_tokens": 16384,
                 "max_model_len": 16384,
                 "skip_mm_profiling": True,

@@ -369,8 +369,8 @@ def _build_full_serve_parser():
 def test_nullify_stage_engine_defaults_resets_inherited_defaults():
     import argparse
 
+    from vllm_omni.config.stage_config import deploy_override_field_names
     from vllm_omni.engine.arg_utils import (
-        deploy_override_field_names,
         nullify_stage_engine_defaults,
     )
 

@@ -93,6 +93,7 @@ def test_to_omegaconf_basic(self):
         assert omega_config.engine_args.worker_type == "ar"
         assert omega_config.final_output is True
         assert omega_config.final_output_type == "text"
+        assert "max_num_seqs" not in omega_config.engine_args
         # Legacy field name for backward compatibility
         assert omega_config.engine_input_source == []
 
@@ -146,6 +147,24 @@ def test_to_omegaconf_max_num_seqs_in_engine_args(self):
         omega_config = config.to_omegaconf()
         assert omega_config.engine_args.max_num_seqs == 32
 
+    def test_to_omegaconf_omits_none_deploy_overrides_for_engine_args(self):
+        """None-valued runtime overrides must not surface as keys in the emitted engine_args."""
+        from vllm_omni.config.stage_config import deploy_override_field_names
+
+        override_names = deploy_override_field_names()
+        stage = StageConfig(
+            stage_id=0,
+            model_stage="thinker",
+            runtime_overrides=dict.fromkeys(override_names),
+        )
+
+        emitted = dict(stage.to_omegaconf().engine_args)
+
+        # "devices" and "max_batch_size" are checked on their own, then every remaining override name.
+        assert "devices" not in emitted
+        assert "max_batch_size" not in emitted
+        assert not [name for name in override_names - {"devices"} if name in emitted]
+
 
 class TestModelPipeline:
     """Tests for ModelPipeline class."""
@@ -806,21 +825,80 @@ def test_register_and_lookup(self):
 
 
 class TestDeployConfigLoading:
-    def test_load_deploy_config(self):
+    def test_deploy_override_fields_include_deploy_schema_fields(self):
+        from vllm_omni.config.stage_config import deploy_override_field_names
+
+        expected_fields = {  # full set of names deploy_override_field_names() is expected to return
+            "async_chunk",
+            "async_scheduling",
+            "compilation_config",
+            "config_format",
+            "data_parallel_size",
+            "devices",
+            "disable_hybrid_kv_cache_manager",
+            "distributed_executor_backend",
+            "dtype",
+            "enable_chunked_prefill",
+            "enable_flashinfer_autotune",
+            "enable_prefix_caching",
+            "enforce_eager",
+            "gpu_memory_utilization",
+            "load_format",
+            "max_model_len",
+            "max_num_batched_tokens",
+            "max_num_seqs",
+            "mm_processor_cache_gb",
+            "pipeline_parallel_size",
+            "profiler_config",
+            "quantization",
+            "skip_mm_profiling",
+            "subtalker_sampling_params",
+            "tensor_parallel_size",
+            "tokenizer_mode",
+            "trust_remote_code",
+        }
+
+        actual_fields = deploy_override_field_names()
+        assert expected_fields == actual_fields, (  # exact equality; message lists names found on one side only
+            f"added={actual_fields - expected_fields}, removed={expected_fields - actual_fields}"
+        )
+
+    def test_load_qwen3_omni_moe_deploy_config(self):  # reads qwen3_omni_moe.yaml; a missing file is no longer skipped
         from pathlib import Path
 
         from vllm_omni.config.stage_config import load_deploy_config
 
         deploy_path = Path(__file__).parent.parent / "vllm_omni" / "deploy" / "qwen3_omni_moe.yaml"
-        if not deploy_path.exists():
-            pytest.skip("Deploy config not found")
-
         deploy = load_deploy_config(deploy_path)
         assert len(deploy.stages) == 3
         assert deploy.async_chunk is True
         assert deploy.connectors is not None
         assert deploy.platforms is not None
 
+    def test_load_voxtral_tts_deploy_config_schema_fields(self):
+        """stages[0] of voxtral_tts.yaml must carry the three mistral settings as attributes, not extras."""
+        from pathlib import Path
+
+        from vllm_omni.config.stage_config import load_deploy_config
+
+        base_dir = Path(__file__).parent.parent
+        first_stage = load_deploy_config(base_dir / "vllm_omni" / "deploy" / "voxtral_tts.yaml").stages[0]
+        mistral_fields = ("config_format", "load_format", "tokenizer_mode")
+        for name in mistral_fields:
+            assert getattr(first_stage, name) == "mistral"
+        # None of the same names may also appear as a key in engine_extras.
+        assert all(name not in first_stage.engine_extras for name in mistral_fields)
+
+    def test_load_ming_flash_omni_deploy_config_schema_fields(self):
+        from pathlib import Path
+
+        from vllm_omni.config.stage_config import load_deploy_config
+
+        yaml_file = Path(__file__).parent.parent.joinpath("vllm_omni", "deploy", "ming_flash_omni.yaml")
+        first_stage = load_deploy_config(yaml_file).stages[0]
+        assert "compilation_config" not in first_stage.engine_extras  # expected as an attribute instead
+        assert first_stage.compilation_config == {"pass_config": {"fuse_allreduce_rms": False}}
+
     def test_merge_pipeline_deploy(self):
         from pathlib import Path
 
@@ -1171,7 +1249,8 @@ def test_ci_inherits_from_main(self):
         deploy = load_deploy_config(ci_path)
         assert len(deploy.stages) == 3
         # CI overrides
-        assert deploy.stages[0].engine_extras.get("load_format") == "dummy"
+        assert deploy.stages[0].load_format == "dummy"
+        assert "load_format" not in deploy.stages[0].engine_extras
         assert deploy.stages[0].max_num_seqs == 5
         # Inherited from base
         assert deploy.stages[0].gpu_memory_utilization == 0.9
@@ -1376,7 +1455,7 @@ def test_typed_kwarg_overrides_yaml(self):
     def test_none_value_skipped_yaml_wins(self):
         stages = self._stages({"max_num_seqs": None})
         assert stages[2].runtime_overrides.get("max_num_seqs") is None
-        assert stages[2].yaml_engine_args.get("max_num_seqs") == 1
+        assert "max_num_seqs" not in stages[2].yaml_engine_args  # NOTE(review): "yaml_wins" in the name looks stale — confirm
 
     def test_empty_kwargs_yaml_only(self):
         stages = self._stages({})

@@ -399,19 +399,41 @@ class StageDeployConfig:
     the top level of ``DeployConfig`` and propagated to every stage.
     """
 
+    # === Omni fields ===
+    # Stage identity and Omni runtime placement.
     stage_id: int
-    max_num_seqs: int = 64
-    gpu_memory_utilization: float = 0.9
-    tensor_parallel_size: int = 1
-    enforce_eager: bool = False
-    max_num_batched_tokens: int = 32768
-    max_model_len: int | None = None
-    async_scheduling: bool | None = None
-    devices: str = "0"
+    devices: str | None = None
+
+    # Inter-stage connector wiring and request defaults.
     output_connectors: dict[str, str] | None = None
     input_connectors: dict[str, str] | None = None
     default_sampling_params: dict[str, Any] | None = None
     subtalker_sampling_params: dict[str, Any] | None = None
+
+    # === vLLM EngineArgs fields ===
+    # Parallelism and scheduler/memory capacity.
+    tensor_parallel_size: int | None = None
+    gpu_memory_utilization: float | None = None
+    max_num_seqs: int | None = None
+    max_num_batched_tokens: int | None = None
+    max_model_len: int | None = None
+
+    # Execution, scheduling, and KV/cache behavior.
+    enforce_eager: bool | None = None
+    async_scheduling: bool | None = None
+    disable_hybrid_kv_cache_manager: bool | None = None
+    mm_processor_cache_gb: float | None = None
+
+    # Compilation, profiling, tokenizer/config parsing, and model loading.
+    compilation_config: dict[str, Any] | None = None
+    profiler_config: dict[str, Any] | None = None
+    skip_mm_profiling: bool | None = None
+    enable_flashinfer_autotune: bool | None = None
+    config_format: str | None = None
+    load_format: str | None = None
+    tokenizer_mode: str | None = None
+
+    # Pass-through vLLM EngineArgs fields that are not represented above.
     engine_extras: dict[str, Any] = field(default_factory=dict)
 
 
@@ -436,14 +458,14 @@ class DeployConfig:
     pipeline: str | None = None
 
     # === Pipeline-wide engine settings (applied uniformly to every stage) ===
-    trust_remote_code: bool = True
+    trust_remote_code: bool | None = None
     distributed_executor_backend: str | None = None
     dtype: str | None = None
     quantization: str | None = None
-    enable_prefix_caching: bool = False
+    enable_prefix_caching: bool | None = None
     enable_chunked_prefill: bool | None = None
-    data_parallel_size: int = 1
-    pipeline_parallel_size: int = 1
+    data_parallel_size: int | None = None
+    pipeline_parallel_size: int | None = None
 
 
 _STAGE_NON_ENGINE_KEYS = frozenset(
@@ -465,10 +487,10 @@ def _parse_stage_deploy(stage_data: dict[str, Any]) -> StageDeployConfig:
     """Parse a single stage entry from deploy YAML into StageDeployConfig."""
     if "engine_args" in stage_data:
         engine_args = dict(stage_data["engine_args"])
-        devices = stage_data.get("runtime", {}).get("devices", stage_data.get("devices", "0"))
+        devices = stage_data.get("runtime", {}).get("devices", stage_data.get("devices"))
     else:
         engine_args = {k: v for k, v in stage_data.items() if k not in _STAGE_NON_ENGINE_KEYS and k != "stage_id"}
-        devices = stage_data.get("devices", "0")
+        devices = stage_data.get("devices")
 
     kwargs: dict[str, Any] = {"stage_id": stage_data["stage_id"], "devices": devices}
     for name, f in _STAGE_DEPLOY_FIELDS.items():
@@ -687,6 +709,15 @@ def _select_processor_funcs(
 )
 
 
+def deploy_override_field_names() -> frozenset[str]:
+    """Return the deploy-schema field names for which a CLI default must not override YAML."""
+    # Two names are listed here in addition to the stage-level and pipeline-wide field tables.
+    additional = frozenset({"async_chunk", "devices"})
+    stage_level = frozenset(_STAGE_DEPLOY_FIELDS)
+    pipeline_wide = frozenset(_PIPELINE_WIDE_ENGINE_FIELDS)
+    return stage_level | pipeline_wide | additional
+
+
 def _build_engine_args(
     ps: StagePipelineConfig,
     ds: StageDeployConfig | None,
@@ -802,7 +833,7 @@ def merge_pipeline_deploy(
             engine_args["async_scheduling"] = sched_cls is OmniARAsyncScheduler
         extras = _build_extras(ps, ds)
         runtime: dict[str, Any] = {"process": True}
-        if ds is not None:
+        if ds is not None and ds.devices is not None:
             runtime["devices"] = ds.devices
 
         result.append(
@@ -865,13 +896,13 @@ def to_omegaconf(self) -> Any:
 
         # CLI overrides take precedence over YAML defaults
         for key, value in self.runtime_overrides.items():
-            if key not in ("devices", "max_batch_size"):
+            if value is not None and key not in ("devices", "max_batch_size"):
                 engine_args[key] = value
 
         # Build runtime config from YAML defaults + CLI overrides
         runtime: dict[str, Any] = dict(self.yaml_runtime)
         runtime.setdefault("process", True)
-        if "devices" in self.runtime_overrides:
+        if self.runtime_overrides.get("devices") is not None:
             runtime["devices"] = self.runtime_overrides["devices"]
 
         # Legacy compat: migrate runtime.max_batch_size → engine_args.max_num_seqs
@@ -887,8 +918,6 @@ def to_omegaconf(self) -> Any:
             effective_mbs = int(cli_mbs or legacy_mbs or 1)
             engine_args.setdefault("max_num_seqs", effective_mbs)
 
-        engine_args.setdefault("max_num_seqs", 1)
-
         # Build full config dict
         config_dict: dict[str, Any] = {
             "stage_id": self.stage_id,

@@ -10,8 +10,11 @@ async_chunk: false
 
 stages:
   - stage_id: 0
+    max_num_batched_tokens: 32768
     max_num_seqs: 3
     gpu_memory_utilization: 0.45
+    trust_remote_code: true
+    enable_prefix_caching: false
     devices: "0"
     default_sampling_params:
       temperature: 0.4
@@ -23,8 +26,11 @@ stages:
       repetition_penalty: 1.05
 
   - stage_id: 1
+    max_num_batched_tokens: 32768
     max_num_seqs: 1
     enforce_eager: true
+    trust_remote_code: true
+    enable_prefix_caching: false
     devices: "0"
     input_connectors:
       from_stage_0: shared_memory_connector

@@ -16,7 +16,11 @@ async_chunk: false
 
 stages:
   - stage_id: 0
+    max_num_batched_tokens: 32768
     max_num_seqs: 1
+    enforce_eager: true
+    trust_remote_code: true
+    enable_prefix_caching: false
     devices: "0"
     default_sampling_params:
       seed: 52
@@ -27,9 +27,12 @@ connectors:
 
 stages:
   - stage_id: 0
+    max_num_batched_tokens: 32768
     max_num_seqs: 1
     gpu_memory_utilization: 0.4
     enforce_eager: true
+    trust_remote_code: true
+    enable_prefix_caching: false
     devices: "0"
     output_connectors:
       to_stage_1: connector_of_shared_memory
@@ -45,9 +48,12 @@ stages:
     skip_mm_profiling: true
 
   - stage_id: 1
+    max_num_batched_tokens: 32768
     max_num_seqs: 1
     gpu_memory_utilization: 0.2
     enforce_eager: true
+    trust_remote_code: true
+    enable_prefix_caching: false
     max_model_len: 32768
     devices: "0"
     input_connectors:

@@ -24,6 +24,8 @@ stages:
     max_num_seqs: 4
     gpu_memory_utilization: 0.6
     enforce_eager: false
+    trust_remote_code: true
+    enable_prefix_caching: false
     async_scheduling: false
     # vLLM >=0.19 requires max_num_batched_tokens >= max_model_len when
     # enable_chunked_prefill=false. Bumped from legacy 3072 to match
@@ -46,6 +48,8 @@ stages:
     max_num_seqs: 1
     gpu_memory_utilization: 0.1
     enforce_eager: true
+    trust_remote_code: true
+    enable_prefix_caching: false
     async_scheduling: false
     max_num_batched_tokens: 16384
     max_model_len: 16384