From 6478b2c42f4fc44ba4276d527438b33c63892158 Mon Sep 17 00:00:00 2001
From: xiaohajiayou <923390377@qq.com>
Date: Fri, 1 May 2026 17:39:21 +0800
Subject: [PATCH 1/8] Reapply deploy override field derivation

Reapply the deploy override field derivation that was reverted in #3287 and make prefix-cache behavior explicit in deploy configs. This preserves the config refactor while restoring the previous Omni behavior where deploy stages do not accidentally fall through to vLLM's model-dependent prefix-cache default.

Signed-off-by: xiaohajiayou <923390377@qq.com>
---
 tests/e2e/online_serving/test_mimo_audio.py   |  1 -
 .../test_async_omni_diffusion_config.py       | 10 +++
 tests/helpers/stage_config.py                 |  1 +
 tests/test_arg_utils.py                       |  2 +-
 tests/test_config_factory.py                  | 69 ++++++++++++++++++-
 vllm_omni/config/stage_config.py              | 44 ++++++++----
 vllm_omni/deploy/bagel.yaml                   |  2 +
 vllm_omni/deploy/bagel_single_stage.yaml      |  1 +
 vllm_omni/deploy/cosyvoice3.yaml              |  4 +-
 vllm_omni/deploy/fish_qwen3_omni.yaml         |  2 +
 vllm_omni/deploy/glm_image.yaml               |  2 +
 vllm_omni/deploy/mimo_audio.yaml              |  2 +
 vllm_omni/deploy/moss_tts_nano.yaml           |  1 +
 vllm_omni/deploy/qwen2_5_omni.yaml            |  3 +
 vllm_omni/deploy/qwen3_omni_moe.yaml          |  3 +
 vllm_omni/deploy/qwen3_tts.yaml               |  2 +
 vllm_omni/deploy/voxcpm2.yaml                 |  2 +
 vllm_omni/deploy/voxtral_tts.yaml             |  2 +
 vllm_omni/engine/arg_utils.py                 | 40 +----------
 vllm_omni/engine/async_omni_engine.py         |  5 +-
 vllm_omni/entrypoints/omni_base.py            |  4 +-
 21 files changed, 142 insertions(+), 60 deletions(-)

diff --git a/tests/e2e/online_serving/test_mimo_audio.py b/tests/e2e/online_serving/test_mimo_audio.py
index 38ee721f434..df00c64161e 100644
--- a/tests/e2e/online_serving/test_mimo_audio.py
+++ b/tests/e2e/online_serving/test_mimo_audio.py
@@ -83,7 +83,6 @@ def get_max_batch_size(size_type="few"):
 @pytest.mark.omni
 @hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards=1)
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
-@pytest.mark.skip(reason="CI failed 8571")
 def test_audio_to_text_audio_001(omni_server, openai_client) -> None:
     """
     Test audio and text input processing and text/audio output generation via OpenAI API.
diff --git a/tests/entrypoints/test_async_omni_diffusion_config.py b/tests/entrypoints/test_async_omni_diffusion_config.py
index 83b465fdb47..235add5725f 100644
--- a/tests/entrypoints/test_async_omni_diffusion_config.py
+++ b/tests/entrypoints/test_async_omni_diffusion_config.py
@@ -4,6 +4,7 @@
 import pytest
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 
+from vllm_omni.config.stage_config import deploy_override_field_names
 from vllm_omni.engine.async_omni_engine import AsyncOmniEngine
 from vllm_omni.entrypoints.cli.serve import OmniServeCommand, _create_default_diffusion_stage_cfg
 
@@ -30,6 +31,15 @@ def test_default_stage_config_includes_cache_backend():
     assert engine_args["model_stage"] == "diffusion"
 
 
+def test_default_stage_config_ignores_none_deploy_overrides():
+    """Ensure nullified deploy override defaults do not alter diffusion defaults."""
+    baseline = AsyncOmniEngine._create_default_diffusion_stage_cfg({})[0]
+    nullified_overrides = {name: None for name in deploy_override_field_names()}
+    stage_cfg = AsyncOmniEngine._create_default_diffusion_stage_cfg(nullified_overrides)[0]
+
+    assert stage_cfg == baseline
+
+
 def test_default_cache_config_used_when_missing():
     """Ensure default cache_config is synthesized when only backend is given."""
     stage_cfg = AsyncOmniEngine._create_default_diffusion_stage_cfg(
diff --git a/tests/helpers/stage_config.py b/tests/helpers/stage_config.py
index 2bb017b811f..ba7ea0c50c2 100644
--- a/tests/helpers/stage_config.py
+++ b/tests/helpers/stage_config.py
@@ -494,6 +494,7 @@ def delete_by_path(config_dict: dict, path: str) -> None:
                 "max_num_seqs": 1,
                 "gpu_memory_utilization": 0.9,
                 "enforce_eager": True,
+                "enable_prefix_caching": False,
                 "max_num_batched_tokens": 16384,
                 "max_model_len": 16384,
                 "skip_mm_profiling": True,
diff --git a/tests/test_arg_utils.py b/tests/test_arg_utils.py
index ae640b2d861..2fd5cf302e0 100644
--- a/tests/test_arg_utils.py
+++ b/tests/test_arg_utils.py
@@ -369,8 +369,8 @@ def _build_full_serve_parser():
 def test_nullify_stage_engine_defaults_resets_inherited_defaults():
     import argparse
 
+    from vllm_omni.config.stage_config import deploy_override_field_names
     from vllm_omni.engine.arg_utils import (
-        deploy_override_field_names,
         nullify_stage_engine_defaults,
     )
 
diff --git a/tests/test_config_factory.py b/tests/test_config_factory.py
index 16d49034fa1..b56b9240e51 100644
--- a/tests/test_config_factory.py
+++ b/tests/test_config_factory.py
@@ -93,6 +93,7 @@ def test_to_omegaconf_basic(self):
         assert omega_config.engine_args.worker_type == "ar"
         assert omega_config.final_output is True
         assert omega_config.final_output_type == "text"
+        assert "max_num_seqs" not in omega_config.engine_args
         # Legacy field name for backward compatibility
         assert omega_config.engine_input_source == []
 
@@ -146,6 +147,24 @@ def test_to_omegaconf_max_num_seqs_in_engine_args(self):
         omega_config = config.to_omegaconf()
         assert omega_config.engine_args.max_num_seqs == 32
 
+    def test_to_omegaconf_omits_none_deploy_overrides_for_engine_args(self):
+        """None deploy overrides must fall through to EngineArgs defaults."""
+        from vllm_omni.config.stage_config import deploy_override_field_names
+
+        config = StageConfig(
+            stage_id=0,
+            model_stage="thinker",
+            runtime_overrides={name: None for name in deploy_override_field_names()},
+        )
+
+        omega_config = config.to_omegaconf()
+        engine_args = dict(omega_config.engine_args)
+
+        assert "devices" not in engine_args
+        assert "max_batch_size" not in engine_args
+        for name in deploy_override_field_names() - {"devices"}:
+            assert name not in engine_args
+
 
 class TestModelPipeline:
     """Tests for ModelPipeline class."""
@@ -802,6 +821,40 @@ def test_register_and_lookup(self):
 
 
 class TestDeployConfigLoading:
+    def test_deploy_override_fields_include_deploy_schema_fields(self):
+        from vllm_omni.config.stage_config import deploy_override_field_names
+
+        expected_fields = {
+            "async_chunk",
+            "async_scheduling",
+            "config_format",
+            "data_parallel_size",
+            "devices",
+            "disable_hybrid_kv_cache_manager",
+            "distributed_executor_backend",
+            "dtype",
+            "enable_chunked_prefill",
+            "enable_flashinfer_autotune",
+            "enable_prefix_caching",
+            "enforce_eager",
+            "gpu_memory_utilization",
+            "load_format",
+            "max_model_len",
+            "max_num_batched_tokens",
+            "max_num_seqs",
+            "mm_processor_cache_gb",
+            "pipeline_parallel_size",
+            "profiler_config",
+            "quantization",
+            "skip_mm_profiling",
+            "subtalker_sampling_params",
+            "tensor_parallel_size",
+            "tokenizer_mode",
+            "trust_remote_code",
+        }
+
+        assert expected_fields == deploy_override_field_names()
+
     def test_load_deploy_config(self):
         from pathlib import Path
 
@@ -817,6 +870,17 @@ def test_load_deploy_config(self):
         assert deploy.connectors is not None
         assert deploy.platforms is not None
 
+        voxtral_path = Path(__file__).parent.parent / "vllm_omni" / "deploy" / "voxtral_tts.yaml"
+        if voxtral_path.exists():
+            voxtral_deploy = load_deploy_config(voxtral_path)
+            assert voxtral_deploy.stages[0].config_format == "mistral"
+            assert voxtral_deploy.stages[0].load_format == "mistral"
+            assert voxtral_deploy.stages[0].tokenizer_mode == "mistral"
+            assert not any(
+                name in voxtral_deploy.stages[0].engine_extras
+                for name in ("config_format", "load_format", "tokenizer_mode")
+            )
+
     def test_merge_pipeline_deploy(self):
         from pathlib import Path
 
@@ -1011,7 +1075,8 @@ def test_ci_inherits_from_main(self):
         deploy = load_deploy_config(ci_path)
         assert len(deploy.stages) == 3
         # CI overrides
-        assert deploy.stages[0].engine_extras.get("load_format") == "dummy"
+        assert deploy.stages[0].load_format == "dummy"
+        assert "load_format" not in deploy.stages[0].engine_extras
         assert deploy.stages[0].max_num_seqs == 5
         # Inherited from base
         assert deploy.stages[0].gpu_memory_utilization == 0.9
@@ -1216,7 +1281,7 @@ def test_typed_kwarg_overrides_yaml(self):
     def test_none_value_skipped_yaml_wins(self):
         stages = self._stages({"max_num_seqs": None})
         assert stages[2].runtime_overrides.get("max_num_seqs") is None
-        assert stages[2].yaml_engine_args.get("max_num_seqs") == 1
+        assert "max_num_seqs" not in stages[2].yaml_engine_args
 
     def test_empty_kwargs_yaml_only(self):
         stages = self._stages({})
diff --git a/vllm_omni/config/stage_config.py b/vllm_omni/config/stage_config.py
index 44cc83baea8..8a4a8073071 100644
--- a/vllm_omni/config/stage_config.py
+++ b/vllm_omni/config/stage_config.py
@@ -402,11 +402,11 @@ class StageDeployConfig:
     """
 
     stage_id: int
-    max_num_seqs: int = 64
-    gpu_memory_utilization: float = 0.9
-    tensor_parallel_size: int = 1
-    enforce_eager: bool = False
-    max_num_batched_tokens: int = 32768
+    max_num_seqs: int | None = None
+    gpu_memory_utilization: float | None = None
+    tensor_parallel_size: int | None = None
+    enforce_eager: bool | None = None
+    max_num_batched_tokens: int | None = None
     max_model_len: int | None = None
     async_scheduling: bool | None = None
     devices: str = "0"
@@ -414,6 +414,14 @@ class StageDeployConfig:
     input_connectors: dict[str, str] | None = None
     default_sampling_params: dict[str, Any] | None = None
     subtalker_sampling_params: dict[str, Any] | None = None
+    profiler_config: dict[str, Any] | None = None
+    disable_hybrid_kv_cache_manager: bool | None = None
+    mm_processor_cache_gb: float | None = None
+    skip_mm_profiling: bool | None = None
+    enable_flashinfer_autotune: bool | None = None
+    config_format: str | None = None
+    load_format: str | None = None
+    tokenizer_mode: str | None = None
     engine_extras: dict[str, Any] = field(default_factory=dict)
 
 
@@ -438,14 +446,14 @@ class DeployConfig:
     pipeline: str | None = None
 
     # === Pipeline-wide engine settings (applied uniformly to every stage) ===
-    trust_remote_code: bool = True
+    trust_remote_code: bool | None = None
     distributed_executor_backend: str | None = None
     dtype: str | None = None
     quantization: str | None = None
-    enable_prefix_caching: bool = False
+    enable_prefix_caching: bool | None = None
     enable_chunked_prefill: bool | None = None
-    data_parallel_size: int = 1
-    pipeline_parallel_size: int = 1
+    data_parallel_size: int | None = None
+    pipeline_parallel_size: int | None = None
 
 
 _STAGE_NON_ENGINE_KEYS = frozenset(
@@ -689,6 +697,18 @@ def _select_processor_funcs(
 )
 
 
+def deploy_override_field_names() -> frozenset[str]:
+    """Return deploy-schema fields whose CLI defaults must not override YAML."""
+    return (
+        frozenset(_STAGE_DEPLOY_FIELDS)
+        | frozenset(_PIPELINE_WIDE_ENGINE_FIELDS)
+        | {
+            "async_chunk",
+            "devices",
+        }
+    )
+
+
 def _build_engine_args(
     ps: StagePipelineConfig,
     ds: StageDeployConfig | None,
@@ -861,13 +881,15 @@ def to_omegaconf(self) -> Any:
 
         # CLI overrides take precedence over YAML defaults
         for key, value in self.runtime_overrides.items():
+            if value is None:
+                continue
             if key not in ("devices", "max_batch_size"):
                 engine_args[key] = value
 
         # Build runtime config from YAML defaults + CLI overrides
         runtime: dict[str, Any] = dict(self.yaml_runtime)
         runtime.setdefault("process", True)
-        if "devices" in self.runtime_overrides:
+        if self.runtime_overrides.get("devices") is not None:
             runtime["devices"] = self.runtime_overrides["devices"]
 
         # Legacy compat: migrate runtime.max_batch_size → engine_args.max_num_seqs
@@ -883,8 +905,6 @@ def to_omegaconf(self) -> Any:
             effective_mbs = int(cli_mbs or legacy_mbs or 1)
             engine_args.setdefault("max_num_seqs", effective_mbs)
 
-        engine_args.setdefault("max_num_seqs", 1)
-
         # Build full config dict
         config_dict: dict[str, Any] = {
             "stage_id": self.stage_id,
diff --git a/vllm_omni/deploy/bagel.yaml b/vllm_omni/deploy/bagel.yaml
index 9d2f1f8fffa..2b42d4171e1 100644
--- a/vllm_omni/deploy/bagel.yaml
+++ b/vllm_omni/deploy/bagel.yaml
@@ -12,6 +12,7 @@ stages:
   - stage_id: 0
     max_num_seqs: 3
     gpu_memory_utilization: 0.45
+    enable_prefix_caching: false
     devices: "0"
     default_sampling_params:
       temperature: 0.4
@@ -25,6 +26,7 @@ stages:
   - stage_id: 1
     max_num_seqs: 1
     enforce_eager: true
+    enable_prefix_caching: false
     devices: "0"
     input_connectors:
       from_stage_0: shared_memory_connector
diff --git a/vllm_omni/deploy/bagel_single_stage.yaml b/vllm_omni/deploy/bagel_single_stage.yaml
index 8470124ec78..560d4670304 100644
--- a/vllm_omni/deploy/bagel_single_stage.yaml
+++ b/vllm_omni/deploy/bagel_single_stage.yaml
@@ -17,6 +17,7 @@ async_chunk: false
 stages:
   - stage_id: 0
     max_num_seqs: 1
+    enable_prefix_caching: false
     devices: "0"
     default_sampling_params:
       seed: 52
diff --git a/vllm_omni/deploy/cosyvoice3.yaml b/vllm_omni/deploy/cosyvoice3.yaml
index 53e3eb3f301..d708564ddee 100644
--- a/vllm_omni/deploy/cosyvoice3.yaml
+++ b/vllm_omni/deploy/cosyvoice3.yaml
@@ -40,7 +40,7 @@ stages:
       # near-identity repetition penalty forces vLLM to track
       # output_token_ids for RAS (stop-token logit logsumexp).
       repetition_penalty: 1.0001
-    disable_hybrid_kv_cache_manager: true
+    enable_prefix_caching: false
     mm_processor_cache_gb: 0
     skip_mm_profiling: true
 
@@ -54,5 +54,5 @@ stages:
       from_stage_0: connector_of_shared_memory
     default_sampling_params:
       max_tokens: 2048
-    disable_hybrid_kv_cache_manager: true
+    enable_prefix_caching: false
     skip_mm_profiling: true
diff --git a/vllm_omni/deploy/fish_qwen3_omni.yaml b/vllm_omni/deploy/fish_qwen3_omni.yaml
index a5bee925b68..d993a45b2f9 100644
--- a/vllm_omni/deploy/fish_qwen3_omni.yaml
+++ b/vllm_omni/deploy/fish_qwen3_omni.yaml
@@ -24,6 +24,7 @@ stages:
     max_num_seqs: 4
     gpu_memory_utilization: 0.6
     enforce_eager: false
+    enable_prefix_caching: false
     async_scheduling: false
     # vLLM >=0.19 requires max_num_batched_tokens >= max_model_len when
     # enable_chunked_prefill=false. Bumped from legacy 3072 to match
@@ -46,6 +47,7 @@ stages:
     max_num_seqs: 1
     gpu_memory_utilization: 0.1
     enforce_eager: true
+    enable_prefix_caching: false
     async_scheduling: false
     max_num_batched_tokens: 16384
     max_model_len: 16384
diff --git a/vllm_omni/deploy/glm_image.yaml b/vllm_omni/deploy/glm_image.yaml
index 28b88fb429a..ee5173ab78a 100644
--- a/vllm_omni/deploy/glm_image.yaml
+++ b/vllm_omni/deploy/glm_image.yaml
@@ -18,6 +18,7 @@ stages:
     max_num_seqs: 1
     gpu_memory_utilization: 0.6
     enforce_eager: false
+    enable_prefix_caching: false
     max_num_batched_tokens: 32768
     devices: "0"
     default_sampling_params:
@@ -34,6 +35,7 @@ stages:
   - stage_id: 1
     max_num_seqs: 1
     enforce_eager: true
+    enable_prefix_caching: false
     devices: "1"
     default_sampling_params:
       seed: 42
diff --git a/vllm_omni/deploy/mimo_audio.yaml b/vllm_omni/deploy/mimo_audio.yaml
index f5e704f9bd4..d4bb8dd9e35 100644
--- a/vllm_omni/deploy/mimo_audio.yaml
+++ b/vllm_omni/deploy/mimo_audio.yaml
@@ -25,6 +25,7 @@ stages:
     max_num_seqs: 1
     gpu_memory_utilization: 0.3
     enforce_eager: true
+    enable_prefix_caching: false
     max_num_batched_tokens: 8192
     max_model_len: 8192
     devices: "0"
@@ -42,6 +43,7 @@ stages:
     max_num_seqs: 1
     gpu_memory_utilization: 0.2
     enforce_eager: true
+    enable_prefix_caching: false
     async_scheduling: false
     max_num_batched_tokens: 8192
     max_model_len: 8192
diff --git a/vllm_omni/deploy/moss_tts_nano.yaml b/vllm_omni/deploy/moss_tts_nano.yaml
index 585e244ca4a..2c8fc54c057 100644
--- a/vllm_omni/deploy/moss_tts_nano.yaml
+++ b/vllm_omni/deploy/moss_tts_nano.yaml
@@ -19,6 +19,7 @@ stages:
     max_num_seqs: 4
     gpu_memory_utilization: 0.3
     enforce_eager: true
+    enable_prefix_caching: false
     max_num_batched_tokens: 4096
     max_model_len: 4096
     devices: "0"
diff --git a/vllm_omni/deploy/qwen2_5_omni.yaml b/vllm_omni/deploy/qwen2_5_omni.yaml
index 41aef0df6f6..f9183080a2c 100644
--- a/vllm_omni/deploy/qwen2_5_omni.yaml
+++ b/vllm_omni/deploy/qwen2_5_omni.yaml
@@ -22,6 +22,7 @@ stages:
     max_num_seqs: 1
     gpu_memory_utilization: 0.8
     enforce_eager: true
+    enable_prefix_caching: false
     mm_processor_cache_gb: 0
     devices: "0"
     default_sampling_params:
@@ -36,6 +37,7 @@ stages:
     max_num_seqs: 1
     gpu_memory_utilization: 0.8
     enforce_eager: true
+    enable_prefix_caching: false
     devices: "1"
     default_sampling_params:
       temperature: 0.9
@@ -49,6 +51,7 @@ stages:
     max_num_seqs: 1
     gpu_memory_utilization: 0.15
     enforce_eager: true
+    enable_prefix_caching: false
     enable_flashinfer_autotune: false
     async_scheduling: false
     devices: "0"
diff --git a/vllm_omni/deploy/qwen3_omni_moe.yaml b/vllm_omni/deploy/qwen3_omni_moe.yaml
index 39baed6bd7b..270d81cb73c 100644
--- a/vllm_omni/deploy/qwen3_omni_moe.yaml
+++ b/vllm_omni/deploy/qwen3_omni_moe.yaml
@@ -23,6 +23,7 @@ connectors:
 stages:
   - stage_id: 0
     gpu_memory_utilization: 0.9
+    enable_prefix_caching: false
     devices: "0"
     default_sampling_params:
       temperature: 0.4
@@ -34,6 +35,7 @@ stages:
 
   - stage_id: 1
     gpu_memory_utilization: 0.6
+    enable_prefix_caching: false
     devices: "1"
     input_connectors:
       from_stage_0: connector_of_shared_memory
@@ -47,6 +49,7 @@ stages:
   - stage_id: 2
     gpu_memory_utilization: 0.1
     enforce_eager: true
+    enable_prefix_caching: false
     async_scheduling: false
     max_num_batched_tokens: 51200
     devices: "1"
diff --git a/vllm_omni/deploy/qwen3_tts.yaml b/vllm_omni/deploy/qwen3_tts.yaml
index 522ea7c58c8..bb57c9eae66 100644
--- a/vllm_omni/deploy/qwen3_tts.yaml
+++ b/vllm_omni/deploy/qwen3_tts.yaml
@@ -31,6 +31,7 @@ stages:
   - stage_id: 0
     max_num_seqs: 10
     gpu_memory_utilization: 0.3
+    enable_prefix_caching: false
     async_scheduling: true
     max_num_batched_tokens: 512
     max_model_len: 4096
@@ -53,6 +54,7 @@ stages:
     max_num_seqs: 1
     gpu_memory_utilization: 0.3
     enforce_eager: true
+    enable_prefix_caching: false
     async_scheduling: true
     # Must be divisible by num_code_groups and cover (left_context + chunk).
     # Prefill length is Q * num_frames (e.g. 16 * 2148 = 34368); keep
diff --git a/vllm_omni/deploy/voxcpm2.yaml b/vllm_omni/deploy/voxcpm2.yaml
index b49906710df..cf4356cfb1a 100644
--- a/vllm_omni/deploy/voxcpm2.yaml
+++ b/vllm_omni/deploy/voxcpm2.yaml
@@ -16,10 +16,12 @@ stages:
     max_num_seqs: 4
     gpu_memory_utilization: 0.9
     enforce_eager: true
+    enable_prefix_caching: false
     async_scheduling: true
     max_num_batched_tokens: 4096
     max_model_len: 4096
     devices: "0"
+    trust_remote_code: true
     default_sampling_params:
       temperature: 0.0
       top_p: 1.0
diff --git a/vllm_omni/deploy/voxtral_tts.yaml b/vllm_omni/deploy/voxtral_tts.yaml
index 87d999c67e0..09524febe54 100644
--- a/vllm_omni/deploy/voxtral_tts.yaml
+++ b/vllm_omni/deploy/voxtral_tts.yaml
@@ -24,6 +24,7 @@ stages:
     max_num_seqs: 32
     gpu_memory_utilization: 0.8
     enforce_eager: false
+    enable_prefix_caching: false
     async_scheduling: true
     max_model_len: 4096
     devices: "0"
@@ -48,6 +49,7 @@ stages:
     max_num_seqs: 32
     gpu_memory_utilization: 0.1
     enforce_eager: true
+    enable_prefix_caching: false
     async_scheduling: false
     max_num_batched_tokens: 65536
     max_model_len: 65536
diff --git a/vllm_omni/engine/arg_utils.py b/vllm_omni/engine/arg_utils.py
index 3f16c329e27..6c10c750053 100644
--- a/vllm_omni/engine/arg_utils.py
+++ b/vllm_omni/engine/arg_utils.py
@@ -456,50 +456,12 @@ class OrchestratorArgs:
     }
 )
 
-_DEPLOY_ENGINE_ARG_OVERRIDE_FIELDS: frozenset[str] = frozenset(
-    {
-        # Capacity / scheduling.
-        "async_scheduling",
-        "max_model_len",
-        "max_num_batched_tokens",
-        "max_num_seqs",
-        # Memory / parallelism.
-        "data_parallel_size",
-        "gpu_memory_utilization",
-        "pipeline_parallel_size",
-        "tensor_parallel_size",
-        # Execution / loading.
-        "enforce_eager",
-        "distributed_executor_backend",
-        "dtype",
-        "quantization",
-        "trust_remote_code",
-        # Caching / chunking.
-        "async_chunk",
-        "enable_prefix_caching",
-        "enable_chunked_prefill",
-        # Model-specific engine extras.
-        "subtalker_sampling_params",
-    }
-)
-
-_DEPLOY_RUNTIME_OVERRIDE_FIELDS: frozenset[str] = frozenset(
-    {
-        "devices",
-    }
-)
-
 
 def orchestrator_field_names() -> frozenset[str]:
     """Return the names of every field on OrchestratorArgs."""
     return frozenset(f.name for f in fields(OrchestratorArgs))
 
 
-def deploy_override_field_names() -> frozenset[str]:
-    """Return kwargs whose parser defaults must not override deploy YAML."""
-    return _DEPLOY_ENGINE_ARG_OVERRIDE_FIELDS | _DEPLOY_RUNTIME_OVERRIDE_FIELDS
-
-
 def internal_blacklist_keys() -> frozenset[str]:
     """Return the set of CLI keys that must never be forwarded as per-stage
     engine overrides.
@@ -653,6 +615,8 @@ def nullify_stage_engine_defaults(parser: argparse.ArgumentParser) -> None:
     """Reset stage-level engine flag defaults to ``None``; preserve real
     default in help text. Only deploy-YAML override fields are touched.
     Idempotent."""
+    from vllm_omni.config.stage_config import deploy_override_field_names
+
     override_dests = deploy_override_field_names()
 
     for action in parser._actions:
diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py
index 54c9d32d9ea..79aa988fc34 100644
--- a/vllm_omni/engine/async_omni_engine.py
+++ b/vllm_omni/engine/async_omni_engine.py
@@ -1324,6 +1324,9 @@ def _create_default_diffusion_stage_cfg(kwargs: dict[str, Any]) -> list:
 
         num_devices = max(1, int(parallel_config.world_size))
         devices = ",".join(str(i) for i in range(num_devices))
+        enforce_eager = kwargs.get("enforce_eager")
+        if enforce_eager is None:
+            enforce_eager = False
 
         stage_engine_args = {
             "max_num_seqs": 1,
@@ -1337,7 +1340,7 @@ def _create_default_diffusion_stage_cfg(kwargs: dict[str, Any]) -> list:
             "enable_cache_dit_summary": kwargs.get("enable_cache_dit_summary", False),
             "enable_cpu_offload": kwargs.get("enable_cpu_offload", False),
             "enable_layerwise_offload": kwargs.get("enable_layerwise_offload", False),
-            "enforce_eager": False if kwargs.get("enforce_eager") is None else kwargs.get("enforce_eager"),
+            "enforce_eager": enforce_eager,
             "boundary_ratio": kwargs.get("boundary_ratio", None),
             "flow_shift": kwargs.get("flow_shift", None),
             "diffusion_load_format": kwargs.get("diffusion_load_format", "default"),
diff --git a/vllm_omni/entrypoints/omni_base.py b/vllm_omni/entrypoints/omni_base.py
index 4147c802765..c054b857651 100644
--- a/vllm_omni/entrypoints/omni_base.py
+++ b/vllm_omni/entrypoints/omni_base.py
@@ -100,9 +100,7 @@ def from_cli_args(
         kwargs: dict[str, Any] = {k: v for k, v in vars(args).items() if not k.startswith("_")}
 
         if parser is not None and not getattr(parser, "_omni_nullified", False):
-            from vllm_omni.engine.arg_utils import (
-                deploy_override_field_names,
-            )
+            from vllm_omni.config.stage_config import deploy_override_field_names
             from vllm_omni.entrypoints.utils import detect_explicit_cli_keys
 
             explicit = detect_explicit_cli_keys(sys.argv[1:], parser) or set()

From da464affba19688025f4eae4506808855d0f2b0c Mon Sep 17 00:00:00 2001
From: xiaohajiayou <923390377@qq.com>
Date: Sat, 2 May 2026 16:33:47 +0800
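Subject: [PATCH 2/8] Fix mimo audio async chunk None handling

llm2code2wav_async_chunk called pooling_output.get() unconditionally,
so a None pooling_output raised AttributeError. When pooling_output is
not a dict, return None while the request is still running, and fall
through to the flush path once the request is finished.

Also set trust_remote_code: true explicitly on the stages in
bagel.yaml and bagel_single_stage.yaml.

Signed-off-by: xiaohajiayou <923390377@qq.com>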
---
 vllm_omni/deploy/bagel.yaml                            |  2 ++
 vllm_omni/deploy/bagel_single_stage.yaml               |  1 +
 .../stage_input_processors/mimo_audio.py               | 10 ++++++++--
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/vllm_omni/deploy/bagel.yaml b/vllm_omni/deploy/bagel.yaml
index 2b42d4171e1..4dd12b6f000 100644
--- a/vllm_omni/deploy/bagel.yaml
+++ b/vllm_omni/deploy/bagel.yaml
@@ -12,6 +12,7 @@ stages:
   - stage_id: 0
     max_num_seqs: 3
     gpu_memory_utilization: 0.45
+    trust_remote_code: true
     enable_prefix_caching: false
     devices: "0"
     default_sampling_params:
@@ -26,6 +27,7 @@ stages:
   - stage_id: 1
     max_num_seqs: 1
     enforce_eager: true
+    trust_remote_code: true
     enable_prefix_caching: false
     devices: "0"
     input_connectors:
diff --git a/vllm_omni/deploy/bagel_single_stage.yaml b/vllm_omni/deploy/bagel_single_stage.yaml
index 560d4670304..858a7e8b66b 100644
--- a/vllm_omni/deploy/bagel_single_stage.yaml
+++ b/vllm_omni/deploy/bagel_single_stage.yaml
@@ -17,6 +17,7 @@ async_chunk: false
 stages:
   - stage_id: 0
     max_num_seqs: 1
+    trust_remote_code: true
     enable_prefix_caching: false
     devices: "0"
     default_sampling_params:
diff --git a/vllm_omni/model_executor/stage_input_processors/mimo_audio.py b/vllm_omni/model_executor/stage_input_processors/mimo_audio.py
index 96680b2dd94..9f868feed85 100644
--- a/vllm_omni/model_executor/stage_input_processors/mimo_audio.py
+++ b/vllm_omni/model_executor/stage_input_processors/mimo_audio.py
@@ -114,7 +114,7 @@ def _to_code_tensor(codes: Any) -> torch.Tensor | None:
 
 def llm2code2wav_async_chunk(
     transfer_manager: Any,
-    pooling_output: dict[str, Any],
+    pooling_output: dict[str, Any] | None,
     request: Any,
     is_finished: bool = False,
 ) -> dict[str, Any] | None:
@@ -132,7 +132,13 @@ def llm2code2wav_async_chunk(
 
     request_id = getattr(request, "external_req_id", None)
 
-    po_codes = pooling_output.get("codes", {})
+    if isinstance(pooling_output, dict):
+        po_codes = pooling_output.get("codes", {})
+    elif not is_finished:
+        return None
+    else:
+        po_codes = {}
+
     if "audio" not in po_codes:
         if is_finished:
             return _flush_remaining_codes(transfer_manager, request_id, chunk_size, left_context_size)

From 3675d561793cc95ceac16ea845f9843e36e7567b Mon Sep 17 00:00:00 2001
From: xiaohajiayou <923390377@qq.com>
Date: Sat, 2 May 2026 17:50:00 +0800
Subject: [PATCH 3/8] Restore deploy runtime defaults for migrated models

Signed-off-by: xiaohajiayou <923390377@qq.com>
---
 tests/e2e/online_serving/test_mimo_audio.py   |  1 +
 vllm_omni/config/stage_config.py              | 21 +++++++++++++------
 vllm_omni/deploy/bagel.yaml                   |  1 +
 vllm_omni/deploy/bagel_single_stage.yaml      |  1 +
 vllm_omni/deploy/mimo_audio.yaml              |  2 ++
 vllm_omni/deploy/qwen3_tts.yaml               |  2 ++
 vllm_omni/engine/async_omni_engine.py         |  5 +----
 .../stage_input_processors/mimo_audio.py      | 10 ++-------
 8 files changed, 25 insertions(+), 18 deletions(-)

diff --git a/tests/e2e/online_serving/test_mimo_audio.py b/tests/e2e/online_serving/test_mimo_audio.py
index df00c64161e..38ee721f434 100644
--- a/tests/e2e/online_serving/test_mimo_audio.py
+++ b/tests/e2e/online_serving/test_mimo_audio.py
@@ -83,6 +83,7 @@ def get_max_batch_size(size_type="few"):
 @pytest.mark.omni
 @hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards=1)
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
+@pytest.mark.skip(reason="CI failed 8571")
 def test_audio_to_text_audio_001(omni_server, openai_client) -> None:
     """
     Test audio and text input processing and text/audio output generation via OpenAI API.
diff --git a/vllm_omni/config/stage_config.py b/vllm_omni/config/stage_config.py
index 8a4a8073071..cd082780159 100644
--- a/vllm_omni/config/stage_config.py
+++ b/vllm_omni/config/stage_config.py
@@ -401,7 +401,11 @@ class StageDeployConfig:
     the top level of ``DeployConfig`` and propagated to every stage.
     """
 
+    # Stage identity and GPU placement.
     stage_id: int
+    devices: str = "0"
+
+    # Scheduler and memory-capacity knobs passed to vLLM engine args.
     max_num_seqs: int | None = None
     gpu_memory_utilization: float | None = None
     tensor_parallel_size: int | None = None
@@ -409,19 +413,24 @@ class StageDeployConfig:
     max_num_batched_tokens: int | None = None
     max_model_len: int | None = None
     async_scheduling: bool | None = None
-    devices: str = "0"
-    output_connectors: dict[str, str] | None = None
-    input_connectors: dict[str, str] | None = None
-    default_sampling_params: dict[str, Any] | None = None
-    subtalker_sampling_params: dict[str, Any] | None = None
-    profiler_config: dict[str, Any] | None = None
     disable_hybrid_kv_cache_manager: bool | None = None
     mm_processor_cache_gb: float | None = None
+
+    # Profiling, tokenizer/config parsing, and model-loading behavior.
+    profiler_config: dict[str, Any] | None = None
     skip_mm_profiling: bool | None = None
     enable_flashinfer_autotune: bool | None = None
     config_format: str | None = None
     load_format: str | None = None
     tokenizer_mode: str | None = None
+
+    # Inter-stage connector wiring and default request sampling behavior.
+    output_connectors: dict[str, str] | None = None
+    input_connectors: dict[str, str] | None = None
+    default_sampling_params: dict[str, Any] | None = None
+    subtalker_sampling_params: dict[str, Any] | None = None
+
+    # Pass-through engine args that are not represented by explicit fields.
     engine_extras: dict[str, Any] = field(default_factory=dict)
 
 
diff --git a/vllm_omni/deploy/bagel.yaml b/vllm_omni/deploy/bagel.yaml
index 4dd12b6f000..de2f2877b18 100644
--- a/vllm_omni/deploy/bagel.yaml
+++ b/vllm_omni/deploy/bagel.yaml
@@ -12,6 +12,7 @@ stages:
   - stage_id: 0
     max_num_seqs: 3
     gpu_memory_utilization: 0.45
+    enforce_eager: true
     trust_remote_code: true
     enable_prefix_caching: false
     devices: "0"
diff --git a/vllm_omni/deploy/bagel_single_stage.yaml b/vllm_omni/deploy/bagel_single_stage.yaml
index 858a7e8b66b..d7a0aca4a49 100644
--- a/vllm_omni/deploy/bagel_single_stage.yaml
+++ b/vllm_omni/deploy/bagel_single_stage.yaml
@@ -17,6 +17,7 @@ async_chunk: false
 stages:
   - stage_id: 0
     max_num_seqs: 1
+    enforce_eager: true
     trust_remote_code: true
     enable_prefix_caching: false
     devices: "0"
diff --git a/vllm_omni/deploy/mimo_audio.yaml b/vllm_omni/deploy/mimo_audio.yaml
index d4bb8dd9e35..a92e905f70e 100644
--- a/vllm_omni/deploy/mimo_audio.yaml
+++ b/vllm_omni/deploy/mimo_audio.yaml
@@ -25,6 +25,7 @@ stages:
     max_num_seqs: 1
     gpu_memory_utilization: 0.3
     enforce_eager: true
+    trust_remote_code: true
     enable_prefix_caching: false
     max_num_batched_tokens: 8192
     max_model_len: 8192
@@ -43,6 +44,7 @@ stages:
     max_num_seqs: 1
     gpu_memory_utilization: 0.2
     enforce_eager: true
+    trust_remote_code: true
     enable_prefix_caching: false
     async_scheduling: false
     max_num_batched_tokens: 8192
diff --git a/vllm_omni/deploy/qwen3_tts.yaml b/vllm_omni/deploy/qwen3_tts.yaml
index bb57c9eae66..599322d95c9 100644
--- a/vllm_omni/deploy/qwen3_tts.yaml
+++ b/vllm_omni/deploy/qwen3_tts.yaml
@@ -31,6 +31,7 @@ stages:
   - stage_id: 0
     max_num_seqs: 10
     gpu_memory_utilization: 0.3
+    trust_remote_code: true
     enable_prefix_caching: false
     async_scheduling: true
     max_num_batched_tokens: 512
@@ -54,6 +55,7 @@ stages:
     max_num_seqs: 1
     gpu_memory_utilization: 0.3
     enforce_eager: true
+    trust_remote_code: true
     enable_prefix_caching: false
     async_scheduling: true
     # Must be divisible by num_code_groups and cover (left_context + chunk).
diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py
index 79aa988fc34..54c9d32d9ea 100644
--- a/vllm_omni/engine/async_omni_engine.py
+++ b/vllm_omni/engine/async_omni_engine.py
@@ -1324,9 +1324,6 @@ def _create_default_diffusion_stage_cfg(kwargs: dict[str, Any]) -> list:
 
         num_devices = max(1, int(parallel_config.world_size))
         devices = ",".join(str(i) for i in range(num_devices))
-        enforce_eager = kwargs.get("enforce_eager")
-        if enforce_eager is None:
-            enforce_eager = False
 
         stage_engine_args = {
             "max_num_seqs": 1,
@@ -1340,7 +1337,7 @@ def _create_default_diffusion_stage_cfg(kwargs: dict[str, Any]) -> list:
             "enable_cache_dit_summary": kwargs.get("enable_cache_dit_summary", False),
             "enable_cpu_offload": kwargs.get("enable_cpu_offload", False),
             "enable_layerwise_offload": kwargs.get("enable_layerwise_offload", False),
-            "enforce_eager": enforce_eager,
+            "enforce_eager": False if kwargs.get("enforce_eager") is None else kwargs.get("enforce_eager"),
             "boundary_ratio": kwargs.get("boundary_ratio", None),
             "flow_shift": kwargs.get("flow_shift", None),
             "diffusion_load_format": kwargs.get("diffusion_load_format", "default"),
diff --git a/vllm_omni/model_executor/stage_input_processors/mimo_audio.py b/vllm_omni/model_executor/stage_input_processors/mimo_audio.py
index 9f868feed85..96680b2dd94 100644
--- a/vllm_omni/model_executor/stage_input_processors/mimo_audio.py
+++ b/vllm_omni/model_executor/stage_input_processors/mimo_audio.py
@@ -114,7 +114,7 @@ def _to_code_tensor(codes: Any) -> torch.Tensor | None:
 
 def llm2code2wav_async_chunk(
     transfer_manager: Any,
-    pooling_output: dict[str, Any] | None,
+    pooling_output: dict[str, Any],
     request: Any,
     is_finished: bool = False,
 ) -> dict[str, Any] | None:
@@ -132,13 +132,7 @@ def llm2code2wav_async_chunk(
 
     request_id = getattr(request, "external_req_id", None)
 
-    if isinstance(pooling_output, dict):
-        po_codes = pooling_output.get("codes", {})
-    elif not is_finished:
-        return None
-    else:
-        po_codes = {}
-
+    po_codes = pooling_output.get("codes", {})
     if "audio" not in po_codes:
         if is_finished:
             return _flush_remaining_codes(transfer_manager, request_id, chunk_size, left_context_size)

From 56049538e0d75ba15a3308c55401a8016d1aa9e0 Mon Sep 17 00:00:00 2001
From: xiaohajiayou <923390377@qq.com>
Date: Sun, 3 May 2026 00:05:38 +0800
Subject: [PATCH 4/8] Preserve deploy defaults for migrated configs

Signed-off-by: xiaohajiayou <923390377@qq.com>
---
 vllm_omni/deploy/bagel.yaml              | 4 ++--
 vllm_omni/deploy/bagel_single_stage.yaml | 1 +
 vllm_omni/deploy/cosyvoice3.yaml         | 6 ++++++
 vllm_omni/deploy/fish_qwen3_omni.yaml    | 2 ++
 vllm_omni/deploy/glm_image.yaml          | 3 +++
 vllm_omni/deploy/qwen2_5_omni.yaml       | 6 ++++++
 vllm_omni/deploy/qwen3_omni_moe.yaml     | 8 ++++++++
 vllm_omni/deploy/voxtral_tts.yaml        | 3 +++
 8 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/vllm_omni/deploy/bagel.yaml b/vllm_omni/deploy/bagel.yaml
index de2f2877b18..188bf948ca7 100644
--- a/vllm_omni/deploy/bagel.yaml
+++ b/vllm_omni/deploy/bagel.yaml
@@ -10,9 +10,9 @@ async_chunk: false
 
 stages:
   - stage_id: 0
+    max_num_batched_tokens: 32768
     max_num_seqs: 3
     gpu_memory_utilization: 0.45
-    enforce_eager: true
     trust_remote_code: true
     enable_prefix_caching: false
     devices: "0"
@@ -26,8 +26,8 @@ stages:
       repetition_penalty: 1.05
 
   - stage_id: 1
+    max_num_batched_tokens: 32768
     max_num_seqs: 1
-    enforce_eager: true
     trust_remote_code: true
     enable_prefix_caching: false
     devices: "0"
diff --git a/vllm_omni/deploy/bagel_single_stage.yaml b/vllm_omni/deploy/bagel_single_stage.yaml
index d7a0aca4a49..bcfbad253a5 100644
--- a/vllm_omni/deploy/bagel_single_stage.yaml
+++ b/vllm_omni/deploy/bagel_single_stage.yaml
@@ -16,6 +16,7 @@ async_chunk: false
 
 stages:
   - stage_id: 0
+    max_num_batched_tokens: 32768
     max_num_seqs: 1
     enforce_eager: true
     trust_remote_code: true
diff --git a/vllm_omni/deploy/cosyvoice3.yaml b/vllm_omni/deploy/cosyvoice3.yaml
index d708564ddee..dd1bd6b78c8 100644
--- a/vllm_omni/deploy/cosyvoice3.yaml
+++ b/vllm_omni/deploy/cosyvoice3.yaml
@@ -27,9 +27,11 @@ connectors:
 
 stages:
   - stage_id: 0
+    max_num_batched_tokens: 32768
     max_num_seqs: 1
     gpu_memory_utilization: 0.4
     enforce_eager: true
+    trust_remote_code: true
     devices: "0"
     output_connectors:
       to_stage_1: connector_of_shared_memory
@@ -40,19 +42,23 @@ stages:
       # near-identity repetition penalty forces vLLM to track
       # output_token_ids for RAS (stop-token logit logsumexp).
       repetition_penalty: 1.0001
+    disable_hybrid_kv_cache_manager: true
     enable_prefix_caching: false
     mm_processor_cache_gb: 0
     skip_mm_profiling: true
 
   - stage_id: 1
+    max_num_batched_tokens: 32768
     max_num_seqs: 1
     gpu_memory_utilization: 0.2
     enforce_eager: true
+    trust_remote_code: true
     max_model_len: 32768
     devices: "0"
     input_connectors:
       from_stage_0: connector_of_shared_memory
     default_sampling_params:
       max_tokens: 2048
+    disable_hybrid_kv_cache_manager: true
     enable_prefix_caching: false
     skip_mm_profiling: true
diff --git a/vllm_omni/deploy/fish_qwen3_omni.yaml b/vllm_omni/deploy/fish_qwen3_omni.yaml
index d993a45b2f9..5b0c44988a0 100644
--- a/vllm_omni/deploy/fish_qwen3_omni.yaml
+++ b/vllm_omni/deploy/fish_qwen3_omni.yaml
@@ -24,6 +24,7 @@ stages:
     max_num_seqs: 4
     gpu_memory_utilization: 0.6
     enforce_eager: false
+    trust_remote_code: true
     enable_prefix_caching: false
     async_scheduling: false
     # vLLM >=0.19 requires max_num_batched_tokens >= max_model_len when
@@ -47,6 +48,7 @@ stages:
     max_num_seqs: 1
     gpu_memory_utilization: 0.1
     enforce_eager: true
+    trust_remote_code: true
     enable_prefix_caching: false
     async_scheduling: false
     max_num_batched_tokens: 16384
diff --git a/vllm_omni/deploy/glm_image.yaml b/vllm_omni/deploy/glm_image.yaml
index ee5173ab78a..099df1b1508 100644
--- a/vllm_omni/deploy/glm_image.yaml
+++ b/vllm_omni/deploy/glm_image.yaml
@@ -18,6 +18,7 @@ stages:
     max_num_seqs: 1
     gpu_memory_utilization: 0.6
     enforce_eager: false
+    trust_remote_code: true
     enable_prefix_caching: false
     max_num_batched_tokens: 32768
     devices: "0"
@@ -33,8 +34,10 @@ stages:
   # Stage 1: Diffusion (DiT + VAE)
   # Receives prior_token_ids from AR, performs denoising + VAE decode.
   - stage_id: 1
+    max_num_batched_tokens: 32768
     max_num_seqs: 1
     enforce_eager: true
+    trust_remote_code: true
     enable_prefix_caching: false
     devices: "1"
     default_sampling_params:
diff --git a/vllm_omni/deploy/qwen2_5_omni.yaml b/vllm_omni/deploy/qwen2_5_omni.yaml
index f9183080a2c..bd602509a55 100644
--- a/vllm_omni/deploy/qwen2_5_omni.yaml
+++ b/vllm_omni/deploy/qwen2_5_omni.yaml
@@ -19,9 +19,11 @@ async_chunk: false
 
 stages:
   - stage_id: 0
+    max_num_batched_tokens: 32768
     max_num_seqs: 1
     gpu_memory_utilization: 0.8
     enforce_eager: true
+    trust_remote_code: true
     enable_prefix_caching: false
     mm_processor_cache_gb: 0
     devices: "0"
@@ -34,9 +36,11 @@ stages:
       repetition_penalty: 1.1
 
   - stage_id: 1
+    max_num_batched_tokens: 32768
     max_num_seqs: 1
     gpu_memory_utilization: 0.8
     enforce_eager: true
+    trust_remote_code: true
     enable_prefix_caching: false
     devices: "1"
     default_sampling_params:
@@ -48,9 +52,11 @@ stages:
       repetition_penalty: 1.05
 
   - stage_id: 2
+    max_num_batched_tokens: 32768
     max_num_seqs: 1
     gpu_memory_utilization: 0.15
     enforce_eager: true
+    trust_remote_code: true
     enable_prefix_caching: false
     enable_flashinfer_autotune: false
     async_scheduling: false
diff --git a/vllm_omni/deploy/qwen3_omni_moe.yaml b/vllm_omni/deploy/qwen3_omni_moe.yaml
index 270d81cb73c..a9fd09b41e4 100644
--- a/vllm_omni/deploy/qwen3_omni_moe.yaml
+++ b/vllm_omni/deploy/qwen3_omni_moe.yaml
@@ -22,7 +22,10 @@ connectors:
 
 stages:
   - stage_id: 0
+    max_num_batched_tokens: 32768
+    max_num_seqs: 64
     gpu_memory_utilization: 0.9
+    trust_remote_code: true
     enable_prefix_caching: false
     devices: "0"
     default_sampling_params:
@@ -34,7 +37,10 @@ stages:
       repetition_penalty: 1.05
 
   - stage_id: 1
+    max_num_batched_tokens: 32768
+    max_num_seqs: 64
     gpu_memory_utilization: 0.6
+    trust_remote_code: true
     enable_prefix_caching: false
     devices: "1"
     input_connectors:
@@ -47,8 +53,10 @@ stages:
       repetition_penalty: 1.05
 
   - stage_id: 2
+    max_num_seqs: 64
     gpu_memory_utilization: 0.1
     enforce_eager: true
+    trust_remote_code: true
     enable_prefix_caching: false
     async_scheduling: false
     max_num_batched_tokens: 51200
diff --git a/vllm_omni/deploy/voxtral_tts.yaml b/vllm_omni/deploy/voxtral_tts.yaml
index 09524febe54..929daddb13f 100644
--- a/vllm_omni/deploy/voxtral_tts.yaml
+++ b/vllm_omni/deploy/voxtral_tts.yaml
@@ -21,9 +21,11 @@ connectors:
 
 stages:
   - stage_id: 0
+    max_num_batched_tokens: 32768
     max_num_seqs: 32
     gpu_memory_utilization: 0.8
     enforce_eager: false
+    trust_remote_code: true
     enable_prefix_caching: false
     async_scheduling: true
     max_model_len: 4096
@@ -49,6 +51,7 @@ stages:
     max_num_seqs: 32
     gpu_memory_utilization: 0.1
     enforce_eager: true
+    trust_remote_code: true
     enable_prefix_caching: false
     async_scheduling: false
     max_num_batched_tokens: 65536

From 63a555846b4ea1d737e7f38443544c815f92ded0 Mon Sep 17 00:00:00 2001
From: xiaohajiayou <923390377@qq.com>
Date: Sun, 3 May 2026 01:18:55 +0800
Subject: [PATCH 5/8] Refactor StageDeployConfig: separate GPU resources &
 parallelism into own group

Move devices and tensor_parallel_size into a dedicated "GPU resources
and parallelism" section, leaving stage_id alone as stage identity.
Change devices default from "0" to None, and tighten the None check in
merge_pipeline_deploy to avoid writing a spurious "devices" key.

Signed-off-by: xiaohajiayou <923390377@qq.com>
---
 vllm_omni/config/stage_config.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/vllm_omni/config/stage_config.py b/vllm_omni/config/stage_config.py
index cd082780159..5cca40bacc4 100644
--- a/vllm_omni/config/stage_config.py
+++ b/vllm_omni/config/stage_config.py
@@ -401,14 +401,16 @@ class StageDeployConfig:
     the top level of ``DeployConfig`` and propagated to every stage.
     """
 
-    # Stage identity and GPU placement.
+    # Stage identity.
     stage_id: int
-    devices: str = "0"
+
+    # GPU resources and parallelism.
+    devices: str | None = None
+    tensor_parallel_size: int | None = None
 
     # Scheduler and memory-capacity knobs passed to vLLM engine args.
     max_num_seqs: int | None = None
     gpu_memory_utilization: float | None = None
-    tensor_parallel_size: int | None = None
     enforce_eager: bool | None = None
     max_num_batched_tokens: int | None = None
     max_model_len: int | None = None
@@ -484,10 +486,10 @@ def _parse_stage_deploy(stage_data: dict[str, Any]) -> StageDeployConfig:
     """Parse a single stage entry from deploy YAML into StageDeployConfig."""
     if "engine_args" in stage_data:
         engine_args = dict(stage_data["engine_args"])
-        devices = stage_data.get("runtime", {}).get("devices", stage_data.get("devices", "0"))
+        devices = stage_data.get("runtime", {}).get("devices", stage_data.get("devices"))
     else:
         engine_args = {k: v for k, v in stage_data.items() if k not in _STAGE_NON_ENGINE_KEYS and k != "stage_id"}
-        devices = stage_data.get("devices", "0")
+        devices = stage_data.get("devices")
 
     kwargs: dict[str, Any] = {"stage_id": stage_data["stage_id"], "devices": devices}
     for name, f in _STAGE_DEPLOY_FIELDS.items():
@@ -827,7 +829,7 @@ def merge_pipeline_deploy(
         engine_args = _build_engine_args(ps, ds, pipeline, deploy, next_stage_proc)
         extras = _build_extras(ps, ds)
         runtime: dict[str, Any] = {"process": True}
-        if ds is not None:
+        if ds is not None and ds.devices is not None:
             runtime["devices"] = ds.devices
 
         result.append(

From 3702299a3c02a0384bc1e1e262de108c90c915b6 Mon Sep 17 00:00:00 2001
From: xiaohajiayou <923390377@qq.com>
Date: Sun, 3 May 2026 16:05:06 +0800
Subject: [PATCH 6/8] Add compilation config to deploy stage schema

Signed-off-by: xiaohajiayou <923390377@qq.com>
---
 tests/test_config_factory.py     | 7 +++++++
 vllm_omni/config/stage_config.py | 1 +
 2 files changed, 8 insertions(+)

diff --git a/tests/test_config_factory.py b/tests/test_config_factory.py
index 3d002ddf9e2..f58bf8156bf 100644
--- a/tests/test_config_factory.py
+++ b/tests/test_config_factory.py
@@ -827,6 +827,7 @@ def test_deploy_override_fields_include_deploy_schema_fields(self):
         expected_fields = {
             "async_chunk",
             "async_scheduling",
+            "compilation_config",
             "config_format",
             "data_parallel_size",
             "devices",
@@ -881,6 +882,12 @@ def test_load_deploy_config(self):
                 for name in ("config_format", "load_format", "tokenizer_mode")
             )
 
+        ming_path = Path(__file__).parent.parent / "vllm_omni" / "deploy" / "ming_flash_omni.yaml"
+        if ming_path.exists():
+            ming_deploy = load_deploy_config(ming_path)
+            assert ming_deploy.stages[0].compilation_config == {"pass_config": {"fuse_allreduce_rms": False}}
+            assert "compilation_config" not in ming_deploy.stages[0].engine_extras
+
     def test_merge_pipeline_deploy(self):
         from pathlib import Path
 
diff --git a/vllm_omni/config/stage_config.py b/vllm_omni/config/stage_config.py
index 5cca40bacc4..8fdd1e1daa5 100644
--- a/vllm_omni/config/stage_config.py
+++ b/vllm_omni/config/stage_config.py
@@ -419,6 +419,7 @@ class StageDeployConfig:
     mm_processor_cache_gb: float | None = None
 
     # Profiling, tokenizer/config parsing, and model-loading behavior.
+    compilation_config: dict[str, Any] | None = None
     profiler_config: dict[str, Any] | None = None
     skip_mm_profiling: bool | None = None
     enable_flashinfer_autotune: bool | None = None

From 52215e81386f994777219fa2cc0a7d36fddac1c2 Mon Sep 17 00:00:00 2001
From: xiaohajiayou <923390377@qq.com>
Date: Mon, 4 May 2026 11:39:12 +0800
Subject: [PATCH 7/8] Reorganize StageDeployConfig fields: Omni-specific vs
 vLLM EngineArgs

Split fields into two clear sections:
- Omni fields: stage identity, devices, connectors, sampling params
- vLLM EngineArgs fields: parallelism, scheduler/memory, compilation, etc.

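Split the deploy-config loading test into separate, descriptive test
methods per deploy config, and drop the file-existence guards so a
missing deploy YAML now fails the test instead of being skipped.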
Re-add enforce_eager: true to stage 1 of the bagel deploy config.

Signed-off-by: xiaohajiayou <923390377@qq.com>
---
 tests/test_config_factory.py     | 42 +++++++++++++++++---------------
 vllm_omni/config/stage_config.py | 32 ++++++++++++------------
 vllm_omni/deploy/bagel.yaml      |  1 +
 3 files changed, 41 insertions(+), 34 deletions(-)

diff --git a/tests/test_config_factory.py b/tests/test_config_factory.py
index f58bf8156bf..0783bf15aae 100644
--- a/tests/test_config_factory.py
+++ b/tests/test_config_factory.py
@@ -856,37 +856,41 @@ def test_deploy_override_fields_include_deploy_schema_fields(self):
 
         assert expected_fields == deploy_override_field_names()
 
-    def test_load_deploy_config(self):
+    def test_load_qwen3_omni_moe_deploy_config(self):
         from pathlib import Path
 
         from vllm_omni.config.stage_config import load_deploy_config
 
         deploy_path = Path(__file__).parent.parent / "vllm_omni" / "deploy" / "qwen3_omni_moe.yaml"
-        if not deploy_path.exists():
-            pytest.skip("Deploy config not found")
-
         deploy = load_deploy_config(deploy_path)
         assert len(deploy.stages) == 3
         assert deploy.async_chunk is True
         assert deploy.connectors is not None
         assert deploy.platforms is not None
 
-        voxtral_path = Path(__file__).parent.parent / "vllm_omni" / "deploy" / "voxtral_tts.yaml"
-        if voxtral_path.exists():
-            voxtral_deploy = load_deploy_config(voxtral_path)
-            assert voxtral_deploy.stages[0].config_format == "mistral"
-            assert voxtral_deploy.stages[0].load_format == "mistral"
-            assert voxtral_deploy.stages[0].tokenizer_mode == "mistral"
-            assert not any(
-                name in voxtral_deploy.stages[0].engine_extras
-                for name in ("config_format", "load_format", "tokenizer_mode")
-            )
+    def test_load_voxtral_tts_deploy_config_schema_fields(self):
+        from pathlib import Path
+
+        from vllm_omni.config.stage_config import load_deploy_config
+
+        deploy_path = Path(__file__).parent.parent / "vllm_omni" / "deploy" / "voxtral_tts.yaml"
+        deploy = load_deploy_config(deploy_path)
+        assert deploy.stages[0].config_format == "mistral"
+        assert deploy.stages[0].load_format == "mistral"
+        assert deploy.stages[0].tokenizer_mode == "mistral"
+        assert not any(
+            name in deploy.stages[0].engine_extras for name in ("config_format", "load_format", "tokenizer_mode")
+        )
+
+    def test_load_ming_flash_omni_deploy_config_schema_fields(self):
+        from pathlib import Path
 
-        ming_path = Path(__file__).parent.parent / "vllm_omni" / "deploy" / "ming_flash_omni.yaml"
-        if ming_path.exists():
-            ming_deploy = load_deploy_config(ming_path)
-            assert ming_deploy.stages[0].compilation_config == {"pass_config": {"fuse_allreduce_rms": False}}
-            assert "compilation_config" not in ming_deploy.stages[0].engine_extras
+        from vllm_omni.config.stage_config import load_deploy_config
+
+        deploy_path = Path(__file__).parent.parent / "vllm_omni" / "deploy" / "ming_flash_omni.yaml"
+        deploy = load_deploy_config(deploy_path)
+        assert deploy.stages[0].compilation_config == {"pass_config": {"fuse_allreduce_rms": False}}
+        assert "compilation_config" not in deploy.stages[0].engine_extras
 
     def test_merge_pipeline_deploy(self):
         from pathlib import Path
diff --git a/vllm_omni/config/stage_config.py b/vllm_omni/config/stage_config.py
index 8fdd1e1daa5..8e910174d39 100644
--- a/vllm_omni/config/stage_config.py
+++ b/vllm_omni/config/stage_config.py
@@ -401,24 +401,32 @@ class StageDeployConfig:
     the top level of ``DeployConfig`` and propagated to every stage.
     """
 
-    # Stage identity.
+    # === Omni fields ===
+    # Stage identity and Omni runtime placement.
     stage_id: int
-
-    # GPU resources and parallelism.
     devices: str | None = None
-    tensor_parallel_size: int | None = None
 
-    # Scheduler and memory-capacity knobs passed to vLLM engine args.
-    max_num_seqs: int | None = None
+    # Inter-stage connector wiring and request defaults.
+    output_connectors: dict[str, str] | None = None
+    input_connectors: dict[str, str] | None = None
+    default_sampling_params: dict[str, Any] | None = None
+    subtalker_sampling_params: dict[str, Any] | None = None
+
+    # === vLLM EngineArgs fields ===
+    # Parallelism and scheduler/memory capacity.
+    tensor_parallel_size: int | None = None
     gpu_memory_utilization: float | None = None
-    enforce_eager: bool | None = None
+    max_num_seqs: int | None = None
     max_num_batched_tokens: int | None = None
     max_model_len: int | None = None
+
+    # Execution, scheduling, and KV/cache behavior.
+    enforce_eager: bool | None = None
     async_scheduling: bool | None = None
     disable_hybrid_kv_cache_manager: bool | None = None
     mm_processor_cache_gb: float | None = None
 
-    # Profiling, tokenizer/config parsing, and model-loading behavior.
+    # Compilation, profiling, tokenizer/config parsing, and model loading.
     compilation_config: dict[str, Any] | None = None
     profiler_config: dict[str, Any] | None = None
     skip_mm_profiling: bool | None = None
@@ -427,13 +435,7 @@ class StageDeployConfig:
     load_format: str | None = None
     tokenizer_mode: str | None = None
 
-    # Inter-stage connector wiring and default request sampling behavior.
-    output_connectors: dict[str, str] | None = None
-    input_connectors: dict[str, str] | None = None
-    default_sampling_params: dict[str, Any] | None = None
-    subtalker_sampling_params: dict[str, Any] | None = None
-
-    # Pass-through engine args that are not represented by explicit fields.
+    # Pass-through vLLM EngineArgs fields that are not represented above.
     engine_extras: dict[str, Any] = field(default_factory=dict)
 
 
diff --git a/vllm_omni/deploy/bagel.yaml b/vllm_omni/deploy/bagel.yaml
index 188bf948ca7..6b27318b4de 100644
--- a/vllm_omni/deploy/bagel.yaml
+++ b/vllm_omni/deploy/bagel.yaml
@@ -29,6 +29,7 @@ stages:
     max_num_batched_tokens: 32768
     max_num_seqs: 1
     trust_remote_code: true
+    enforce_eager: true
     enable_prefix_caching: false
     devices: "0"
     input_connectors:

From ad83e5f7cd9bb08dfa16b297f267d0bbd558bae6 Mon Sep 17 00:00:00 2001
From: xiaohajiayou <923390377@qq.com>
Date: Mon, 4 May 2026 21:49:57 +0800
Subject: [PATCH 8/8] Tidy up: field ordering in deploy YAMLs and minor code
 cleanup

Reorder fields in bagel/cosyvoice3/voxcpm2 deploy YAMLs for consistency.
Simplify deploy_override_field_names and to_omegaconf in stage_config.
Report the added/removed field names when the deploy override field
assertion in test_config_factory fails.

Signed-off-by: xiaohajiayou <923390377@qq.com>
---
 tests/test_config_factory.py     | 5 ++++-
 vllm_omni/config/stage_config.py | 9 ++-------
 vllm_omni/deploy/bagel.yaml      | 2 +-
 vllm_omni/deploy/cosyvoice3.yaml | 4 ++--
 vllm_omni/deploy/voxcpm2.yaml    | 2 +-
 5 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/tests/test_config_factory.py b/tests/test_config_factory.py
index 0783bf15aae..ac350daa541 100644
--- a/tests/test_config_factory.py
+++ b/tests/test_config_factory.py
@@ -854,7 +854,10 @@ def test_deploy_override_fields_include_deploy_schema_fields(self):
             "trust_remote_code",
         }
 
-        assert expected_fields == deploy_override_field_names()
+        actual_fields = deploy_override_field_names()
+        assert expected_fields == actual_fields, (
+            f"added={actual_fields - expected_fields}, removed={expected_fields - actual_fields}"
+        )
 
     def test_load_qwen3_omni_moe_deploy_config(self):
         from pathlib import Path
diff --git a/vllm_omni/config/stage_config.py b/vllm_omni/config/stage_config.py
index 8e910174d39..59365d093d2 100644
--- a/vllm_omni/config/stage_config.py
+++ b/vllm_omni/config/stage_config.py
@@ -716,10 +716,7 @@ def deploy_override_field_names() -> frozenset[str]:
     return (
         frozenset(_STAGE_DEPLOY_FIELDS)
         | frozenset(_PIPELINE_WIDE_ENGINE_FIELDS)
-        | {
-            "async_chunk",
-            "devices",
-        }
+        | frozenset({"async_chunk", "devices"})
     )
 
 
@@ -895,9 +892,7 @@ def to_omegaconf(self) -> Any:
 
         # CLI overrides take precedence over YAML defaults
         for key, value in self.runtime_overrides.items():
-            if value is None:
-                continue
-            if key not in ("devices", "max_batch_size"):
+            if value is not None and key not in ("devices", "max_batch_size"):
                 engine_args[key] = value
 
         # Build runtime config from YAML defaults + CLI overrides
diff --git a/vllm_omni/deploy/bagel.yaml b/vllm_omni/deploy/bagel.yaml
index 6b27318b4de..8de6f9305ba 100644
--- a/vllm_omni/deploy/bagel.yaml
+++ b/vllm_omni/deploy/bagel.yaml
@@ -28,8 +28,8 @@ stages:
   - stage_id: 1
     max_num_batched_tokens: 32768
     max_num_seqs: 1
-    trust_remote_code: true
     enforce_eager: true
+    trust_remote_code: true
     enable_prefix_caching: false
     devices: "0"
     input_connectors:
diff --git a/vllm_omni/deploy/cosyvoice3.yaml b/vllm_omni/deploy/cosyvoice3.yaml
index dd1bd6b78c8..4bfd4ab859d 100644
--- a/vllm_omni/deploy/cosyvoice3.yaml
+++ b/vllm_omni/deploy/cosyvoice3.yaml
@@ -32,6 +32,7 @@ stages:
     gpu_memory_utilization: 0.4
     enforce_eager: true
     trust_remote_code: true
+    enable_prefix_caching: false
     devices: "0"
     output_connectors:
       to_stage_1: connector_of_shared_memory
@@ -43,7 +44,6 @@ stages:
       # output_token_ids for RAS (stop-token logit logsumexp).
       repetition_penalty: 1.0001
     disable_hybrid_kv_cache_manager: true
-    enable_prefix_caching: false
     mm_processor_cache_gb: 0
     skip_mm_profiling: true
 
@@ -53,6 +53,7 @@ stages:
     gpu_memory_utilization: 0.2
     enforce_eager: true
     trust_remote_code: true
+    enable_prefix_caching: false
     max_model_len: 32768
     devices: "0"
     input_connectors:
@@ -60,5 +61,4 @@ stages:
     default_sampling_params:
       max_tokens: 2048
     disable_hybrid_kv_cache_manager: true
-    enable_prefix_caching: false
     skip_mm_profiling: true
diff --git a/vllm_omni/deploy/voxcpm2.yaml b/vllm_omni/deploy/voxcpm2.yaml
index cf4356cfb1a..71ef148242a 100644
--- a/vllm_omni/deploy/voxcpm2.yaml
+++ b/vllm_omni/deploy/voxcpm2.yaml
@@ -16,12 +16,12 @@ stages:
     max_num_seqs: 4
     gpu_memory_utilization: 0.9
     enforce_eager: true
+    trust_remote_code: true
     enable_prefix_caching: false
     async_scheduling: true
     max_num_batched_tokens: 4096
     max_model_len: 4096
     devices: "0"
-    trust_remote_code: true
     default_sampling_params:
       temperature: 0.0
       top_p: 1.0