From 6478b2c42f4fc44ba4276d527438b33c63892158 Mon Sep 17 00:00:00 2001 From: xiaohajiayou <923390377@qq.com> Date: Fri, 1 May 2026 17:39:21 +0800 Subject: [PATCH 1/8] Reapply deploy override field derivation Reapply the deploy override field derivation that was reverted in #3287 and make prefix-cache behavior explicit in deploy configs. This preserves the config refactor while restoring the previous Omni behavior where deploy stages do not accidentally fall through to vLLM's model-dependent prefix-cache default. Signed-off-by: xiaohajiayou <923390377@qq.com> --- tests/e2e/online_serving/test_mimo_audio.py | 1 - .../test_async_omni_diffusion_config.py | 10 +++ tests/helpers/stage_config.py | 1 + tests/test_arg_utils.py | 2 +- tests/test_config_factory.py | 69 ++++++++++++++++++- vllm_omni/config/stage_config.py | 44 ++++++++---- vllm_omni/deploy/bagel.yaml | 2 + vllm_omni/deploy/bagel_single_stage.yaml | 1 + vllm_omni/deploy/cosyvoice3.yaml | 4 +- vllm_omni/deploy/fish_qwen3_omni.yaml | 2 + vllm_omni/deploy/glm_image.yaml | 2 + vllm_omni/deploy/mimo_audio.yaml | 2 + vllm_omni/deploy/moss_tts_nano.yaml | 1 + vllm_omni/deploy/qwen2_5_omni.yaml | 3 + vllm_omni/deploy/qwen3_omni_moe.yaml | 3 + vllm_omni/deploy/qwen3_tts.yaml | 2 + vllm_omni/deploy/voxcpm2.yaml | 2 + vllm_omni/deploy/voxtral_tts.yaml | 2 + vllm_omni/engine/arg_utils.py | 40 +---------- vllm_omni/engine/async_omni_engine.py | 5 +- vllm_omni/entrypoints/omni_base.py | 4 +- 21 files changed, 142 insertions(+), 60 deletions(-) diff --git a/tests/e2e/online_serving/test_mimo_audio.py b/tests/e2e/online_serving/test_mimo_audio.py index 38ee721f434..df00c64161e 100644 --- a/tests/e2e/online_serving/test_mimo_audio.py +++ b/tests/e2e/online_serving/test_mimo_audio.py @@ -83,7 +83,6 @@ def get_max_batch_size(size_type="few"): @pytest.mark.omni @hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards=1) @pytest.mark.parametrize("omni_server", test_params, indirect=True) -@pytest.mark.skip(reason="CI failed 8571") def test_audio_to_text_audio_001(omni_server, openai_client) -> None: """ Test audio and text input processing and text/audio output generation via OpenAI API. diff --git a/tests/entrypoints/test_async_omni_diffusion_config.py b/tests/entrypoints/test_async_omni_diffusion_config.py index 83b465fdb47..235add5725f 100644 --- a/tests/entrypoints/test_async_omni_diffusion_config.py +++ b/tests/entrypoints/test_async_omni_diffusion_config.py @@ -4,6 +4,7 @@ import pytest from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm_omni.config.stage_config import deploy_override_field_names from vllm_omni.engine.async_omni_engine import AsyncOmniEngine from vllm_omni.entrypoints.cli.serve import OmniServeCommand, _create_default_diffusion_stage_cfg @@ -30,6 +31,15 @@ def test_default_stage_config_includes_cache_backend(): assert engine_args["model_stage"] == "diffusion" +def test_default_stage_config_ignores_none_deploy_overrides(): + """Ensure nullified deploy override defaults do not alter diffusion defaults.""" + baseline = AsyncOmniEngine._create_default_diffusion_stage_cfg({})[0] + nullified_overrides = {name: None for name in deploy_override_field_names()} + stage_cfg = AsyncOmniEngine._create_default_diffusion_stage_cfg(nullified_overrides)[0] + + assert stage_cfg == baseline + + def test_default_cache_config_used_when_missing(): """Ensure default cache_config is synthesized when only backend is given.""" stage_cfg = AsyncOmniEngine._create_default_diffusion_stage_cfg( diff --git a/tests/helpers/stage_config.py b/tests/helpers/stage_config.py index 2bb017b811f..ba7ea0c50c2 100644 --- a/tests/helpers/stage_config.py +++ b/tests/helpers/stage_config.py @@ -494,6 +494,7 @@ def delete_by_path(config_dict: dict, path: str) -> None: "max_num_seqs": 1, "gpu_memory_utilization": 0.9, "enforce_eager": True, + "enable_prefix_caching": False, "max_num_batched_tokens": 16384, "max_model_len": 16384, "skip_mm_profiling": True, diff --git a/tests/test_arg_utils.py b/tests/test_arg_utils.py index ae640b2d861..2fd5cf302e0 100644 --- a/tests/test_arg_utils.py +++ b/tests/test_arg_utils.py @@ -369,8 +369,8 @@ def _build_full_serve_parser(): def test_nullify_stage_engine_defaults_resets_inherited_defaults(): import argparse + from vllm_omni.config.stage_config import deploy_override_field_names from vllm_omni.engine.arg_utils import ( - deploy_override_field_names, nullify_stage_engine_defaults, ) diff --git a/tests/test_config_factory.py b/tests/test_config_factory.py index 16d49034fa1..b56b9240e51 100644 --- a/tests/test_config_factory.py +++ b/tests/test_config_factory.py @@ -93,6 +93,7 @@ def test_to_omegaconf_basic(self): assert omega_config.engine_args.worker_type == "ar" assert omega_config.final_output is True assert omega_config.final_output_type == "text" + assert "max_num_seqs" not in omega_config.engine_args # Legacy field name for backward compatibility assert omega_config.engine_input_source == [] @@ -146,6 +147,24 @@ def test_to_omegaconf_max_num_seqs_in_engine_args(self): omega_config = config.to_omegaconf() assert omega_config.engine_args.max_num_seqs == 32 + def test_to_omegaconf_omits_none_deploy_overrides_for_engine_args(self): + """None deploy overrides must fall through to EngineArgs defaults.""" + from vllm_omni.config.stage_config import deploy_override_field_names + + config = StageConfig( + stage_id=0, + model_stage="thinker", + runtime_overrides={name: None for name in deploy_override_field_names()}, + ) + + omega_config = config.to_omegaconf() + engine_args = dict(omega_config.engine_args) + + assert "devices" not in engine_args + assert "max_batch_size" not in engine_args + for name in deploy_override_field_names() - {"devices"}: + assert name not in engine_args + class TestModelPipeline: """Tests for ModelPipeline class.""" @@ -802,6 +821,40 @@ def test_register_and_lookup(self): class TestDeployConfigLoading: + def test_deploy_override_fields_include_deploy_schema_fields(self): + from vllm_omni.config.stage_config import deploy_override_field_names + + expected_fields = { + "async_chunk", + "async_scheduling", + "config_format", + "data_parallel_size", + "devices", + "disable_hybrid_kv_cache_manager", + "distributed_executor_backend", + "dtype", + "enable_chunked_prefill", + "enable_flashinfer_autotune", + "enable_prefix_caching", + "enforce_eager", + "gpu_memory_utilization", + "load_format", + "max_model_len", + "max_num_batched_tokens", + "max_num_seqs", + "mm_processor_cache_gb", + "pipeline_parallel_size", + "profiler_config", + "quantization", + "skip_mm_profiling", + "subtalker_sampling_params", + "tensor_parallel_size", + "tokenizer_mode", + "trust_remote_code", + } + + assert expected_fields == deploy_override_field_names() + def test_load_deploy_config(self): from pathlib import Path @@ -817,6 +870,17 @@ def test_load_deploy_config(self): assert deploy.connectors is not None assert deploy.platforms is not None + voxtral_path = Path(__file__).parent.parent / "vllm_omni" / "deploy" / "voxtral_tts.yaml" + if voxtral_path.exists(): + voxtral_deploy = load_deploy_config(voxtral_path) + assert voxtral_deploy.stages[0].config_format == "mistral" + assert voxtral_deploy.stages[0].load_format == "mistral" + assert voxtral_deploy.stages[0].tokenizer_mode == "mistral" + assert not any( + name in voxtral_deploy.stages[0].engine_extras + for name in ("config_format", "load_format", "tokenizer_mode") + ) + def test_merge_pipeline_deploy(self): from pathlib import Path @@ -1011,7 +1075,8 @@ def test_ci_inherits_from_main(self): deploy = load_deploy_config(ci_path) assert len(deploy.stages) == 3 # CI overrides - assert deploy.stages[0].engine_extras.get("load_format") == "dummy" + assert deploy.stages[0].load_format == "dummy" + assert "load_format" not in deploy.stages[0].engine_extras assert deploy.stages[0].max_num_seqs == 5 # Inherited from base assert deploy.stages[0].gpu_memory_utilization == 0.9 @@ -1216,7 +1281,7 @@ def test_typed_kwarg_overrides_yaml(self): def test_none_value_skipped_yaml_wins(self): stages = self._stages({"max_num_seqs": None}) assert stages[2].runtime_overrides.get("max_num_seqs") is None - assert stages[2].yaml_engine_args.get("max_num_seqs") == 1 + assert "max_num_seqs" not in stages[2].yaml_engine_args def test_empty_kwargs_yaml_only(self): stages = self._stages({}) diff --git a/vllm_omni/config/stage_config.py b/vllm_omni/config/stage_config.py index 44cc83baea8..8a4a8073071 100644 --- a/vllm_omni/config/stage_config.py +++ b/vllm_omni/config/stage_config.py @@ -402,11 +402,11 @@ class StageDeployConfig: """ stage_id: int - max_num_seqs: int = 64 - gpu_memory_utilization: float = 0.9 - tensor_parallel_size: int = 1 - enforce_eager: bool = False - max_num_batched_tokens: int = 32768 + max_num_seqs: int | None = None + gpu_memory_utilization: float | None = None + tensor_parallel_size: int | None = None + enforce_eager: bool | None = None + max_num_batched_tokens: int | None = None max_model_len: int | None = None async_scheduling: bool | None = None devices: str = "0" @@ -414,6 +414,14 @@ class StageDeployConfig: input_connectors: dict[str, str] | None = None default_sampling_params: dict[str, Any] | None = None subtalker_sampling_params: dict[str, Any] | None = None + profiler_config: dict[str, Any] | None = None + disable_hybrid_kv_cache_manager: bool | None = None + mm_processor_cache_gb: float | None = None + skip_mm_profiling: bool | None = None + enable_flashinfer_autotune: bool | None = None + config_format: str | None = None + load_format: str | None = None + tokenizer_mode: str | None = None engine_extras: dict[str, Any] = field(default_factory=dict) @@ -438,14 +446,14 @@ class DeployConfig: pipeline: str | None = None # === Pipeline-wide engine settings (applied uniformly to every stage) === - trust_remote_code: bool = True + trust_remote_code: bool | None = None distributed_executor_backend: str | None = None dtype: str | None = None quantization: str | None = None - enable_prefix_caching: bool = False + enable_prefix_caching: bool | None = None enable_chunked_prefill: bool | None = None - data_parallel_size: int = 1 - pipeline_parallel_size: int = 1 + data_parallel_size: int | None = None + pipeline_parallel_size: int | None = None _STAGE_NON_ENGINE_KEYS = frozenset( @@ -689,6 +697,18 @@ def _select_processor_funcs( ) +def deploy_override_field_names() -> frozenset[str]: + """Return deploy-schema fields whose CLI defaults must not override YAML.""" + return ( + frozenset(_STAGE_DEPLOY_FIELDS) + | frozenset(_PIPELINE_WIDE_ENGINE_FIELDS) + | { + "async_chunk", + "devices", + } + ) + + def _build_engine_args( ps: StagePipelineConfig, ds: StageDeployConfig | None, @@ -861,13 +881,15 @@ def to_omegaconf(self) -> Any: # CLI overrides take precedence over YAML defaults for key, value in self.runtime_overrides.items(): + if value is None: + continue if key not in ("devices", "max_batch_size"): engine_args[key] = value # Build runtime config from YAML defaults + CLI overrides runtime: dict[str, Any] = dict(self.yaml_runtime) runtime.setdefault("process", True) - if "devices" in self.runtime_overrides: + if self.runtime_overrides.get("devices") is not None: runtime["devices"] = self.runtime_overrides["devices"] # Legacy compat: migrate runtime.max_batch_size → engine_args.max_num_seqs @@ -883,8 +905,6 @@ def to_omegaconf(self) -> Any: effective_mbs = int(cli_mbs or legacy_mbs or 1) engine_args.setdefault("max_num_seqs", effective_mbs) - engine_args.setdefault("max_num_seqs", 1) - # Build full config dict config_dict: dict[str, Any] = { "stage_id": self.stage_id, diff --git a/vllm_omni/deploy/bagel.yaml b/vllm_omni/deploy/bagel.yaml index 9d2f1f8fffa..2b42d4171e1 100644 --- a/vllm_omni/deploy/bagel.yaml +++ b/vllm_omni/deploy/bagel.yaml @@ -12,6 +12,7 @@ stages: - stage_id: 0 max_num_seqs: 3 gpu_memory_utilization: 0.45 + enable_prefix_caching: false devices: "0" default_sampling_params: temperature: 0.4 @@ -25,6 +26,7 @@ stages: - stage_id: 1 max_num_seqs: 1 enforce_eager: true + enable_prefix_caching: false devices: "0" input_connectors: from_stage_0: shared_memory_connector diff --git a/vllm_omni/deploy/bagel_single_stage.yaml b/vllm_omni/deploy/bagel_single_stage.yaml index 8470124ec78..560d4670304 100644 --- a/vllm_omni/deploy/bagel_single_stage.yaml +++ b/vllm_omni/deploy/bagel_single_stage.yaml @@ -17,6 +17,7 @@ async_chunk: false stages: - stage_id: 0 max_num_seqs: 1 + enable_prefix_caching: false devices: "0" default_sampling_params: seed: 52 diff --git a/vllm_omni/deploy/cosyvoice3.yaml b/vllm_omni/deploy/cosyvoice3.yaml index 53e3eb3f301..d708564ddee 100644 --- a/vllm_omni/deploy/cosyvoice3.yaml +++ b/vllm_omni/deploy/cosyvoice3.yaml @@ -40,7 +40,7 @@ stages: # near-identity repetition penalty forces vLLM to track # output_token_ids for RAS (stop-token logit logsumexp). repetition_penalty: 1.0001 - disable_hybrid_kv_cache_manager: true + enable_prefix_caching: false mm_processor_cache_gb: 0 skip_mm_profiling: true @@ -54,5 +54,5 @@ stages: from_stage_0: connector_of_shared_memory default_sampling_params: max_tokens: 2048 - disable_hybrid_kv_cache_manager: true + enable_prefix_caching: false skip_mm_profiling: true diff --git a/vllm_omni/deploy/fish_qwen3_omni.yaml b/vllm_omni/deploy/fish_qwen3_omni.yaml index a5bee925b68..d993a45b2f9 100644 --- a/vllm_omni/deploy/fish_qwen3_omni.yaml +++ b/vllm_omni/deploy/fish_qwen3_omni.yaml @@ -24,6 +24,7 @@ stages: max_num_seqs: 4 gpu_memory_utilization: 0.6 enforce_eager: false + enable_prefix_caching: false async_scheduling: false # vLLM >=0.19 requires max_num_batched_tokens >= max_model_len when # enable_chunked_prefill=false. Bumped from legacy 3072 to match @@ -46,6 +47,7 @@ stages: max_num_seqs: 1 gpu_memory_utilization: 0.1 enforce_eager: true + enable_prefix_caching: false async_scheduling: false max_num_batched_tokens: 16384 max_model_len: 16384 diff --git a/vllm_omni/deploy/glm_image.yaml b/vllm_omni/deploy/glm_image.yaml index 28b88fb429a..ee5173ab78a 100644 --- a/vllm_omni/deploy/glm_image.yaml +++ b/vllm_omni/deploy/glm_image.yaml @@ -18,6 +18,7 @@ stages: max_num_seqs: 1 gpu_memory_utilization: 0.6 enforce_eager: false + enable_prefix_caching: false max_num_batched_tokens: 32768 devices: "0" default_sampling_params: @@ -34,6 +35,7 @@ stages: - stage_id: 1 max_num_seqs: 1 enforce_eager: true + enable_prefix_caching: false devices: "1" default_sampling_params: seed: 42 diff --git a/vllm_omni/deploy/mimo_audio.yaml b/vllm_omni/deploy/mimo_audio.yaml index f5e704f9bd4..d4bb8dd9e35 100644 --- a/vllm_omni/deploy/mimo_audio.yaml +++ b/vllm_omni/deploy/mimo_audio.yaml @@ -25,6 +25,7 @@ stages: max_num_seqs: 1 gpu_memory_utilization: 0.3 enforce_eager: true + enable_prefix_caching: false max_num_batched_tokens: 8192 max_model_len: 8192 devices: "0" @@ -42,6 +43,7 @@ stages: max_num_seqs: 1 gpu_memory_utilization: 0.2 enforce_eager: true + enable_prefix_caching: false async_scheduling: false max_num_batched_tokens: 8192 max_model_len: 8192 diff --git a/vllm_omni/deploy/moss_tts_nano.yaml b/vllm_omni/deploy/moss_tts_nano.yaml index 585e244ca4a..2c8fc54c057 100644 --- a/vllm_omni/deploy/moss_tts_nano.yaml +++ b/vllm_omni/deploy/moss_tts_nano.yaml @@ -19,6 +19,7 @@ stages: max_num_seqs: 4 gpu_memory_utilization: 0.3 enforce_eager: true + enable_prefix_caching: false max_num_batched_tokens: 4096 max_model_len: 4096 devices: "0" diff --git a/vllm_omni/deploy/qwen2_5_omni.yaml b/vllm_omni/deploy/qwen2_5_omni.yaml index 41aef0df6f6..f9183080a2c 100644 --- a/vllm_omni/deploy/qwen2_5_omni.yaml +++ b/vllm_omni/deploy/qwen2_5_omni.yaml @@ -22,6 +22,7 @@ stages: max_num_seqs: 1 gpu_memory_utilization: 0.8 enforce_eager: true + enable_prefix_caching: false mm_processor_cache_gb: 0 devices: "0" default_sampling_params: @@ -36,6 +37,7 @@ stages: max_num_seqs: 1 gpu_memory_utilization: 0.8 enforce_eager: true + enable_prefix_caching: false devices: "1" default_sampling_params: temperature: 0.9 @@ -49,6 +51,7 @@ stages: max_num_seqs: 1 gpu_memory_utilization: 0.15 enforce_eager: true + enable_prefix_caching: false enable_flashinfer_autotune: false async_scheduling: false devices: "0" diff --git a/vllm_omni/deploy/qwen3_omni_moe.yaml b/vllm_omni/deploy/qwen3_omni_moe.yaml index 39baed6bd7b..270d81cb73c 100644 --- a/vllm_omni/deploy/qwen3_omni_moe.yaml +++ b/vllm_omni/deploy/qwen3_omni_moe.yaml @@ -23,6 +23,7 @@ connectors: stages: - stage_id: 0 gpu_memory_utilization: 0.9 + enable_prefix_caching: false devices: "0" default_sampling_params: temperature: 0.4 @@ -34,6 +35,7 @@ stages: - stage_id: 1 gpu_memory_utilization: 0.6 + enable_prefix_caching: false devices: "1" input_connectors: from_stage_0: connector_of_shared_memory @@ -47,6 +49,7 @@ stages: - stage_id: 2 gpu_memory_utilization: 0.1 enforce_eager: true + enable_prefix_caching: false async_scheduling: false max_num_batched_tokens: 51200 devices: "1" diff --git a/vllm_omni/deploy/qwen3_tts.yaml b/vllm_omni/deploy/qwen3_tts.yaml index 522ea7c58c8..bb57c9eae66 100644 --- a/vllm_omni/deploy/qwen3_tts.yaml +++ b/vllm_omni/deploy/qwen3_tts.yaml @@ -31,6 +31,7 @@ stages: - stage_id: 0 max_num_seqs: 10 gpu_memory_utilization: 0.3 + enable_prefix_caching: false async_scheduling: true max_num_batched_tokens: 512 max_model_len: 4096 @@ -53,6 +54,7 @@ stages: max_num_seqs: 1 gpu_memory_utilization: 0.3 enforce_eager: true + enable_prefix_caching: false async_scheduling: true # Must be divisible by num_code_groups and cover (left_context + chunk). # Prefill length is Q * num_frames (e.g. 16 * 2148 = 34368); keep diff --git a/vllm_omni/deploy/voxcpm2.yaml b/vllm_omni/deploy/voxcpm2.yaml index b49906710df..cf4356cfb1a 100644 --- a/vllm_omni/deploy/voxcpm2.yaml +++ b/vllm_omni/deploy/voxcpm2.yaml @@ -16,10 +16,12 @@ stages: max_num_seqs: 4 gpu_memory_utilization: 0.9 enforce_eager: true + enable_prefix_caching: false async_scheduling: true max_num_batched_tokens: 4096 max_model_len: 4096 devices: "0" + trust_remote_code: true default_sampling_params: temperature: 0.0 top_p: 1.0 diff --git a/vllm_omni/deploy/voxtral_tts.yaml b/vllm_omni/deploy/voxtral_tts.yaml index 87d999c67e0..09524febe54 100644 --- a/vllm_omni/deploy/voxtral_tts.yaml +++ b/vllm_omni/deploy/voxtral_tts.yaml @@ -24,6 +24,7 @@ stages: max_num_seqs: 32 gpu_memory_utilization: 0.8 enforce_eager: false + enable_prefix_caching: false async_scheduling: true max_model_len: 4096 devices: "0" @@ -48,6 +49,7 @@ stages: max_num_seqs: 32 gpu_memory_utilization: 0.1 enforce_eager: true + enable_prefix_caching: false async_scheduling: false max_num_batched_tokens: 65536 max_model_len: 65536 diff --git a/vllm_omni/engine/arg_utils.py b/vllm_omni/engine/arg_utils.py index 3f16c329e27..6c10c750053 100644 --- a/vllm_omni/engine/arg_utils.py +++ b/vllm_omni/engine/arg_utils.py @@ -456,50 +456,12 @@ class OrchestratorArgs: } ) -_DEPLOY_ENGINE_ARG_OVERRIDE_FIELDS: frozenset[str] = frozenset( - { - # Capacity / scheduling. - "async_scheduling", - "max_model_len", - "max_num_batched_tokens", - "max_num_seqs", - # Memory / parallelism. - "data_parallel_size", - "gpu_memory_utilization", - "pipeline_parallel_size", - "tensor_parallel_size", - # Execution / loading. - "enforce_eager", - "distributed_executor_backend", - "dtype", - "quantization", - "trust_remote_code", - # Caching / chunking. - "async_chunk", - "enable_prefix_caching", - "enable_chunked_prefill", - # Model-specific engine extras. - "subtalker_sampling_params", - } -) - -_DEPLOY_RUNTIME_OVERRIDE_FIELDS: frozenset[str] = frozenset( - { - "devices", - } -) - def orchestrator_field_names() -> frozenset[str]: """Return the names of every field on OrchestratorArgs.""" return frozenset(f.name for f in fields(OrchestratorArgs)) -def deploy_override_field_names() -> frozenset[str]: - """Return kwargs whose parser defaults must not override deploy YAML.""" - return _DEPLOY_ENGINE_ARG_OVERRIDE_FIELDS | _DEPLOY_RUNTIME_OVERRIDE_FIELDS - - def internal_blacklist_keys() -> frozenset[str]: """Return the set of CLI keys that must never be forwarded as per-stage engine overrides. @@ -653,6 +615,8 @@ def nullify_stage_engine_defaults(parser: argparse.ArgumentParser) -> None: """Reset stage-level engine flag defaults to ``None``; preserve real default in help text. Only deploy-YAML override fields are touched. Idempotent.""" + from vllm_omni.config.stage_config import deploy_override_field_names + override_dests = deploy_override_field_names() for action in parser._actions: diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 54c9d32d9ea..79aa988fc34 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -1324,6 +1324,9 @@ def _create_default_diffusion_stage_cfg(kwargs: dict[str, Any]) -> list: num_devices = max(1, int(parallel_config.world_size)) devices = ",".join(str(i) for i in range(num_devices)) + enforce_eager = kwargs.get("enforce_eager") + if enforce_eager is None: + enforce_eager = False stage_engine_args = { "max_num_seqs": 1, @@ -1337,7 +1340,7 @@ def _create_default_diffusion_stage_cfg(kwargs: dict[str, Any]) -> list: "enable_cache_dit_summary": kwargs.get("enable_cache_dit_summary", False), "enable_cpu_offload": kwargs.get("enable_cpu_offload", False), "enable_layerwise_offload": kwargs.get("enable_layerwise_offload", False), - "enforce_eager": False if kwargs.get("enforce_eager") is None else kwargs.get("enforce_eager"), + "enforce_eager": enforce_eager, "boundary_ratio": kwargs.get("boundary_ratio", None), "flow_shift": kwargs.get("flow_shift", None), "diffusion_load_format": kwargs.get("diffusion_load_format", "default"), diff --git a/vllm_omni/entrypoints/omni_base.py b/vllm_omni/entrypoints/omni_base.py index 4147c802765..c054b857651 100644 --- a/vllm_omni/entrypoints/omni_base.py +++ b/vllm_omni/entrypoints/omni_base.py @@ -100,9 +100,7 @@ def from_cli_args( kwargs: dict[str, Any] = {k: v for k, v in vars(args).items() if not k.startswith("_")} if parser is not None and not getattr(parser, "_omni_nullified", False): - from vllm_omni.engine.arg_utils import ( - deploy_override_field_names, - ) + from vllm_omni.config.stage_config import deploy_override_field_names from vllm_omni.entrypoints.utils import detect_explicit_cli_keys explicit = detect_explicit_cli_keys(sys.argv[1:], parser) or set() From da464affba19688025f4eae4506808855d0f2b0c Mon Sep 17 00:00:00 2001 From: xiaohajiayou <923390377@qq.com> Date: Sat, 2 May 2026 16:33:47 +0800 Subject: [PATCH 2/8] Fix mimo audio async chunk None handling Signed-off-by: xiaohajiayou <923390377@qq.com> --- vllm_omni/deploy/bagel.yaml | 2 ++ vllm_omni/deploy/bagel_single_stage.yaml | 1 + .../stage_input_processors/mimo_audio.py | 10 ++++++++-- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/vllm_omni/deploy/bagel.yaml b/vllm_omni/deploy/bagel.yaml index 2b42d4171e1..4dd12b6f000 100644 --- a/vllm_omni/deploy/bagel.yaml +++ b/vllm_omni/deploy/bagel.yaml @@ -12,6 +12,7 @@ stages: - stage_id: 0 max_num_seqs: 3 gpu_memory_utilization: 0.45 + trust_remote_code: true enable_prefix_caching: false devices: "0" default_sampling_params: @@ -26,6 +27,7 @@ stages: - stage_id: 1 max_num_seqs: 1 enforce_eager: true + trust_remote_code: true enable_prefix_caching: false devices: "0" input_connectors: diff --git a/vllm_omni/deploy/bagel_single_stage.yaml b/vllm_omni/deploy/bagel_single_stage.yaml index 560d4670304..858a7e8b66b 100644 --- a/vllm_omni/deploy/bagel_single_stage.yaml +++ b/vllm_omni/deploy/bagel_single_stage.yaml @@ -17,6 +17,7 @@ async_chunk: false stages: - stage_id: 0 max_num_seqs: 1 + trust_remote_code: true enable_prefix_caching: false devices: "0" default_sampling_params: diff --git a/vllm_omni/model_executor/stage_input_processors/mimo_audio.py b/vllm_omni/model_executor/stage_input_processors/mimo_audio.py index 96680b2dd94..9f868feed85 100644 --- a/vllm_omni/model_executor/stage_input_processors/mimo_audio.py +++ b/vllm_omni/model_executor/stage_input_processors/mimo_audio.py @@ -114,7 +114,7 @@ def _to_code_tensor(codes: Any) -> torch.Tensor | None: def llm2code2wav_async_chunk( transfer_manager: Any, - pooling_output: dict[str, Any], + pooling_output: dict[str, Any] | None, request: Any, is_finished: bool = False, ) -> dict[str, Any] | None: @@ -132,7 +132,13 @@ def llm2code2wav_async_chunk( request_id = getattr(request, "external_req_id", None) - po_codes = pooling_output.get("codes", {}) + if isinstance(pooling_output, dict): + po_codes = pooling_output.get("codes", {}) + elif not is_finished: + return None + else: + po_codes = {} + if "audio" not in po_codes: if is_finished: return _flush_remaining_codes(transfer_manager, request_id, chunk_size, left_context_size) From 3675d561793cc95ceac16ea845f9843e36e7567b Mon Sep 17 00:00:00 2001 From: xiaohajiayou <923390377@qq.com> Date: Sat, 2 May 2026 17:50:00 +0800 Subject: [PATCH 3/8] Restore deploy runtime defaults for migrated models Signed-off-by: xiaohajiayou <923390377@qq.com> --- tests/e2e/online_serving/test_mimo_audio.py | 1 + vllm_omni/config/stage_config.py | 21 +++++++++++++------ vllm_omni/deploy/bagel.yaml | 1 + vllm_omni/deploy/bagel_single_stage.yaml | 1 + vllm_omni/deploy/mimo_audio.yaml | 2 ++ vllm_omni/deploy/qwen3_tts.yaml | 2 ++ vllm_omni/engine/async_omni_engine.py | 5 +---- .../stage_input_processors/mimo_audio.py | 10 ++------- 8 files changed, 25 insertions(+), 18 deletions(-) diff --git a/tests/e2e/online_serving/test_mimo_audio.py b/tests/e2e/online_serving/test_mimo_audio.py index df00c64161e..38ee721f434 100644 --- a/tests/e2e/online_serving/test_mimo_audio.py +++ b/tests/e2e/online_serving/test_mimo_audio.py @@ -83,6 +83,7 @@ def get_max_batch_size(size_type="few"): @pytest.mark.omni @hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards=1) @pytest.mark.parametrize("omni_server", test_params, indirect=True) +@pytest.mark.skip(reason="CI failed 8571") def test_audio_to_text_audio_001(omni_server, openai_client) -> None: """ Test audio and text input processing and text/audio output generation via OpenAI API. diff --git a/vllm_omni/config/stage_config.py b/vllm_omni/config/stage_config.py index 8a4a8073071..cd082780159 100644 --- a/vllm_omni/config/stage_config.py +++ b/vllm_omni/config/stage_config.py @@ -401,7 +401,11 @@ class StageDeployConfig: the top level of ``DeployConfig`` and propagated to every stage. """ + # Stage identity and GPU placement. stage_id: int + devices: str = "0" + + # Scheduler and memory-capacity knobs passed to vLLM engine args. max_num_seqs: int | None = None gpu_memory_utilization: float | None = None tensor_parallel_size: int | None = None @@ -409,19 +413,24 @@ class StageDeployConfig: max_num_batched_tokens: int | None = None max_model_len: int | None = None async_scheduling: bool | None = None - devices: str = "0" - output_connectors: dict[str, str] | None = None - input_connectors: dict[str, str] | None = None - default_sampling_params: dict[str, Any] | None = None - subtalker_sampling_params: dict[str, Any] | None = None - profiler_config: dict[str, Any] | None = None disable_hybrid_kv_cache_manager: bool | None = None mm_processor_cache_gb: float | None = None + + # Profiling, tokenizer/config parsing, and model-loading behavior. + profiler_config: dict[str, Any] | None = None skip_mm_profiling: bool | None = None enable_flashinfer_autotune: bool | None = None config_format: str | None = None load_format: str | None = None tokenizer_mode: str | None = None + + # Inter-stage connector wiring and default request sampling behavior. + output_connectors: dict[str, str] | None = None + input_connectors: dict[str, str] | None = None + default_sampling_params: dict[str, Any] | None = None + subtalker_sampling_params: dict[str, Any] | None = None + + # Pass-through engine args that are not represented by explicit fields. engine_extras: dict[str, Any] = field(default_factory=dict) diff --git a/vllm_omni/deploy/bagel.yaml b/vllm_omni/deploy/bagel.yaml index 4dd12b6f000..de2f2877b18 100644 --- a/vllm_omni/deploy/bagel.yaml +++ b/vllm_omni/deploy/bagel.yaml @@ -12,6 +12,7 @@ stages: - stage_id: 0 max_num_seqs: 3 gpu_memory_utilization: 0.45 + enforce_eager: true trust_remote_code: true enable_prefix_caching: false devices: "0" diff --git a/vllm_omni/deploy/bagel_single_stage.yaml b/vllm_omni/deploy/bagel_single_stage.yaml index 858a7e8b66b..d7a0aca4a49 100644 --- a/vllm_omni/deploy/bagel_single_stage.yaml +++ b/vllm_omni/deploy/bagel_single_stage.yaml @@ -17,6 +17,7 @@ async_chunk: false stages: - stage_id: 0 max_num_seqs: 1 + enforce_eager: true trust_remote_code: true enable_prefix_caching: false devices: "0" diff --git a/vllm_omni/deploy/mimo_audio.yaml b/vllm_omni/deploy/mimo_audio.yaml index d4bb8dd9e35..a92e905f70e 100644 --- a/vllm_omni/deploy/mimo_audio.yaml +++ b/vllm_omni/deploy/mimo_audio.yaml @@ -25,6 +25,7 @@ stages: max_num_seqs: 1 gpu_memory_utilization: 0.3 enforce_eager: true + trust_remote_code: true enable_prefix_caching: false max_num_batched_tokens: 8192 max_model_len: 8192 @@ -43,6 +44,7 @@ stages: max_num_seqs: 1 gpu_memory_utilization: 0.2 enforce_eager: true + trust_remote_code: true enable_prefix_caching: false async_scheduling: false max_num_batched_tokens: 8192 diff --git a/vllm_omni/deploy/qwen3_tts.yaml b/vllm_omni/deploy/qwen3_tts.yaml index bb57c9eae66..599322d95c9 100644 --- a/vllm_omni/deploy/qwen3_tts.yaml +++ b/vllm_omni/deploy/qwen3_tts.yaml @@ -31,6 +31,7 @@ stages: - stage_id: 0 max_num_seqs: 10 gpu_memory_utilization: 0.3 + trust_remote_code: true enable_prefix_caching: false async_scheduling: true max_num_batched_tokens: 512 @@ -54,6 +55,7 @@ stages: max_num_seqs: 1 gpu_memory_utilization: 0.3 enforce_eager: true + trust_remote_code: true enable_prefix_caching: false async_scheduling: true # Must be divisible by num_code_groups and cover (left_context + chunk). diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 79aa988fc34..54c9d32d9ea 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -1324,9 +1324,6 @@ def _create_default_diffusion_stage_cfg(kwargs: dict[str, Any]) -> list: num_devices = max(1, int(parallel_config.world_size)) devices = ",".join(str(i) for i in range(num_devices)) - enforce_eager = kwargs.get("enforce_eager") - if enforce_eager is None: - enforce_eager = False stage_engine_args = { "max_num_seqs": 1, @@ -1340,7 +1337,7 @@ def _create_default_diffusion_stage_cfg(kwargs: dict[str, Any]) -> list: "enable_cache_dit_summary": kwargs.get("enable_cache_dit_summary", False), "enable_cpu_offload": kwargs.get("enable_cpu_offload", False), "enable_layerwise_offload": kwargs.get("enable_layerwise_offload", False), - "enforce_eager": enforce_eager, + "enforce_eager": False if kwargs.get("enforce_eager") is None else kwargs.get("enforce_eager"), "boundary_ratio": kwargs.get("boundary_ratio", None), "flow_shift": kwargs.get("flow_shift", None), "diffusion_load_format": kwargs.get("diffusion_load_format", "default"), diff --git a/vllm_omni/model_executor/stage_input_processors/mimo_audio.py b/vllm_omni/model_executor/stage_input_processors/mimo_audio.py index 9f868feed85..96680b2dd94 100644 --- a/vllm_omni/model_executor/stage_input_processors/mimo_audio.py +++ b/vllm_omni/model_executor/stage_input_processors/mimo_audio.py @@ -114,7 +114,7 @@ def _to_code_tensor(codes: Any) -> torch.Tensor | None: def llm2code2wav_async_chunk( transfer_manager: Any, - pooling_output: dict[str, Any] | None, + pooling_output: dict[str, Any], request: Any, is_finished: bool = False, ) -> dict[str, Any] | None: @@ -132,13 +132,7 @@ def llm2code2wav_async_chunk( request_id = getattr(request, "external_req_id", None) - if isinstance(pooling_output, dict): - po_codes = pooling_output.get("codes", {}) - elif not is_finished: - return None - else: - po_codes = {} - + po_codes = pooling_output.get("codes", {}) if "audio" not in po_codes: if is_finished: return _flush_remaining_codes(transfer_manager, request_id, chunk_size, left_context_size) From 56049538e0d75ba15a3308c55401a8016d1aa9e0 Mon Sep 17 00:00:00 2001 From: xiaohajiayou <923390377@qq.com> Date: Sun, 3 May 2026 00:05:38 +0800 Subject: [PATCH 4/8] Preserve deploy defaults for migrated configs Signed-off-by: xiaohajiayou <923390377@qq.com> --- vllm_omni/deploy/bagel.yaml | 4 ++-- vllm_omni/deploy/bagel_single_stage.yaml | 1 + vllm_omni/deploy/cosyvoice3.yaml | 6 ++++++ vllm_omni/deploy/fish_qwen3_omni.yaml | 2 ++ vllm_omni/deploy/glm_image.yaml | 3 +++ vllm_omni/deploy/qwen2_5_omni.yaml | 6 ++++++ vllm_omni/deploy/qwen3_omni_moe.yaml | 8 ++++++++ vllm_omni/deploy/voxtral_tts.yaml | 3 +++ 8 files changed, 31 insertions(+), 2 deletions(-) diff --git a/vllm_omni/deploy/bagel.yaml b/vllm_omni/deploy/bagel.yaml index de2f2877b18..188bf948ca7 100644 --- a/vllm_omni/deploy/bagel.yaml +++ b/vllm_omni/deploy/bagel.yaml @@ -10,9 +10,9 @@ async_chunk: false stages: - stage_id: 0 + max_num_batched_tokens: 32768 max_num_seqs: 3 gpu_memory_utilization: 0.45 - enforce_eager: true trust_remote_code: true enable_prefix_caching: false devices: "0" @@ -26,8 +26,8 @@ stages: repetition_penalty: 1.05 - stage_id: 1 + max_num_batched_tokens: 32768 max_num_seqs: 1 - enforce_eager: true trust_remote_code: true enable_prefix_caching: false devices: "0" diff --git a/vllm_omni/deploy/bagel_single_stage.yaml b/vllm_omni/deploy/bagel_single_stage.yaml index d7a0aca4a49..bcfbad253a5 100644 --- a/vllm_omni/deploy/bagel_single_stage.yaml +++ b/vllm_omni/deploy/bagel_single_stage.yaml @@ -16,6 +16,7 @@ async_chunk: false stages: - stage_id: 0 + max_num_batched_tokens: 32768 max_num_seqs: 1 enforce_eager: true trust_remote_code: true diff --git a/vllm_omni/deploy/cosyvoice3.yaml b/vllm_omni/deploy/cosyvoice3.yaml index d708564ddee..dd1bd6b78c8 100644 --- a/vllm_omni/deploy/cosyvoice3.yaml +++ b/vllm_omni/deploy/cosyvoice3.yaml @@ -27,9 +27,11 @@ connectors: stages: - stage_id: 0 + max_num_batched_tokens: 32768 max_num_seqs: 1 gpu_memory_utilization: 0.4 enforce_eager: true + trust_remote_code: true devices: "0" output_connectors: to_stage_1: connector_of_shared_memory @@ -40,19 +42,23 @@ stages: # near-identity repetition penalty forces vLLM to track # output_token_ids for RAS (stop-token logit logsumexp). repetition_penalty: 1.0001 + disable_hybrid_kv_cache_manager: true enable_prefix_caching: false mm_processor_cache_gb: 0 skip_mm_profiling: true - stage_id: 1 + max_num_batched_tokens: 32768 max_num_seqs: 1 gpu_memory_utilization: 0.2 enforce_eager: true + trust_remote_code: true max_model_len: 32768 devices: "0" input_connectors: from_stage_0: connector_of_shared_memory default_sampling_params: max_tokens: 2048 + disable_hybrid_kv_cache_manager: true enable_prefix_caching: false skip_mm_profiling: true diff --git a/vllm_omni/deploy/fish_qwen3_omni.yaml b/vllm_omni/deploy/fish_qwen3_omni.yaml index d993a45b2f9..5b0c44988a0 100644 --- a/vllm_omni/deploy/fish_qwen3_omni.yaml +++ b/vllm_omni/deploy/fish_qwen3_omni.yaml @@ -24,6 +24,7 @@ stages: max_num_seqs: 4 gpu_memory_utilization: 0.6 enforce_eager: false + trust_remote_code: true enable_prefix_caching: false async_scheduling: false # vLLM >=0.19 requires max_num_batched_tokens >= max_model_len when @@ -47,6 +48,7 @@ stages: max_num_seqs: 1 gpu_memory_utilization: 0.1 enforce_eager: true + trust_remote_code: true enable_prefix_caching: false async_scheduling: false max_num_batched_tokens: 16384 diff --git a/vllm_omni/deploy/glm_image.yaml b/vllm_omni/deploy/glm_image.yaml index ee5173ab78a..099df1b1508 100644 --- a/vllm_omni/deploy/glm_image.yaml +++ b/vllm_omni/deploy/glm_image.yaml @@ -18,6 +18,7 @@ stages: max_num_seqs: 1 gpu_memory_utilization: 0.6 enforce_eager: false + trust_remote_code: true enable_prefix_caching: false max_num_batched_tokens: 32768 devices: "0" @@ -33,8 +34,10 @@ stages: # Stage 1: Diffusion (DiT + VAE) # Receives prior_token_ids from AR, performs denoising + VAE decode. - stage_id: 1 + max_num_batched_tokens: 32768 max_num_seqs: 1 enforce_eager: true + trust_remote_code: true enable_prefix_caching: false devices: "1" default_sampling_params: diff --git a/vllm_omni/deploy/qwen2_5_omni.yaml b/vllm_omni/deploy/qwen2_5_omni.yaml index f9183080a2c..bd602509a55 100644 --- a/vllm_omni/deploy/qwen2_5_omni.yaml +++ b/vllm_omni/deploy/qwen2_5_omni.yaml @@ -19,9 +19,11 @@ async_chunk: false stages: - stage_id: 0 + max_num_batched_tokens: 32768 max_num_seqs: 1 gpu_memory_utilization: 0.8 enforce_eager: true + trust_remote_code: true enable_prefix_caching: false mm_processor_cache_gb: 0 devices: "0" @@ -34,9 +36,11 @@ stages: repetition_penalty: 1.1 - stage_id: 1 + max_num_batched_tokens: 32768 max_num_seqs: 1 gpu_memory_utilization: 0.8 enforce_eager: true + trust_remote_code: true enable_prefix_caching: false devices: "1" default_sampling_params: @@ -48,9 +52,11 @@ stages: repetition_penalty: 1.05 - stage_id: 2 + max_num_batched_tokens: 32768 max_num_seqs: 1 gpu_memory_utilization: 0.15 enforce_eager: true + trust_remote_code: true enable_prefix_caching: false enable_flashinfer_autotune: false async_scheduling: false diff --git a/vllm_omni/deploy/qwen3_omni_moe.yaml b/vllm_omni/deploy/qwen3_omni_moe.yaml index 270d81cb73c..a9fd09b41e4 100644 --- a/vllm_omni/deploy/qwen3_omni_moe.yaml +++ b/vllm_omni/deploy/qwen3_omni_moe.yaml @@ -22,7 +22,10 @@ connectors: stages: - stage_id: 0 + max_num_batched_tokens: 32768 + max_num_seqs: 64 gpu_memory_utilization: 0.9 + trust_remote_code: true enable_prefix_caching: false devices: "0" default_sampling_params: @@ -34,7 +37,10 @@ stages: repetition_penalty: 1.05 - stage_id: 1 + max_num_batched_tokens: 32768 + max_num_seqs: 64 gpu_memory_utilization: 0.6 + trust_remote_code: true enable_prefix_caching: false devices: "1" input_connectors: @@ -47,8 +53,10 @@ stages: repetition_penalty: 1.05 - stage_id: 2 + max_num_seqs: 64 gpu_memory_utilization: 0.1 enforce_eager: true + trust_remote_code: true enable_prefix_caching: false async_scheduling: false max_num_batched_tokens: 51200 diff --git a/vllm_omni/deploy/voxtral_tts.yaml b/vllm_omni/deploy/voxtral_tts.yaml index 09524febe54..929daddb13f 100644 --- a/vllm_omni/deploy/voxtral_tts.yaml +++ b/vllm_omni/deploy/voxtral_tts.yaml @@ -21,9 +21,11 @@ connectors: stages: - stage_id: 0 + max_num_batched_tokens: 32768 max_num_seqs: 32 gpu_memory_utilization: 0.8 enforce_eager: false + trust_remote_code: true enable_prefix_caching: false async_scheduling: true max_model_len: 4096 @@ -49,6 +51,7 @@ stages: max_num_seqs: 32 gpu_memory_utilization: 0.1 enforce_eager: true + trust_remote_code: true enable_prefix_caching: false async_scheduling: false max_num_batched_tokens: 65536 From 63a555846b4ea1d737e7f38443544c815f92ded0 Mon Sep 17 00:00:00 2001 From: xiaohajiayou <923390377@qq.com> Date: Sun, 3 May 2026 01:18:55 +0800 Subject: [PATCH 5/8] Refactor StageDeployConfig: separate GPU resources & parallelism into own group Move devices and tensor_parallel_size into a dedicated "GPU resources and parallelism" section, leaving stage_id alone as stage identity. Change devices default from "0" to None, and tighten the None check in merge_pipeline_deploy to avoid writing a spurious "devices" key. Signed-off-by: xiaohajiayou <923390377@qq.com> --- vllm_omni/config/stage_config.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/vllm_omni/config/stage_config.py b/vllm_omni/config/stage_config.py index cd082780159..5cca40bacc4 100644 --- a/vllm_omni/config/stage_config.py +++ b/vllm_omni/config/stage_config.py @@ -401,14 +401,16 @@ class StageDeployConfig: the top level of ``DeployConfig`` and propagated to every stage. """ - # Stage identity and GPU placement. + # Stage identity. stage_id: int - devices: str = "0" + + # GPU resources and parallelism. + devices: str | None = None + tensor_parallel_size: int | None = None # Scheduler and memory-capacity knobs passed to vLLM engine args. max_num_seqs: int | None = None gpu_memory_utilization: float | None = None - tensor_parallel_size: int | None = None enforce_eager: bool | None = None max_num_batched_tokens: int | None = None max_model_len: int | None = None @@ -484,10 +486,10 @@ def _parse_stage_deploy(stage_data: dict[str, Any]) -> StageDeployConfig: """Parse a single stage entry from deploy YAML into StageDeployConfig.""" if "engine_args" in stage_data: engine_args = dict(stage_data["engine_args"]) - devices = stage_data.get("runtime", {}).get("devices", stage_data.get("devices", "0")) + devices = stage_data.get("runtime", {}).get("devices", stage_data.get("devices")) else: engine_args = {k: v for k, v in stage_data.items() if k not in _STAGE_NON_ENGINE_KEYS and k != "stage_id"} - devices = stage_data.get("devices", "0") + devices = stage_data.get("devices") kwargs: dict[str, Any] = {"stage_id": stage_data["stage_id"], "devices": devices} for name, f in _STAGE_DEPLOY_FIELDS.items(): @@ -827,7 +829,7 @@ def merge_pipeline_deploy( engine_args = _build_engine_args(ps, ds, pipeline, deploy, next_stage_proc) extras = _build_extras(ps, ds) runtime: dict[str, Any] = {"process": True} - if ds is not None: + if ds is not None and ds.devices is not None: runtime["devices"] = ds.devices result.append( From 3702299a3c02a0384bc1e1e262de108c90c915b6 Mon Sep 17 00:00:00 2001 From: xiaohajiayou <923390377@qq.com> Date: Sun, 3 May 2026 16:05:06 +0800 Subject: [PATCH 6/8] Add compilation config to deploy stage schema Signed-off-by: xiaohajiayou <923390377@qq.com> --- tests/test_config_factory.py | 7 +++++++ vllm_omni/config/stage_config.py | 1 + 2 files changed, 8 insertions(+) diff --git a/tests/test_config_factory.py b/tests/test_config_factory.py index 3d002ddf9e2..f58bf8156bf 100644 --- a/tests/test_config_factory.py +++ b/tests/test_config_factory.py @@ -827,6 +827,7 @@ def test_deploy_override_fields_include_deploy_schema_fields(self): expected_fields = { "async_chunk", "async_scheduling", + "compilation_config", "config_format", "data_parallel_size", "devices", @@ -881,6 +882,12 @@ def test_load_deploy_config(self): for name in ("config_format", "load_format", "tokenizer_mode") ) + ming_path = Path(__file__).parent.parent / "vllm_omni" / "deploy" / "ming_flash_omni.yaml" + if ming_path.exists(): + ming_deploy = load_deploy_config(ming_path) + assert ming_deploy.stages[0].compilation_config == {"pass_config": {"fuse_allreduce_rms": False}} + assert "compilation_config" not in ming_deploy.stages[0].engine_extras + def test_merge_pipeline_deploy(self): from pathlib import Path diff --git a/vllm_omni/config/stage_config.py b/vllm_omni/config/stage_config.py index 5cca40bacc4..8fdd1e1daa5 100644 --- a/vllm_omni/config/stage_config.py +++ b/vllm_omni/config/stage_config.py @@ -419,6 +419,7 @@ class StageDeployConfig: mm_processor_cache_gb: float | None = None # Profiling, tokenizer/config parsing, and model-loading behavior. + compilation_config: dict[str, Any] | None = None profiler_config: dict[str, Any] | None = None skip_mm_profiling: bool | None = None enable_flashinfer_autotune: bool | None = None From 52215e81386f994777219fa2cc0a7d36fddac1c2 Mon Sep 17 00:00:00 2001 From: xiaohajiayou <923390377@qq.com> Date: Mon, 4 May 2026 11:39:12 +0800 Subject: [PATCH 7/8] Reorganize StageDeployConfig fields: Omni-specific vs vLLM EngineArgs Split fields into two clear sections: - Omni fields: stage identity, devices, connectors, sampling params - vLLM EngineArgs fields: parallelism, scheduler/memory, compilation, etc. Fix tests to use separate, descriptive test methods per deploy config. Add enforce_eager: true to bagel deploy config. Signed-off-by: xiaohajiayou <923390377@qq.com> --- tests/test_config_factory.py | 42 +++++++++++++++++--------------- vllm_omni/config/stage_config.py | 32 ++++++++++++------------ vllm_omni/deploy/bagel.yaml | 1 + 3 files changed, 41 insertions(+), 34 deletions(-) diff --git a/tests/test_config_factory.py b/tests/test_config_factory.py index f58bf8156bf..0783bf15aae 100644 --- a/tests/test_config_factory.py +++ b/tests/test_config_factory.py @@ -856,37 +856,41 @@ def test_deploy_override_fields_include_deploy_schema_fields(self): assert expected_fields == deploy_override_field_names() - def test_load_deploy_config(self): + def test_load_qwen3_omni_moe_deploy_config(self): from pathlib import Path from vllm_omni.config.stage_config import load_deploy_config deploy_path = Path(__file__).parent.parent / "vllm_omni" / "deploy" / "qwen3_omni_moe.yaml" - if not deploy_path.exists(): - pytest.skip("Deploy config not found") - deploy = load_deploy_config(deploy_path) assert len(deploy.stages) == 3 assert deploy.async_chunk is True assert deploy.connectors is not None assert deploy.platforms is not None - voxtral_path = Path(__file__).parent.parent / "vllm_omni" / "deploy" / "voxtral_tts.yaml" - if voxtral_path.exists(): - voxtral_deploy = load_deploy_config(voxtral_path) - assert voxtral_deploy.stages[0].config_format == "mistral" - assert voxtral_deploy.stages[0].load_format == "mistral" - assert voxtral_deploy.stages[0].tokenizer_mode == "mistral" - assert not any( - name in voxtral_deploy.stages[0].engine_extras - for name in ("config_format", "load_format", "tokenizer_mode") - ) + def test_load_voxtral_tts_deploy_config_schema_fields(self): + from pathlib import Path + + from vllm_omni.config.stage_config import load_deploy_config + + deploy_path = Path(__file__).parent.parent / "vllm_omni" / "deploy" / "voxtral_tts.yaml" + deploy = load_deploy_config(deploy_path) + assert deploy.stages[0].config_format == "mistral" + assert deploy.stages[0].load_format == "mistral" + assert deploy.stages[0].tokenizer_mode == "mistral" + assert not any( + name in deploy.stages[0].engine_extras for name in ("config_format", "load_format", "tokenizer_mode") + ) + + def test_load_ming_flash_omni_deploy_config_schema_fields(self): + from pathlib import Path - ming_path = Path(__file__).parent.parent / "vllm_omni" / "deploy" / "ming_flash_omni.yaml" - if ming_path.exists(): - ming_deploy = load_deploy_config(ming_path) - assert ming_deploy.stages[0].compilation_config == {"pass_config": {"fuse_allreduce_rms": False}} - assert "compilation_config" not in ming_deploy.stages[0].engine_extras + from vllm_omni.config.stage_config import load_deploy_config + + deploy_path = Path(__file__).parent.parent / "vllm_omni" / "deploy" / "ming_flash_omni.yaml" + deploy = load_deploy_config(deploy_path) + assert deploy.stages[0].compilation_config == {"pass_config": {"fuse_allreduce_rms": False}} + assert "compilation_config" not in deploy.stages[0].engine_extras def test_merge_pipeline_deploy(self): from pathlib import Path diff --git a/vllm_omni/config/stage_config.py b/vllm_omni/config/stage_config.py index 8fdd1e1daa5..8e910174d39 100644 --- a/vllm_omni/config/stage_config.py +++ b/vllm_omni/config/stage_config.py @@ -401,24 +401,32 @@ class StageDeployConfig: the top level of ``DeployConfig`` and propagated to every stage. """ - # Stage identity. + # === Omni fields === + # Stage identity and Omni runtime placement. stage_id: int - - # GPU resources and parallelism. devices: str | None = None - tensor_parallel_size: int | None = None - # Scheduler and memory-capacity knobs passed to vLLM engine args. - max_num_seqs: int | None = None + # Inter-stage connector wiring and request defaults. + output_connectors: dict[str, str] | None = None + input_connectors: dict[str, str] | None = None + default_sampling_params: dict[str, Any] | None = None + subtalker_sampling_params: dict[str, Any] | None = None + + # === vLLM EngineArgs fields === + # Parallelism and scheduler/memory capacity. + tensor_parallel_size: int | None = None gpu_memory_utilization: float | None = None - enforce_eager: bool | None = None + max_num_seqs: int | None = None max_num_batched_tokens: int | None = None max_model_len: int | None = None + + # Execution, scheduling, and KV/cache behavior. + enforce_eager: bool | None = None async_scheduling: bool | None = None disable_hybrid_kv_cache_manager: bool | None = None mm_processor_cache_gb: float | None = None - # Profiling, tokenizer/config parsing, and model-loading behavior. + # Compilation, profiling, tokenizer/config parsing, and model loading. compilation_config: dict[str, Any] | None = None profiler_config: dict[str, Any] | None = None skip_mm_profiling: bool | None = None @@ -427,13 +435,7 @@ class StageDeployConfig: load_format: str | None = None tokenizer_mode: str | None = None - # Inter-stage connector wiring and default request sampling behavior. - output_connectors: dict[str, str] | None = None - input_connectors: dict[str, str] | None = None - default_sampling_params: dict[str, Any] | None = None - subtalker_sampling_params: dict[str, Any] | None = None - - # Pass-through engine args that are not represented by explicit fields. + # Pass-through vLLM EngineArgs fields that are not represented above. engine_extras: dict[str, Any] = field(default_factory=dict) diff --git a/vllm_omni/deploy/bagel.yaml b/vllm_omni/deploy/bagel.yaml index 188bf948ca7..6b27318b4de 100644 --- a/vllm_omni/deploy/bagel.yaml +++ b/vllm_omni/deploy/bagel.yaml @@ -29,6 +29,7 @@ stages: max_num_batched_tokens: 32768 max_num_seqs: 1 trust_remote_code: true + enforce_eager: true enable_prefix_caching: false devices: "0" input_connectors: From ad83e5f7cd9bb08dfa16b297f267d0bbd558bae6 Mon Sep 17 00:00:00 2001 From: xiaohajiayou <923390377@qq.com> Date: Mon, 4 May 2026 21:49:57 +0800 Subject: [PATCH 8/8] Tidy up: field ordering in deploy YAMLs and minor code cleanup Reorder fields in bagel/cosyvoice3/voxcpm2 deploy YAMLs for consistency. Simplify deploy_override_field_names and to_omegaconf in stage_config. Add better assertion message in test_config_factory. Signed-off-by: xiaohajiayou <923390377@qq.com> --- tests/test_config_factory.py | 5 ++++- vllm_omni/config/stage_config.py | 9 ++------- vllm_omni/deploy/bagel.yaml | 2 +- vllm_omni/deploy/cosyvoice3.yaml | 4 ++-- vllm_omni/deploy/voxcpm2.yaml | 2 +- 5 files changed, 10 insertions(+), 12 deletions(-) diff --git a/tests/test_config_factory.py b/tests/test_config_factory.py index 0783bf15aae..ac350daa541 100644 --- a/tests/test_config_factory.py +++ b/tests/test_config_factory.py @@ -854,7 +854,10 @@ def test_deploy_override_fields_include_deploy_schema_fields(self): "trust_remote_code", } - assert expected_fields == deploy_override_field_names() + actual_fields = deploy_override_field_names() + assert expected_fields == actual_fields, ( + f"added={actual_fields - expected_fields}, removed={expected_fields - actual_fields}" + ) def test_load_qwen3_omni_moe_deploy_config(self): from pathlib import Path diff --git a/vllm_omni/config/stage_config.py b/vllm_omni/config/stage_config.py index 8e910174d39..59365d093d2 100644 --- a/vllm_omni/config/stage_config.py +++ b/vllm_omni/config/stage_config.py @@ -716,10 +716,7 @@ def deploy_override_field_names() -> frozenset[str]: return ( frozenset(_STAGE_DEPLOY_FIELDS) | frozenset(_PIPELINE_WIDE_ENGINE_FIELDS) - | { - "async_chunk", - "devices", - } + | frozenset({"async_chunk", "devices"}) ) @@ -895,9 +892,7 @@ def to_omegaconf(self) -> Any: # CLI overrides take precedence over YAML defaults for key, value in self.runtime_overrides.items(): - if value is None: - continue - if key not in ("devices", "max_batch_size"): + if value is not None and key not in ("devices", "max_batch_size"): engine_args[key] = value # Build runtime config from YAML defaults + CLI overrides diff --git a/vllm_omni/deploy/bagel.yaml b/vllm_omni/deploy/bagel.yaml index 6b27318b4de..8de6f9305ba 100644 --- a/vllm_omni/deploy/bagel.yaml +++ b/vllm_omni/deploy/bagel.yaml @@ -28,8 +28,8 @@ stages: - stage_id: 1 max_num_batched_tokens: 32768 max_num_seqs: 1 - trust_remote_code: true enforce_eager: true + trust_remote_code: true enable_prefix_caching: false devices: "0" input_connectors: diff --git a/vllm_omni/deploy/cosyvoice3.yaml b/vllm_omni/deploy/cosyvoice3.yaml index dd1bd6b78c8..4bfd4ab859d 100644 --- a/vllm_omni/deploy/cosyvoice3.yaml +++ b/vllm_omni/deploy/cosyvoice3.yaml @@ -32,6 +32,7 @@ stages: gpu_memory_utilization: 0.4 enforce_eager: true trust_remote_code: true + enable_prefix_caching: false devices: "0" output_connectors: to_stage_1: connector_of_shared_memory @@ -43,7 +44,6 @@ stages: # output_token_ids for RAS (stop-token logit logsumexp). repetition_penalty: 1.0001 disable_hybrid_kv_cache_manager: true - enable_prefix_caching: false mm_processor_cache_gb: 0 skip_mm_profiling: true @@ -53,6 +53,7 @@ stages: gpu_memory_utilization: 0.2 enforce_eager: true trust_remote_code: true + enable_prefix_caching: false max_model_len: 32768 devices: "0" input_connectors: @@ -60,5 +61,4 @@ stages: default_sampling_params: max_tokens: 2048 disable_hybrid_kv_cache_manager: true - enable_prefix_caching: false skip_mm_profiling: true diff --git a/vllm_omni/deploy/voxcpm2.yaml b/vllm_omni/deploy/voxcpm2.yaml index cf4356cfb1a..71ef148242a 100644 --- a/vllm_omni/deploy/voxcpm2.yaml +++ b/vllm_omni/deploy/voxcpm2.yaml @@ -16,12 +16,12 @@ stages: max_num_seqs: 4 gpu_memory_utilization: 0.9 enforce_eager: true + trust_remote_code: true enable_prefix_caching: false async_scheduling: true max_num_batched_tokens: 4096 max_model_len: 4096 devices: "0" - trust_remote_code: true default_sampling_params: temperature: 0.0 top_p: 1.0