Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
6478b2c
Reapply deploy override field derivation
xiaohajiayou May 1, 2026
cf65678
Merge branch 'main' into whitelist-optimization-v2
xiaohajiayou May 2, 2026
da464af
Fix mimo audio async chunk None handling
xiaohajiayou May 2, 2026
a9a09c1
Merge branch 'main' into whitelist-optimization-v2
xiaohajiayou May 2, 2026
3675d56
Restore deploy runtime defaults for migrated models
xiaohajiayou May 2, 2026
072b5b7
Merge branch 'main' into whitelist-optimization-v2
xiaohajiayou May 2, 2026
5604953
Preserve deploy defaults for migrated configs
xiaohajiayou May 2, 2026
63a5558
Refactor StageDeployConfig: separate GPU resources & parallelism into…
xiaohajiayou May 2, 2026
8d39bd7
Merge branch 'main' into whitelist-optimization-v2
xiaohajiayou May 3, 2026
33736c6
Merge branch 'main' into whitelist-optimization-v2
hsliuustc0106 May 3, 2026
3702299
Add compilation config to deploy stage schema
xiaohajiayou May 3, 2026
fe78352
Merge branch 'main' into whitelist-optimization-v2
xiaohajiayou May 3, 2026
2f3f3fb
Merge branch 'main' into whitelist-optimization-v2
xiaohajiayou May 3, 2026
52215e8
Reorganize StageDeployConfig fields: Omni-specific vs vLLM EngineArgs
xiaohajiayou May 4, 2026
cb9907e
Merge branch 'main' into whitelist-optimization-v2
xiaohajiayou May 4, 2026
f846d1e
Merge branch 'main' into whitelist-optimization-v2
xiaohajiayou May 4, 2026
9d9b720
Merge branch 'main' into whitelist-optimization-v2
hsliuustc0106 May 4, 2026
ad83e5f
Tidy up: field ordering in deploy YAMLs and minor code cleanup
xiaohajiayou May 4, 2026
0b65ea5
Merge branch 'main' into whitelist-optimization-v2
lishunyang12 May 4, 2026
f065b2e
Merge branch 'main' into whitelist-optimization-v2
lishunyang12 May 4, 2026
6ebf801
Merge branch 'main' into whitelist-optimization-v2
lishunyang12 May 4, 2026
7894e05
Merge branch 'main' into whitelist-optimization-v2
xiaohajiayou May 4, 2026
beaf1c9
Merge branch 'main' into whitelist-optimization-v2
xiaohajiayou May 5, 2026
065034f
Merge branch 'main' into whitelist-optimization-v2
hsliuustc0106 May 5, 2026
b169da6
Merge branch 'main' into whitelist-optimization-v2
xiaohajiayou May 5, 2026
88daf61
Merge branch 'main' into whitelist-optimization-v2
lishunyang12 May 5, 2026
3a41163
Merge branch 'main' into whitelist-optimization-v2
linyueqian May 6, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions tests/entrypoints/test_async_omni_diffusion_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import pytest
from vllm.utils.argparse_utils import FlexibleArgumentParser

from vllm_omni.config.stage_config import deploy_override_field_names
from vllm_omni.engine.async_omni_engine import AsyncOmniEngine
from vllm_omni.entrypoints.cli.serve import OmniServeCommand, _create_default_diffusion_stage_cfg

Expand All @@ -30,6 +31,15 @@ def test_default_stage_config_includes_cache_backend():
assert engine_args["model_stage"] == "diffusion"


def test_default_stage_config_ignores_none_deploy_overrides():
    """Default diffusion stage config must be unaffected by all-None deploy overrides."""
    # Map every deploy override name to None, i.e. "nothing explicitly set".
    all_none = dict.fromkeys(deploy_override_field_names())

    expected = AsyncOmniEngine._create_default_diffusion_stage_cfg({})[0]
    actual = AsyncOmniEngine._create_default_diffusion_stage_cfg(all_none)[0]

    assert actual == expected


def test_default_cache_config_used_when_missing():
"""Ensure default cache_config is synthesized when only backend is given."""
stage_cfg = AsyncOmniEngine._create_default_diffusion_stage_cfg(
Expand Down
1 change: 1 addition & 0 deletions tests/helpers/stage_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -516,6 +516,7 @@ def delete_by_path(config_dict: dict, path: str) -> None:
"max_num_seqs": 1,
"gpu_memory_utilization": 0.9,
"enforce_eager": True,
"enable_prefix_caching": False,
"max_num_batched_tokens": 16384,
"max_model_len": 16384,
"skip_mm_profiling": True,
Expand Down
2 changes: 1 addition & 1 deletion tests/test_arg_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -369,8 +369,8 @@ def _build_full_serve_parser():
def test_nullify_stage_engine_defaults_resets_inherited_defaults():
import argparse

from vllm_omni.config.stage_config import deploy_override_field_names
from vllm_omni.engine.arg_utils import (
deploy_override_field_names,
nullify_stage_engine_defaults,
)

Expand Down
91 changes: 85 additions & 6 deletions tests/test_config_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ def test_to_omegaconf_basic(self):
assert omega_config.engine_args.worker_type == "ar"
assert omega_config.final_output is True
assert omega_config.final_output_type == "text"
assert "max_num_seqs" not in omega_config.engine_args
# Legacy field name for backward compatibility
assert omega_config.engine_input_source == []

Expand Down Expand Up @@ -146,6 +147,24 @@ def test_to_omegaconf_max_num_seqs_in_engine_args(self):
omega_config = config.to_omegaconf()
assert omega_config.engine_args.max_num_seqs == 32

def test_to_omegaconf_omits_none_deploy_overrides_for_engine_args(self):
    """Overrides whose value is None must not show up in the generated engine_args."""
    from vllm_omni.config.stage_config import deploy_override_field_names

    field_names = deploy_override_field_names()
    stage = StageConfig(
        stage_id=0,
        model_stage="thinker",
        runtime_overrides=dict.fromkeys(field_names),
    )

    engine_args = dict(stage.to_omegaconf().engine_args)

    # Runtime-only keys are checked explicitly, then every remaining override name.
    assert "devices" not in engine_args
    assert "max_batch_size" not in engine_args
    for field_name in field_names - {"devices"}:
        assert field_name not in engine_args


class TestModelPipeline:
"""Tests for ModelPipeline class."""
Expand Down Expand Up @@ -806,21 +825,80 @@ def test_register_and_lookup(self):


class TestDeployConfigLoading:
def test_load_deploy_config(self):
def test_deploy_override_fields_include_deploy_schema_fields(self):
from vllm_omni.config.stage_config import deploy_override_field_names

expected_fields = {
"async_chunk",
"async_scheduling",
"compilation_config",
"config_format",
"data_parallel_size",
"devices",
"disable_hybrid_kv_cache_manager",
"distributed_executor_backend",
"dtype",
"enable_chunked_prefill",
"enable_flashinfer_autotune",
"enable_prefix_caching",
"enforce_eager",
"gpu_memory_utilization",
Comment thread
xiaohajiayou marked this conversation as resolved.
"load_format",
"max_model_len",
"max_num_batched_tokens",
"max_num_seqs",
"mm_processor_cache_gb",
"pipeline_parallel_size",
"profiler_config",
"quantization",
"skip_mm_profiling",
"subtalker_sampling_params",
"tensor_parallel_size",
"tokenizer_mode",
"trust_remote_code",
}

actual_fields = deploy_override_field_names()
assert expected_fields == actual_fields, (
f"added={actual_fields - expected_fields}, removed={expected_fields - actual_fields}"
)

def test_load_qwen3_omni_moe_deploy_config(self):
from pathlib import Path

from vllm_omni.config.stage_config import load_deploy_config

deploy_path = Path(__file__).parent.parent / "vllm_omni" / "deploy" / "qwen3_omni_moe.yaml"
if not deploy_path.exists():
pytest.skip("Deploy config not found")

deploy = load_deploy_config(deploy_path)
assert len(deploy.stages) == 3
assert deploy.async_chunk is True
assert deploy.connectors is not None
assert deploy.platforms is not None

def test_load_voxtral_tts_deploy_config_schema_fields(self):
    """The voxtral_tts deploy YAML exposes its mistral settings as typed stage fields."""
    from pathlib import Path

    from vllm_omni.config.stage_config import load_deploy_config

    repo_root = Path(__file__).parent.parent
    deploy = load_deploy_config(repo_root / "vllm_omni" / "deploy" / "voxtral_tts.yaml")
    first_stage = deploy.stages[0]

    assert first_stage.config_format == "mistral"
    assert first_stage.load_format == "mistral"
    assert first_stage.tokenizer_mode == "mistral"
    # None of the typed fields may also be present in the pass-through extras.
    leaked = [
        name
        for name in ("config_format", "load_format", "tokenizer_mode")
        if name in first_stage.engine_extras
    ]
    assert not leaked

def test_load_ming_flash_omni_deploy_config_schema_fields(self):
    """The ming_flash_omni deploy YAML exposes compilation_config as a typed stage field."""
    from pathlib import Path

    from vllm_omni.config.stage_config import load_deploy_config

    repo_root = Path(__file__).parent.parent
    deploy = load_deploy_config(repo_root / "vllm_omni" / "deploy" / "ming_flash_omni.yaml")
    first_stage = deploy.stages[0]

    expected_compilation = {"pass_config": {"fuse_allreduce_rms": False}}
    assert first_stage.compilation_config == expected_compilation
    # The typed field must not also be present in the pass-through extras.
    assert "compilation_config" not in first_stage.engine_extras

def test_merge_pipeline_deploy(self):
from pathlib import Path

Expand Down Expand Up @@ -1171,7 +1249,8 @@ def test_ci_inherits_from_main(self):
deploy = load_deploy_config(ci_path)
assert len(deploy.stages) == 3
# CI overrides
assert deploy.stages[0].engine_extras.get("load_format") == "dummy"
assert deploy.stages[0].load_format == "dummy"
assert "load_format" not in deploy.stages[0].engine_extras
assert deploy.stages[0].max_num_seqs == 5
# Inherited from base
assert deploy.stages[0].gpu_memory_utilization == 0.9
Expand Down Expand Up @@ -1376,7 +1455,7 @@ def test_typed_kwarg_overrides_yaml(self):
def test_none_value_skipped_yaml_wins(self):
stages = self._stages({"max_num_seqs": None})
assert stages[2].runtime_overrides.get("max_num_seqs") is None
assert stages[2].yaml_engine_args.get("max_num_seqs") == 1
assert "max_num_seqs" not in stages[2].yaml_engine_args

def test_empty_kwargs_yaml_only(self):
stages = self._stages({})
Expand Down
67 changes: 48 additions & 19 deletions vllm_omni/config/stage_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -399,19 +399,41 @@ class StageDeployConfig:
the top level of ``DeployConfig`` and propagated to every stage.
"""

# === Omni fields ===
# Stage identity and Omni runtime placement.
stage_id: int
Comment thread
xiaohajiayou marked this conversation as resolved.
max_num_seqs: int = 64
gpu_memory_utilization: float = 0.9
tensor_parallel_size: int = 1
enforce_eager: bool = False
max_num_batched_tokens: int = 32768
Copy link
Copy Markdown
Contributor

@alex-jw-brooks alex-jw-brooks May 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Related to earlier - if this is where the values for things like max_num_batched_tokens that were added to the configs came from, it would be better to resolve the defaults in the code rather than setting them directly in every yaml. Otherwise custom yaml from users won't be able to use these values.

Are the changes in the yaml needed to make the tests pass since the values aren't handled here? i.e., some things in the schema are required that weren't before

Copy link
Copy Markdown
Contributor Author

@xiaohajiayou xiaohajiayou May 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point. Previously, some of these fields were not explicitly set in the model YAMLs and instead relied implicitly on default values.

  • For example, trust_remote_code defaults to False in vLLM, while our previous behavior was effectively True, so I explicitly set it to preserve prior behavior.

However, now that we’ve aligned on treating vLLM defaults as the single source of truth, relying on implicit defaults is no longer a stable approach

  • especially since those defaults may change over time and diverge from previous expectations.

Ideally, user-configurable fields should be defined in YAML, and others handled in pipeline.py.

To keep this PR focused, I’ve explicitly materialized those defaults in the YAMLs to avoid unintended behavior changes. Proper value selection can be addressed in a follow-up (#3313).

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are the changes in the yaml needed to make the tests pass since the values aren't handled here? i.e., some things in the schema are required that weren't before

No, these changes are not introducing any newly required fields that weren’t needed before. The fields themselves already exist in vLLM; the difference is that their default values have changed compared to what was previously assumed.

max_model_len: int | None = None
async_scheduling: bool | None = None
devices: str = "0"
devices: str | None = None

# Inter-stage connector wiring and request defaults.
output_connectors: dict[str, str] | None = None
input_connectors: dict[str, str] | None = None
default_sampling_params: dict[str, Any] | None = None
subtalker_sampling_params: dict[str, Any] | None = None

# === vLLM EngineArgs fields ===
# Parallelism and scheduler/memory capacity.
tensor_parallel_size: int | None = None
gpu_memory_utilization: float | None = None
max_num_seqs: int | None = None
max_num_batched_tokens: int | None = None
max_model_len: int | None = None

# Execution, scheduling, and KV/cache behavior.
enforce_eager: bool | None = None
async_scheduling: bool | None = None
disable_hybrid_kv_cache_manager: bool | None = None
mm_processor_cache_gb: float | None = None

# Compilation, profiling, tokenizer/config parsing, and model loading.
compilation_config: dict[str, Any] | None = None
profiler_config: dict[str, Any] | None = None
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

skip_mm_profiling: bool | None = None
enable_flashinfer_autotune: bool | None = None
config_format: str | None = None
load_format: str | None = None
tokenizer_mode: str | None = None

# Pass-through vLLM EngineArgs fields that are not represented above.
engine_extras: dict[str, Any] = field(default_factory=dict)


Comment thread
xiaohajiayou marked this conversation as resolved.
Expand All @@ -436,14 +458,14 @@ class DeployConfig:
pipeline: str | None = None

# === Pipeline-wide engine settings (applied uniformly to every stage) ===
trust_remote_code: bool = True
trust_remote_code: bool | None = None
distributed_executor_backend: str | None = None
dtype: str | None = None
quantization: str | None = None
enable_prefix_caching: bool = False
enable_prefix_caching: bool | None = None
enable_chunked_prefill: bool | None = None
data_parallel_size: int = 1
pipeline_parallel_size: int = 1
data_parallel_size: int | None = None
pipeline_parallel_size: int | None = None


_STAGE_NON_ENGINE_KEYS = frozenset(
Expand All @@ -465,10 +487,10 @@ def _parse_stage_deploy(stage_data: dict[str, Any]) -> StageDeployConfig:
"""Parse a single stage entry from deploy YAML into StageDeployConfig."""
if "engine_args" in stage_data:
engine_args = dict(stage_data["engine_args"])
devices = stage_data.get("runtime", {}).get("devices", stage_data.get("devices", "0"))
devices = stage_data.get("runtime", {}).get("devices", stage_data.get("devices"))
else:
engine_args = {k: v for k, v in stage_data.items() if k not in _STAGE_NON_ENGINE_KEYS and k != "stage_id"}
devices = stage_data.get("devices", "0")
devices = stage_data.get("devices")

kwargs: dict[str, Any] = {"stage_id": stage_data["stage_id"], "devices": devices}
for name, f in _STAGE_DEPLOY_FIELDS.items():
Expand Down Expand Up @@ -687,6 +709,15 @@ def _select_processor_funcs(
)


Comment thread
xiaohajiayou marked this conversation as resolved.
def deploy_override_field_names() -> frozenset[str]:
    """Collect the deploy-schema names whose CLI defaults must not override YAML.

    Returns:
        The names in ``_STAGE_DEPLOY_FIELDS`` and ``_PIPELINE_WIDE_ENGINE_FIELDS``
        together with ``"async_chunk"`` and ``"devices"``.
    """
    return frozenset().union(
        _STAGE_DEPLOY_FIELDS,
        _PIPELINE_WIDE_ENGINE_FIELDS,
        ("async_chunk", "devices"),
    )


def _build_engine_args(
ps: StagePipelineConfig,
ds: StageDeployConfig | None,
Expand Down Expand Up @@ -802,7 +833,7 @@ def merge_pipeline_deploy(
engine_args["async_scheduling"] = sched_cls is OmniARAsyncScheduler
extras = _build_extras(ps, ds)
runtime: dict[str, Any] = {"process": True}
if ds is not None:
if ds is not None and ds.devices is not None:
runtime["devices"] = ds.devices

result.append(
Expand Down Expand Up @@ -865,13 +896,13 @@ def to_omegaconf(self) -> Any:

# CLI overrides take precedence over YAML defaults
for key, value in self.runtime_overrides.items():
if key not in ("devices", "max_batch_size"):
if value is not None and key not in ("devices", "max_batch_size"):
engine_args[key] = value

# Build runtime config from YAML defaults + CLI overrides
runtime: dict[str, Any] = dict(self.yaml_runtime)
runtime.setdefault("process", True)
if "devices" in self.runtime_overrides:
if self.runtime_overrides.get("devices") is not None:
runtime["devices"] = self.runtime_overrides["devices"]

# Legacy compat: migrate runtime.max_batch_size → engine_args.max_num_seqs
Expand All @@ -887,8 +918,6 @@ def to_omegaconf(self) -> Any:
effective_mbs = int(cli_mbs or legacy_mbs or 1)
engine_args.setdefault("max_num_seqs", effective_mbs)

engine_args.setdefault("max_num_seqs", 1)

# Build full config dict
config_dict: dict[str, Any] = {
"stage_id": self.stage_id,
Expand Down
6 changes: 6 additions & 0 deletions vllm_omni/deploy/bagel.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,11 @@ async_chunk: false

stages:
- stage_id: 0
max_num_batched_tokens: 32768
max_num_seqs: 3
gpu_memory_utilization: 0.45
trust_remote_code: true
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are max_num_batched_tokens / trust_remote_code / enable_prefix_caching being updated in so many configs?

trust_remote_code is only needed if the model bundles custom implementations inside of it, so it generally shouldn't be needed.

enable_prefix_caching: false
devices: "0"
default_sampling_params:
temperature: 0.4
Expand All @@ -23,8 +26,11 @@ stages:
repetition_penalty: 1.05

- stage_id: 1
max_num_batched_tokens: 32768
max_num_seqs: 1
enforce_eager: true
Comment thread
xiaohajiayou marked this conversation as resolved.
trust_remote_code: true
enable_prefix_caching: false
devices: "0"
input_connectors:
from_stage_0: shared_memory_connector
Expand Down
4 changes: 4 additions & 0 deletions vllm_omni/deploy/bagel_single_stage.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,11 @@ async_chunk: false

stages:
- stage_id: 0
max_num_batched_tokens: 32768
max_num_seqs: 1
enforce_eager: true
trust_remote_code: true
enable_prefix_caching: false
devices: "0"
default_sampling_params:
seed: 52
6 changes: 6 additions & 0 deletions vllm_omni/deploy/cosyvoice3.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,12 @@ connectors:

stages:
- stage_id: 0
max_num_batched_tokens: 32768
max_num_seqs: 1
gpu_memory_utilization: 0.4
enforce_eager: true
trust_remote_code: true
enable_prefix_caching: false
devices: "0"
output_connectors:
to_stage_1: connector_of_shared_memory
Expand All @@ -45,9 +48,12 @@ stages:
skip_mm_profiling: true

- stage_id: 1
max_num_batched_tokens: 32768
max_num_seqs: 1
gpu_memory_utilization: 0.2
enforce_eager: true
trust_remote_code: true
enable_prefix_caching: false
max_model_len: 32768
devices: "0"
input_connectors:
Expand Down
4 changes: 4 additions & 0 deletions vllm_omni/deploy/fish_qwen3_omni.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ stages:
max_num_seqs: 4
gpu_memory_utilization: 0.6
enforce_eager: false
trust_remote_code: true
enable_prefix_caching: false
async_scheduling: false
# vLLM >=0.19 requires max_num_batched_tokens >= max_model_len when
# enable_chunked_prefill=false. Bumped from legacy 3072 to match
Expand All @@ -46,6 +48,8 @@ stages:
max_num_seqs: 1
gpu_memory_utilization: 0.1
enforce_eager: true
trust_remote_code: true
enable_prefix_caching: false
async_scheduling: false
max_num_batched_tokens: 16384
max_model_len: 16384
Expand Down
Loading
Loading