diff --git a/docs/contributing/ci/CI_5levels.md b/docs/contributing/ci/CI_5levels.md index 6c451936233..0d03f919ddf 100644 --- a/docs/contributing/ci/CI_5levels.md +++ b/docs/contributing/ci/CI_5levels.md @@ -604,7 +604,7 @@ When you want to add L5-level stability test cases, add or extend the appropriat "test_name": "test_qwen3_omni_stability", "server_params": { "model": "Qwen/Qwen3-Omni-30B-A3B-Instruct", - "stage_config_name": "qwen3_omni.yaml" + "stage_config_name": "qwen3_omni_moe.yaml" }, "benchmark_params": [ { @@ -640,7 +640,7 @@ When you want to add L5-level stability test cases, add or extend the appropriat | Parameter | Required | Example | Description | | ----------------- | -------- | ---------------------------------- | ----------------------------------- | | model | Yes | "Qwen/Qwen3-Omni-30B-A3B-Instruct" | Model name or path | -| stage_config_name | Yes | "qwen3_omni.yaml" | Stage configuration file name | +| stage_config_name | Yes | "qwen3_omni_moe.yaml" | Stage configuration file name | ##### Dynamic Configuration (update/delete) diff --git a/docs/contributing/ci/test_examples/l4_performance_tests.inc.md b/docs/contributing/ci/test_examples/l4_performance_tests.inc.md index f1f3073dc52..1329f53872c 100644 --- a/docs/contributing/ci/test_examples/l4_performance_tests.inc.md +++ b/docs/contributing/ci/test_examples/l4_performance_tests.inc.md @@ -5,7 +5,7 @@ When you want to add L4-level ***performance test*** cases, you can refer to the "test_name": "test_qwen3_omni", "server_params": { "model": "Qwen/Qwen3-Omni-30B-A3B-Instruct", - "stage_config_name": "qwen3_omni.yaml" + "stage_config_name": "qwen3_omni_moe.yaml" }, "benchmark_params": [ { @@ -43,7 +43,7 @@ When you want to add L4-level ***performance test*** cases, you can refer to the | Parameter | Required | Example | Description | | ----------------- | -------- | ---------------------------------- | ----------------------------- | | model | Yes | "Qwen/Qwen3-Omni-30B-A3B-Instruct" | Model name or path | -| stage_config_name | Yes | "qwen3_omni.yaml" | Stage configuration file name | +| stage_config_name | Yes | "qwen3_omni_moe.yaml" | Stage configuration file name | *Dynamic Configuration (update/delete)* diff --git a/docs/contributing/ci/tests_style.md b/docs/contributing/ci/tests_style.md index 3a8cb0f127c..a62297a8391 100644 --- a/docs/contributing/ci/tests_style.md +++ b/docs/contributing/ci/tests_style.md @@ -235,7 +235,7 @@ models = ["Qwen/Qwen3-Omni-30B-A3B-Instruct"] #If you use the default configuration file, you can directly use the following address. def get_default_config(): - return get_deploy_config_path("ci/qwen3_omni_moe.yaml") + return get_deploy_config_path("qwen3_omni_moe.yaml") #If you need to modify the configuration file, you can use modify_stage_config. def get_chunk_config(): diff --git a/tests/dfx/conftest.py b/tests/dfx/conftest.py index 0c59bed994f..8dee350bd0c 100644 --- a/tests/dfx/conftest.py +++ b/tests/dfx/conftest.py @@ -11,7 +11,7 @@ from tests.dfx.reliability.helpers import list_remote_process_pids_by_pattern, post_chat_completions_raw from tests.helpers.runtime import OmniServerParams -from tests.helpers.stage_config import modify_stage_config +from tests.helpers.stage_config import get_deploy_config_path, modify_stage_config from vllm_omni.platforms import current_omni_platform @@ -69,7 +69,7 @@ def _build_serve_args(serve_args: Any) -> list[str]: def create_unique_server_params( configs: list[dict[str, Any]], - stage_configs_dir: Path, + stage_configs_dir: Path | None = None, ) -> list[tuple[str, str, str | None, str | None, tuple[str, ...], bool]]: """Return one row per unique server configuration. @@ -87,7 +87,11 @@ def create_unique_server_params( model = server_params["model"] stage_config_name = server_params.get("stage_config_name") if stage_config_name: - stage_config_path = str(stage_configs_dir / stage_config_name) + stage_config_path = ( + str(stage_configs_dir / stage_config_name) + if stage_configs_dir is not None + else get_deploy_config_path(stage_config_name) + ) delete = server_params.get("delete", None) update = server_params.get("update", None) stage_config_path = modify_stage(stage_config_path, update, delete) diff --git a/tests/dfx/perf/scripts/run_benchmark.py b/tests/dfx/perf/scripts/run_benchmark.py index c1f3264c18e..868ba97f7bc 100644 --- a/tests/dfx/perf/scripts/run_benchmark.py +++ b/tests/dfx/perf/scripts/run_benchmark.py @@ -51,8 +51,7 @@ def _get_config_file_from_argv() -> str | None: OMNI_RESULT_TEMPLATE_PATH = Path(__file__).parent / "result_omni_template.json" -DEPLOY_CONFIGS_DIR = Path(__file__).parent.parent / "deploy" -test_params = create_unique_server_params(BENCHMARK_CONFIGS, DEPLOY_CONFIGS_DIR) +test_params = create_unique_server_params(BENCHMARK_CONFIGS) server_to_benchmark_mapping = create_test_parameter_mapping(BENCHMARK_CONFIGS) _omni_server_lock = threading.Lock() diff --git a/tests/diffusion/test_profiler.py b/tests/diffusion/test_profiler.py new file mode 100644 index 00000000000..3fcddf79183 --- /dev/null +++ b/tests/diffusion/test_profiler.py @@ -0,0 +1,321 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +Unit tests for profiler trace collection across ranks. + +Tests that: +- OmniTorchProfilerWrapper writes trace files for each rank +- DiffusionWorker start/stop_profile lifecycle works per rank +- OmniStage handles profiler tasks via inline engine when queues are absent +""" + +import os +import tempfile + +import pytest +from pytest_mock import MockerFixture +from vllm.config import ProfilerConfig + +from vllm_omni.entrypoints.omni_stage import OmniStage +from vllm_omni.entrypoints.stage_utils import OmniStageTaskType +from vllm_omni.profiler.omni_torch_profiler import OmniTorchProfilerWrapper + +pytestmark = [pytest.mark.cpu] + + +# --------------------------------------------------------------------------- +# OmniTorchProfilerWrapper: per-rank trace file naming +# --------------------------------------------------------------------------- + + +class TestProfilerTraceNaming: + """Verify that each rank produces a uniquely named trace file.""" + + def test_trace_filename_includes_rank(self): + """_on_trace_ready should produce _rank.json.""" + with tempfile.TemporaryDirectory() as trace_dir: + config = ProfilerConfig( + profiler="torch", + torch_profiler_dir=trace_dir, + ) + for rank in (0, 1): + profiler = OmniTorchProfilerWrapper( + profiler_config=config, + worker_name=f"test_rank_{rank}", + local_rank=rank, + activities=["CPU"], + ) + profiler.set_trace_filename("test_trace") + + # Start → do nothing → stop triggers _on_trace_ready + profiler.start() + profiler.stop() + + # Both rank files should exist + files = sorted(os.listdir(trace_dir)) + rank0_files = [f for f in files if "_rank0.json" in f] + rank1_files = [f for f in files if "_rank1.json" in f] + assert rank0_files, f"No rank-0 trace found in {files}" + assert rank1_files, f"No rank-1 trace found in {files}" + + def test_trace_filename_with_full_path(self): + """When filename already contains a directory, use as-is.""" + with tempfile.TemporaryDirectory() as trace_dir: + config = ProfilerConfig( + profiler="torch", + torch_profiler_dir=trace_dir, + ) + profiler = OmniTorchProfilerWrapper( + profiler_config=config, + worker_name="test", + local_rank=3, + activities=["CPU"], + ) + full_path = os.path.join(trace_dir, "subdir", "my_trace") + profiler.set_trace_filename(full_path) + profiler.start() + profiler.stop() + + expected = f"{full_path}_rank3.json" + assert os.path.exists(expected), ( + f"Expected {expected}, found: {os.listdir(os.path.dirname(expected))}" + ) + + def test_get_results_returns_trace_path(self): + """get_results() should return the path of the exported trace.""" + with tempfile.TemporaryDirectory() as trace_dir: + config = ProfilerConfig( + profiler="torch", + torch_profiler_dir=trace_dir, + torch_profiler_use_gzip=False, + ) + profiler = OmniTorchProfilerWrapper( + profiler_config=config, + worker_name="test", + local_rank=0, + activities=["CPU"], + ) + profiler.set_trace_filename("results_test") + profiler.start() + profiler.stop() + + results = profiler.get_results() + assert results["trace"] is not None + assert results["trace"].endswith("_rank0.json") + assert os.path.exists(results["trace"]) + + +# --------------------------------------------------------------------------- +# DiffusionWorker: profiler lifecycle +# --------------------------------------------------------------------------- + + +class TestDiffusionWorkerProfiler: + """Test DiffusionWorker.start_profile / stop_profile.""" + + @pytest.fixture + def worker_with_profiler(self, mocker: MockerFixture): + """Create a DiffusionWorker with a real profiler (CPU-only).""" + from vllm_omni.diffusion.worker.diffusion_worker import DiffusionWorker + + config = mocker.Mock() + config.num_gpus = 1 + config.master_port = 12345 + config.enable_sleep_mode = False + config.cache_backend = None + config.cache_config = None + config.model = "test-model" + config.profiler_config = ProfilerConfig( + profiler="torch", + torch_profiler_dir=tempfile.mkdtemp(), + torch_profiler_use_gzip=False, + ) + + mocker.patch.object(DiffusionWorker, "init_device") + mocker.patch.object(DiffusionWorker, "load_model") + mocker.patch.object(DiffusionWorker, "init_lora_manager") + + worker = DiffusionWorker( + local_rank=0, rank=0, od_config=config, skip_load_model=True, + ) + worker.model_runner = mocker.Mock() + return worker + + def test_start_stop_creates_trace(self, worker_with_profiler): + """start_profile + stop_profile should produce a trace file.""" + worker = worker_with_profiler + trace_dir = worker.od_config.profiler_config.torch_profiler_dir + + template = os.path.join(trace_dir, "test_worker") + worker.start_profile(template) + worker.stop_profile() + + files = os.listdir(trace_dir) + assert any("_rank0.json" in f for f in files), f"No rank-0 trace in {files}" + + def test_stop_profile_returns_results(self, worker_with_profiler): + """stop_profile should return dict with trace path.""" + worker = worker_with_profiler + trace_dir = worker.od_config.profiler_config.torch_profiler_dir + + template = os.path.join(trace_dir, "test_results") + worker.start_profile(template) + result = worker.stop_profile() + + assert isinstance(result, dict) + assert "trace" in result + assert result["trace"] is not None + assert os.path.exists(result["trace"]) + + def test_multiple_ranks_produce_separate_traces(self, mocker: MockerFixture): + """Two workers with different local_rank should write separate files.""" + from vllm_omni.diffusion.worker.diffusion_worker import DiffusionWorker + + trace_dir = tempfile.mkdtemp() + + workers = [] + for rank in (0, 1): + config = mocker.Mock() + config.num_gpus = 2 + config.master_port = 12345 + config.enable_sleep_mode = False + config.cache_backend = None + config.cache_config = None + config.model = "test-model" + config.profiler_config = ProfilerConfig( + profiler="torch", + torch_profiler_dir=trace_dir, + torch_profiler_use_gzip=False, + ) + + mocker.patch.object(DiffusionWorker, "init_device") + mocker.patch.object(DiffusionWorker, "load_model") + mocker.patch.object(DiffusionWorker, "init_lora_manager") + + worker = DiffusionWorker( + local_rank=rank, rank=rank, od_config=config, skip_load_model=True, + ) + worker.model_runner = mocker.Mock() + workers.append(worker) + + # Start and stop profiling on both workers + template = os.path.join(trace_dir, "multi_rank") + for w in workers: + w.start_profile(template) + for w in workers: + w.stop_profile() + + files = os.listdir(trace_dir) + rank0_files = [f for f in files if "_rank0.json" in f] + rank1_files = [f for f in files if "_rank1.json" in f] + assert rank0_files, f"Missing rank-0 trace in {files}" + assert rank1_files, f"Missing rank-1 trace in {files}" + + +# --------------------------------------------------------------------------- +# OmniStage: inline engine profiler routing +# --------------------------------------------------------------------------- + + +class TestOmniStageInlineProfiler: + """Test that OmniStage routes profiler tasks to inline engine.""" + + @pytest.fixture + def stage_with_inline_engine(self, mocker: MockerFixture): + """Create an OmniStage with a mock inline engine (no queues).""" + stage_config = mocker.Mock() + stage_config.stage_id = 0 + stage_config.engine_args = mocker.Mock() + stage_config.engine_args.model_stage = "diffusion" + stage_config.engine_args.engine_output_type = None + stage_config.engine_args.stage_id = 0 + stage_config.runtime = mocker.Mock() + stage_config.runtime.requires_multimodal_data = False + stage_config.stage_type = "diffusion" + stage_config.final_output = True + stage_config.final_output_type = "video" + stage_config.is_comprehension = False + # No custom_process_input_func + del stage_config.custom_process_input_func + # No prompt_expand_func + del stage_config.prompt_expand_func + # Default sampling params + stage_config.default_sampling_params = {} + # No input sources + stage_config.input_sources = [] + stage_config.engine_input_source = [] + + # Patch SamplingParams import to avoid full init + mocker.patch( + "vllm_omni.entrypoints.omni_stage.OmniDiffusionSamplingParams", + return_value=mocker.Mock(), + ) + + stage = OmniStage(stage_config) + + # Attach a mock inline engine (simulates inline diffusion mode) + mock_engine = mocker.Mock() + mock_engine.start_profile = mocker.Mock() + mock_engine.stop_profile = mocker.Mock(return_value={"traces": ["t.json"], "tables": []}) + stage._inline_engine = mock_engine + + return stage, mock_engine + + def test_submit_profiler_start_routes_to_inline_engine(self, stage_with_inline_engine): + """submit(PROFILER_START) should call inline_engine.start_profile().""" + stage, mock_engine = stage_with_inline_engine + + stage.submit({"type": OmniStageTaskType.PROFILER_START}) + + mock_engine.start_profile.assert_called_once() + + def test_submit_profiler_stop_routes_to_inline_engine(self, stage_with_inline_engine): + """submit(PROFILER_STOP) should call inline_engine.stop_profile().""" + stage, mock_engine = stage_with_inline_engine + + stage.submit({"type": OmniStageTaskType.PROFILER_STOP}) + + mock_engine.stop_profile.assert_called_once() + + def test_stop_profile_returns_inline_engine_result(self, stage_with_inline_engine): + """stop_profile() should return the inline engine's result directly.""" + stage, mock_engine = stage_with_inline_engine + + result = stage.stop_profile() + + mock_engine.stop_profile.assert_called_once() + assert result == {"traces": ["t.json"], "tables": []} + + def test_submit_asserts_when_no_queue_and_no_inline_engine(self, mocker: MockerFixture): + """submit() should assert when neither queues nor inline engine available.""" + stage_config = mocker.Mock() + stage_config.stage_id = 0 + stage_config.engine_args = mocker.Mock() + stage_config.engine_args.model_stage = "diffusion" + stage_config.engine_args.engine_output_type = None + stage_config.engine_args.stage_id = 0 + stage_config.runtime = mocker.Mock() + stage_config.runtime.requires_multimodal_data = False + stage_config.stage_type = "diffusion" + stage_config.final_output = False + stage_config.final_output_type = None + stage_config.is_comprehension = False + del stage_config.custom_process_input_func + del stage_config.prompt_expand_func + stage_config.default_sampling_params = {} + stage_config.input_sources = [] + stage_config.engine_input_source = [] + + mocker.patch( + "vllm_omni.entrypoints.omni_stage.OmniDiffusionSamplingParams", + return_value=mocker.Mock(), + ) + + stage = OmniStage(stage_config) + # No inline engine, no queues + assert stage._inline_engine is None + assert stage._in_q is None + + with pytest.raises(AssertionError): + stage.submit({"type": OmniStageTaskType.PROFILER_START}) diff --git a/tests/e2e/accuracy/qwen3_omni/run_qwen_omni_acc_benchmark.py b/tests/e2e/accuracy/qwen3_omni/run_qwen_omni_acc_benchmark.py index 7fb71b28d77..6b3bb5e90e3 100644 --- a/tests/e2e/accuracy/qwen3_omni/run_qwen_omni_acc_benchmark.py +++ b/tests/e2e/accuracy/qwen3_omni/run_qwen_omni_acc_benchmark.py @@ -297,8 +297,10 @@ def build_arg_parser() -> argparse.ArgumentParser: p.add_argument("--daily-omni-input-mode", choices=("all", "visual", "audio"), default="all") p.add_argument( "--daily-extra-body-json", - default='{"modalities":["text"]}', - help="JSON merged into each chat request for Daily-Omni (default matches common L4 / text-output runs).", + default='{"modalities":["text"],"max_tokens":8192}', + help="JSON merged into each chat request for Daily-Omni. max_tokens:8192 gives the thinker " + "enough room to complete its reasoning trace before producing the final MCQ answer " + "(the production server default of 2048 can be insufficient for complex multimodal questions).", ) p.add_argument( "--daily-omni-save-eval-items", diff --git a/tests/e2e/accuracy/qwen3_omni/test_qwen3_omni.py b/tests/e2e/accuracy/qwen3_omni/test_qwen3_omni.py index 773f7c1108c..6ef21a7ccbf 100644 --- a/tests/e2e/accuracy/qwen3_omni/test_qwen3_omni.py +++ b/tests/e2e/accuracy/qwen3_omni/test_qwen3_omni.py @@ -51,8 +51,7 @@ ) from tests.helpers.mark import hardware_test from tests.helpers.runtime import OmniServerParams -from tests.helpers.stage_config import get_deploy_config_path, modify_stage_config -from vllm_omni.platforms import current_omni_platform +from tests.helpers.stage_config import QWEN3_OMNI_MOE_DEPLOY _E2E_ROOT = Path(__file__).resolve().parent.parent.parent @@ -60,26 +59,10 @@ pytestmark = [pytest.mark.full_model, pytest.mark.omni] -_CI_DEPLOY = get_deploy_config_path("ci/qwen3_omni_moe.yaml") +_DEPLOY = QWEN3_OMNI_MOE_DEPLOY -def get_chunk_config(config_path: str | None = None): - """Load the qwen3_omni CI deploy yaml with async_chunk modifications for streaming mode.""" - if config_path is None: - config_path = _CI_DEPLOY - # TODO: remove this workaround once legacy `stage_args` path is deleted. - # The pipeline (qwen3_omni/pipeline.py) already wires - # thinker2talker_async_chunk / talker2code2wav_async_chunk on stage 0/1, - # so only async_chunk needs flipping. Writing nested `engine_args:` into - # the new-schema overlay trips _parse_stage_deploy's legacy branch and - # drops flat fields (load_format, max_num_seqs, ...). - return modify_stage_config(config_path, updates={"async_chunk": True}) - - -if current_omni_platform.is_xpu(): - stage_configs = [_CI_DEPLOY] -else: # CUDA + ROCm MI325 share the same deploy config - stage_configs = [get_chunk_config()] +stage_configs = [_DEPLOY] test_params = [ OmniServerParams(model=model, stage_config_path=stage_config) for model in models for stage_config in stage_configs diff --git a/tests/e2e/offline_inference/test_qwen3_omni.py b/tests/e2e/offline_inference/test_qwen3_omni.py index c4d257b5114..9e47efeb15f 100644 --- a/tests/e2e/offline_inference/test_qwen3_omni.py +++ b/tests/e2e/offline_inference/test_qwen3_omni.py @@ -11,20 +11,20 @@ from tests.helpers.mark import hardware_test from tests.helpers.media import generate_synthetic_video -from tests.helpers.stage_config import get_deploy_config_path, modify_stage_config +from tests.helpers.stage_config import QWEN3_OMNI_MOE_DEPLOY, modify_stage_config from vllm_omni.platforms import current_omni_platform models = ["Qwen/Qwen3-Omni-30B-A3B-Instruct"] -# Single CI deploy YAML; rocm/xpu deltas are picked automatically via the +# Single deploy YAML; rocm/xpu deltas are picked automatically via the # platforms: section. Only CUDA needs an extra enforce_eager tweak. -_CI_DEPLOY = get_deploy_config_path("ci/qwen3_omni_moe.yaml") +_DEPLOY = QWEN3_OMNI_MOE_DEPLOY def get_cuda_graph_config(): return modify_stage_config( - _CI_DEPLOY, + _DEPLOY, updates={ "stages": { 0: {"enforce_eager": True}, @@ -35,7 +35,7 @@ def get_cuda_graph_config(): if current_omni_platform.is_rocm() or current_omni_platform.is_xpu(): - stage_configs = [_CI_DEPLOY] + stage_configs = [_DEPLOY] else: stage_configs = [get_cuda_graph_config()] diff --git a/tests/e2e/offline_inference/test_qwen3_omni_autoround_w4a16.py b/tests/e2e/offline_inference/test_qwen3_omni_autoround_w4a16.py index 3a3c874b64b..6b1690a26fe 100644 --- a/tests/e2e/offline_inference/test_qwen3_omni_autoround_w4a16.py +++ b/tests/e2e/offline_inference/test_qwen3_omni_autoround_w4a16.py @@ -20,7 +20,7 @@ generate_synthetic_image, generate_synthetic_video, ) -from tests.helpers.stage_config import get_deploy_config_path, modify_stage_config +from tests.helpers.stage_config import QWEN3_OMNI_MOE_DEPLOY, modify_stage_config QUANTIZED_MODEL = "Intel/Qwen3-Omni-30B-A3B-Instruct-int4-AutoRound" BASELINE_MODEL = "Qwen/Qwen3-Omni-30B-A3B-Instruct" @@ -29,7 +29,7 @@ QUANTIZED_MODEL = os.environ.get("QWEN3_OMNI_AUTOROUND_MODEL", QUANTIZED_MODEL) BASELINE_MODEL = os.environ.get("QWEN3_OMNI_BASELINE_MODEL", BASELINE_MODEL) -_CI_DEPLOY = get_deploy_config_path("ci/qwen3_omni_moe.yaml") +_DEPLOY = QWEN3_OMNI_MOE_DEPLOY @pytest.fixture(scope="module", autouse=True) @@ -48,7 +48,7 @@ def _qwen3_omni_env(): def _get_stage_config(): """Build a CI-friendly stage config with eager mode.""" return modify_stage_config( - _CI_DEPLOY, + _DEPLOY, updates={ "stages": { 0: {"enforce_eager": True}, diff --git a/tests/e2e/online_serving/test_qwen3_omni.py b/tests/e2e/online_serving/test_qwen3_omni.py index ed210dc9c2f..4c586c7e9a3 100644 --- a/tests/e2e/online_serving/test_qwen3_omni.py +++ b/tests/e2e/online_serving/test_qwen3_omni.py @@ -9,8 +9,7 @@ from tests.helpers.mark import hardware_test from tests.helpers.media import generate_synthetic_audio, generate_synthetic_image, generate_synthetic_video from tests.helpers.runtime import OmniServerParams, dummy_messages_from_mix_data -from tests.helpers.stage_config import get_deploy_config_path, modify_stage_config -from vllm_omni.platforms import current_omni_platform +from tests.helpers.stage_config import QWEN3_OMNI_MOE_DEPLOY, modify_stage_config os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0" @@ -21,20 +20,7 @@ # Set VLLM_TEST_PD_MODE=1 to test PD disaggregation (follow-up — deploy overlay not yet migrated). _USE_PD = os.environ.get("VLLM_TEST_PD_MODE", "0") == "1" -_CI_DEPLOY = get_deploy_config_path("ci/qwen3_omni_moe.yaml") - - -def get_chunk_config(config_path: str | None = None): - """Load the qwen3_omni CI deploy yaml with async_chunk modifications for streaming mode.""" - if config_path is None: - config_path = _CI_DEPLOY - # TODO: remove this workaround once legacy `stage_args` path is deleted. - # The pipeline (qwen3_omni/pipeline.py) already wires - # thinker2talker_async_chunk / talker2code2wav_async_chunk on stage 0/1, - # so only async_chunk needs flipping. Writing nested `engine_args:` into - # the new-schema overlay trips _parse_stage_deploy's legacy branch and - # drops flat fields (load_format, max_num_seqs, ...). - return modify_stage_config(config_path, updates={"async_chunk": True}) +_DEPLOY = QWEN3_OMNI_MOE_DEPLOY def get_prefix_caching_config(config_path: str): @@ -42,24 +28,21 @@ def get_prefix_caching_config(config_path: str): path = modify_stage_config( config_path, updates={ - "stage_args": { - 0: {"engine_args.enable_prefix_caching": True}, + "stages": { + 0: {"enable_prefix_caching": True}, }, }, ) return path -# Platform-specific overrides live inside the new deploy yaml's ``platforms:`` -# section, so a single ``_CI_DEPLOY`` path serves CUDA, ROCm, and XPU. +# Platform-specific overrides live inside the deploy yaml's ``platforms:`` +# section, so a single ``_DEPLOY`` path serves CUDA, ROCm, and XPU. # TODO: re-add VLLM_TEST_PD_MODE branch once the PD-disaggregation deploy # overlay has been migrated to the new schema (previously used the deleted # ``qwen3_omni_moe_pd_ci.yaml`` stage-configs file). -if current_omni_platform.is_xpu(): - stage_configs = [_CI_DEPLOY] -else: # CUDA + ROCm MI325 share the same deploy config - stage_configs = [get_chunk_config()] -prefix_caching_stage_configs = [get_prefix_caching_config(_CI_DEPLOY)] +stage_configs = [_DEPLOY] +prefix_caching_stage_configs = [get_prefix_caching_config(_DEPLOY)] # Create parameter combinations for model and stage config test_params = [ diff --git a/tests/e2e/online_serving/test_qwen3_omni_expansion.py b/tests/e2e/online_serving/test_qwen3_omni_expansion.py index 9601057d44c..a6bef9d135f 100644 --- a/tests/e2e/online_serving/test_qwen3_omni_expansion.py +++ b/tests/e2e/online_serving/test_qwen3_omni_expansion.py @@ -11,7 +11,7 @@ from tests.helpers.mark import hardware_test from tests.helpers.media import generate_synthetic_audio, generate_synthetic_image, generate_synthetic_video from tests.helpers.runtime import OmniServerParams, dummy_messages_from_mix_data -from tests.helpers.stage_config import get_deploy_config_path, modify_stage_config +from tests.helpers.stage_config import QWEN3_OMNI_MOE_DEPLOY, modify_stage_config pytestmark = [pytest.mark.full_model, pytest.mark.omni] @@ -66,11 +66,9 @@ def get_async_chunk_config(default_path): ) -# CI deploy YAML (single file; xpu deltas applied via ``platforms:`` section). -# The overlay explicitly sets ``async_chunk: False``, so ``default`` tests the -# sync path and ``async_chunk`` tests the streaming path with a longer thinker -# output — two distinct scenarios, kept as separate parametrizations. -default_path = get_deploy_config_path("ci/qwen3_omni_moe.yaml") +# Qwen3-Omni uses the default deploy YAML. The sync variant disables async +# chunk through CLI so both parametrizations share the same config source. +default_path = QWEN3_OMNI_MOE_DEPLOY test_params = [ pytest.param( diff --git a/tests/entrypoints/openai_api/test_qwen3_omni_realtime_websocket.py b/tests/entrypoints/openai_api/test_qwen3_omni_realtime_websocket.py index 90f8897c58f..dae5a254d4c 100644 --- a/tests/entrypoints/openai_api/test_qwen3_omni_realtime_websocket.py +++ b/tests/entrypoints/openai_api/test_qwen3_omni_realtime_websocket.py @@ -23,7 +23,7 @@ generate_synthetic_audio, ) from tests.helpers.runtime import OmniServerParams -from tests.helpers.stage_config import get_deploy_config_path +from tests.helpers.stage_config import QWEN3_OMNI_MOE_DEPLOY os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" @@ -32,9 +32,9 @@ # Synthetic input for realtime E2E (``generate_synthetic_audio``); distinct cache file per phrase. REALTIME_SYNTH_PHRASE_TEXT = "Translate into Chinese: Beijing is the Capital of China" -# The new-schema CI overlay bakes in async_chunk: False and covers CUDA/ROCm/XPU -# via its ``platforms:`` section, so one path serves all three. -default_stage_config = get_deploy_config_path("ci/qwen3_omni_moe.yaml") +# Use the default deploy config; the sync realtime path disables async chunk +# through CLI. +default_stage_config = QWEN3_OMNI_MOE_DEPLOY realtime_server_params = [ pytest.param( diff --git a/tests/examples/online_serving/test_qwen3_omni.py b/tests/examples/online_serving/test_qwen3_omni.py index 1bb577ed656..ade101bd065 100644 --- a/tests/examples/online_serving/test_qwen3_omni.py +++ b/tests/examples/online_serving/test_qwen3_omni.py @@ -21,15 +21,14 @@ from tests.helpers.mark import hardware_test from tests.helpers.media import convert_audio_file_to_text, cosine_similarity_text from tests.helpers.runtime import OmniServerParams -from tests.helpers.stage_config import get_deploy_config_path +from tests.helpers.stage_config import QWEN3_OMNI_MOE_DEPLOY pytestmark = [pytest.mark.full_model, pytest.mark.example, pytest.mark.omni] models = ["Qwen/Qwen3-Omni-30B-A3B-Instruct"] -stage_configs = [get_deploy_config_path("ci/qwen3_omni_moe.yaml")] - +stage_configs = [QWEN3_OMNI_MOE_DEPLOY] example_dir = str(Path(__file__).parent.parent.parent.parent / "examples" / "online_serving") # Create parameter combinations for model and stage config @@ -205,15 +204,14 @@ def test_stream_001(omni_server) -> None: text_content_tmp = extract_content_after_keyword("content:", result) text_content = strip_audio_saved_to_lines(text_content_tmp) - # Verify text output same as audio output + # In streaming mode, audio is emitted as multiple small chunks; only the last + # chunk path is captured by extract_last_audio_saved_path, so keyword + # verification must use text_content (the complete accumulated response). wav_path = extract_last_audio_saved_path(result) audio_content = convert_audio_file_to_text(output_path=f"./{wav_path}") print(f"text content is: {text_content}") - assert "cherry blossom" in audio_content, "The output does not contain any of the keywords." print(f"audio content is: {audio_content}") - similarity = cosine_similarity_text(audio_content.lower(), text_content.lower()) - print(f"similarity is: {similarity}") - assert similarity > 0.9, "The audio content is not same as the text" + assert "cherry blossom" in text_content, "The output does not contain any of the keywords." # TODO: Verify the E2E latency after confirmation baseline. diff --git a/tests/helpers/stage_config.py b/tests/helpers/stage_config.py index 66a58378d32..fcde3161d17 100644 --- a/tests/helpers/stage_config.py +++ b/tests/helpers/stage_config.py @@ -325,92 +325,6 @@ def delete_by_path(config_dict: dict, path: str) -> None: }, }, }, - "qwen3_omni_moe": { - "base_config": "qwen3_omni_moe.yaml", - "async_chunk": False, - "stages": [ - { - "stage_id": 0, - "max_num_seqs": 5, - "max_model_len": 32768, - "mm_processor_cache_gb": 0, - "load_format": "dummy", - "default_sampling_params": {"max_tokens": 150, "ignore_eos": False}, - }, - { - "stage_id": 1, - "gpu_memory_utilization": 0.5, - "max_num_seqs": 5, - "max_model_len": 32768, - "load_format": "dummy", - "default_sampling_params": {"max_tokens": 1000}, - }, - { - "stage_id": 2, - "max_num_seqs": 5, - "max_num_batched_tokens": 100000, - "load_format": "dummy", - "default_sampling_params": {"max_tokens": 2000}, - }, - ], - "platforms": { - "rocm": { - "stages": [ - {"stage_id": 0, "max_num_seqs": 1, "default_sampling_params": {"max_tokens": 100}}, - { - "stage_id": 1, - "max_num_seqs": 1, - "enforce_eager": True, - "default_sampling_params": {"max_tokens": 100}, - }, - { - "stage_id": 2, - "max_num_seqs": 1, - "max_num_batched_tokens": 1000000, - "default_sampling_params": {"max_tokens": 200}, - }, - ], - }, - "xpu": { - "stages": [ - { - "stage_id": 0, - "gpu_memory_utilization": 0.85, - "max_num_seqs": 1, - "tensor_parallel_size": 4, - "enforce_eager": True, - "max_num_batched_tokens": 4096, - "max_model_len": 4096, - "max_cudagraph_capture_size": 0, - "skip_mm_profiling": True, - "devices": "0,1,2,3", - "default_sampling_params": {"max_tokens": 100, "ignore_eos": False}, - }, - { - "stage_id": 1, - "gpu_memory_utilization": 0.6, - "max_num_seqs": 1, - "enforce_eager": True, - "max_num_batched_tokens": 4096, - "max_model_len": 4096, - "max_cudagraph_capture_size": 0, - "skip_mm_profiling": True, - "devices": "4", - }, - { - "stage_id": 2, - "gpu_memory_utilization": 0.3, - "max_num_seqs": 1, - "max_num_batched_tokens": 100000, - "max_cudagraph_capture_size": 0, - "skip_mm_profiling": True, - "devices": "5", - "default_sampling_params": {"max_tokens": 2000}, - }, - ], - }, - }, - }, "bagel": { "base_config": "bagel.yaml", "stages": [ @@ -588,7 +502,10 @@ def get_deploy_config_path(rel_path: str) -> str: return str(_DEPLOY_DIR / rel_path) +QWEN3_OMNI_MOE_DEPLOY = get_deploy_config_path("qwen3_omni_moe.yaml") + __all__ = [ "modify_stage_config", "get_deploy_config_path", + "QWEN3_OMNI_MOE_DEPLOY", ] diff --git a/tests/test_config_factory.py b/tests/test_config_factory.py index 22dcf26d097..dd61c7360f9 100644 --- a/tests/test_config_factory.py +++ b/tests/test_config_factory.py @@ -136,6 +136,15 @@ def test_to_omegaconf_max_batch_size_deprecation(self): assert len(deprecation_warnings) == 1 assert "max_batch_size" in str(deprecation_warnings[0].message) + def test_to_omegaconf_leaves_max_num_seqs_unset_by_default(self): + """Let vLLM choose its default max_num_seqs when stage config omits it.""" + config = StageConfig( + stage_id=0, + model_stage="thinker", + ) + omega_config = config.to_omegaconf() + assert "max_num_seqs" not in omega_config.engine_args + def test_to_omegaconf_max_num_seqs_in_engine_args(self): """Test that max_num_seqs in yaml_engine_args takes precedence.""" config = StageConfig( @@ -1156,41 +1165,30 @@ def test_thinker_only_yaml_loads_and_merges(self): class TestBaseConfigInheritance: """Test deploy YAML base_config inheritance.""" - def test_ci_inherits_from_main(self): - from tests.helpers.stage_config import get_deploy_config_path + def test_qwen3_omni_deploy_config(self): from vllm_omni.config.stage_config import load_deploy_config - ci_path = Path(get_deploy_config_path("ci/qwen3_omni_moe.yaml")) - if not ci_path.exists(): - pytest.skip("CI deploy config not found") + deploy_path = Path(__file__).parent.parent / "vllm_omni" / "deploy" / "qwen3_omni_moe.yaml" + if not deploy_path.exists(): + pytest.skip("Qwen3-Omni deploy config not found") - deploy = load_deploy_config(ci_path) + deploy = load_deploy_config(deploy_path) assert len(deploy.stages) == 3 - # CI overrides - assert deploy.stages[0].engine_extras.get("load_format") == "dummy" - assert deploy.stages[0].max_num_seqs == 5 - # Inherited from base - assert deploy.stages[0].gpu_memory_utilization == 0.9 + assert deploy.stages[0].engine_extras.get("gpu_memory_utilization") == 0.9 assert deploy.connectors is not None assert "connector_of_shared_memory" in deploy.connectors - # CI overlay explicitly sets async_chunk: False (see - # tests.helpers.stage_config._CI_OVERLAYS and PR #2383 discussion). Overlay - # bool overrides base even when the base yaml has async_chunk: true. - assert deploy.async_chunk is False + assert deploy.async_chunk is True - def test_ci_sampling_merge(self): - from tests.helpers.stage_config import get_deploy_config_path + def test_qwen3_omni_deploy_sampling_params(self): from vllm_omni.config.stage_config import load_deploy_config - ci_path = Path(get_deploy_config_path("ci/qwen3_omni_moe.yaml")) - if not ci_path.exists(): - pytest.skip("CI deploy config not found") + deploy_path = Path(__file__).parent.parent / "vllm_omni" / "deploy" / "qwen3_omni_moe.yaml" + if not deploy_path.exists(): + pytest.skip("Qwen3-Omni deploy config not found") - deploy = load_deploy_config(ci_path) + deploy = load_deploy_config(deploy_path) s0 = deploy.stages[0].default_sampling_params - # CI overrides max_tokens - assert s0["max_tokens"] == 150 - # Inherited from base + assert s0["max_tokens"] == 2048 assert s0["temperature"] == 0.4 assert s0["seed"] == 42 @@ -1207,7 +1205,7 @@ def test_pure_inheritance_overlay(self, tmp_path): deploy = load_deploy_config(overlay) assert len(deploy.stages) == 3 - assert deploy.stages[0].gpu_memory_utilization == 0.9 + assert deploy.stages[0].engine_extras.get("gpu_memory_utilization") == 0.9 def test_single_field_overlay(self, tmp_path): """An overlay overriding one stage field merges with the base.""" @@ -1221,9 +1219,10 @@ def test_single_field_overlay(self, tmp_path): overlay.write_text(f"base_config: {base}\nstages:\n - stage_id: 2\n max_num_batched_tokens: 1000000\n") deploy = load_deploy_config(overlay) - assert deploy.stages[2].max_num_batched_tokens == 1000000 - # Rest inherited - assert deploy.stages[0].gpu_memory_utilization == 0.9 + # max_num_batched_tokens goes into engine_extras (not a StageDeployConfig field) + assert deploy.stages[2].engine_extras.get("max_num_batched_tokens") == 1000000 + # max_num_seqs is in engine_extras (no longer a direct StageDeployConfig field) + assert deploy.stages[0].engine_extras.get("max_num_seqs") == 64 class TestPlatformOverrides: @@ -1241,11 +1240,11 @@ def test_npu_overrides(self): deploy = load_deploy_config(deploy_path) deploy = _apply_platform_overrides(deploy, platform="npu") - assert deploy.stages[0].gpu_memory_utilization == 0.6 - assert deploy.stages[0].tensor_parallel_size == 2 + assert deploy.stages[0].engine_extras.get("gpu_memory_utilization") == 0.6 + assert deploy.stages[0].engine_extras.get("tensor_parallel_size") == 2 assert deploy.stages[0].devices == "0,1" # Stage 2 unaffected fields stay at base - assert deploy.stages[2].enforce_eager is True + assert deploy.stages[2].engine_extras.get("enforce_eager") is True def test_xpu_overrides(self): from pathlib import Path @@ -1259,7 +1258,7 @@ def test_xpu_overrides(self): deploy = load_deploy_config(deploy_path) deploy = _apply_platform_overrides(deploy, platform="xpu") - assert deploy.stages[0].tensor_parallel_size == 4 + assert deploy.stages[0].engine_extras.get("tensor_parallel_size") == 4 assert deploy.stages[0].devices == "0,1,2,3" assert deploy.stages[0].engine_extras.get("max_cudagraph_capture_size") == 0 @@ -1273,9 +1272,9 @@ def test_unknown_platform_noop(self): pytest.skip("Deploy config not found") deploy = load_deploy_config(deploy_path) - original_mem = deploy.stages[0].gpu_memory_utilization + original_mem = deploy.stages[0].engine_extras.get("gpu_memory_utilization") deploy = _apply_platform_overrides(deploy, platform="unknown_hw") - assert deploy.stages[0].gpu_memory_utilization == original_mem + assert deploy.stages[0].engine_extras.get("gpu_memory_utilization") == original_mem def test_platforms_deep_merge_inheritance(self, tmp_path): """Overlay's platforms: block layers onto base's, per-stage.""" @@ -1305,10 +1304,10 @@ def test_platforms_deep_merge_inheritance(self, tmp_path): deploy = load_deploy_config(overlay) deploy = _apply_platform_overrides(deploy, platform="rocm") # Both base's enforce_eager and overlay's max_num_seqs should apply. - assert deploy.stages[0].enforce_eager is True - assert deploy.stages[0].max_num_seqs == 1 + assert deploy.stages[0].engine_extras.get("enforce_eager") is True + assert deploy.stages[0].engine_extras.get("max_num_seqs") == 1 # Inherited stage default not touched by overlay platforms section. - assert deploy.stages[0].gpu_memory_utilization == 0.9 + assert deploy.stages[0].engine_extras.get("gpu_memory_utilization") == 0.9 class TestCLIOverrideFlow: diff --git a/vllm_omni/benchmarks/data_modules/daily_omni_eval.py b/vllm_omni/benchmarks/data_modules/daily_omni_eval.py index f191cf2febc..a84792dd58b 100644 --- a/vllm_omni/benchmarks/data_modules/daily_omni_eval.py +++ b/vllm_omni/benchmarks/data_modules/daily_omni_eval.py @@ -47,6 +47,11 @@ def extract_choice_letter_official(text: str | None) -> str | None: return None match = re.search(r"assistant\s*([\s\S]*)$", raw, flags=re.IGNORECASE) candidate = match.group(1).strip() if match else raw + # Strip ... reasoning traces (Qwen3-Omni thinking model output) so we look + # only at the final answer, not option letters mentioned inside the thinking trace. + post_think = re.sub(r"[\s\S]*?", "", candidate, flags=re.IGNORECASE).strip() + if post_think: + candidate = post_think direct = re.match(r"(?i)^\s*([A-D])(?:[\s\.\)::]|$)", candidate) if direct: return direct.group(1).upper() diff --git a/vllm_omni/config/stage_config.py b/vllm_omni/config/stage_config.py index 44cc83baea8..22510980b48 100644 --- a/vllm_omni/config/stage_config.py +++ b/vllm_omni/config/stage_config.py @@ -402,13 +402,6 @@ class StageDeployConfig: """ stage_id: int - max_num_seqs: int = 64 - gpu_memory_utilization: float = 0.9 - tensor_parallel_size: int = 1 - enforce_eager: bool = False - max_num_batched_tokens: int = 32768 - max_model_len: int | None = None - async_scheduling: bool | None = None devices: str = "0" output_connectors: dict[str, str] | None = None input_connectors: dict[str, str] | None = None @@ -455,6 +448,7 @@ class DeployConfig: "output_connectors", "input_connectors", "default_sampling_params", + "subtalker_sampling_params", "engine_extras", } ) @@ -883,8 +877,6 @@ def to_omegaconf(self) -> Any: effective_mbs = int(cli_mbs or legacy_mbs or 1) engine_args.setdefault("max_num_seqs", effective_mbs) - engine_args.setdefault("max_num_seqs", 1) - # Build full config dict config_dict: dict[str, Any] = { "stage_id": self.stage_id, diff --git a/vllm_omni/deploy/qwen2_5_omni.yaml b/vllm_omni/deploy/qwen2_5_omni.yaml index 41aef0df6f6..fe84005baf3 100644 --- a/vllm_omni/deploy/qwen2_5_omni.yaml +++ b/vllm_omni/deploy/qwen2_5_omni.yaml @@ -3,10 +3,9 @@ # flashinfer; the autotune dummy run OOMs the shared cuda:0 device otherwise. # # Fields omitted from a stage fall back to StageDeployConfig dataclass -# defaults (see vllm_omni/config/stage_config.py). For instance, every -# stage here uses vLLM's default max_num_batched_tokens=32768 because -# chat-sized prefill comfortably fits; only models with codec prefill -# (Qwen3-Omni, Qwen3-TTS) need to bump it above 32k. +# defaults (see vllm_omni/config/stage_config.py). Omitting +# max_num_batched_tokens inherits vLLM's hardware-specific default +# (e.g., 16384 for H100, 8192 for others). # # enforce_eager policy across the three deploy YAMLs: # * code2wav / generation stages: always true (cudagraph incompatible with diff --git a/vllm_omni/deploy/qwen3_omni_moe.yaml b/vllm_omni/deploy/qwen3_omni_moe.yaml index 39baed6bd7b..9ea180e137e 100644 --- a/vllm_omni/deploy/qwen3_omni_moe.yaml +++ b/vllm_omni/deploy/qwen3_omni_moe.yaml @@ -23,6 +23,8 @@ connectors: stages: - stage_id: 0 gpu_memory_utilization: 0.9 + max_num_seqs: 64 + max_num_batched_tokens: 32768 devices: "0" default_sampling_params: temperature: 0.4 @@ -34,7 +36,9 @@ stages: - stage_id: 1 gpu_memory_utilization: 0.6 + max_num_seqs: 64 devices: "1" + max_num_batched_tokens: 32768 input_connectors: from_stage_0: connector_of_shared_memory default_sampling_params: @@ -46,6 +50,7 @@ stages: - stage_id: 2 gpu_memory_utilization: 0.1 + max_num_seqs: 64 enforce_eager: true async_scheduling: false max_num_batched_tokens: 51200