Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 17 additions & 8 deletions tests/dfx/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,17 +70,17 @@ def _build_serve_args(serve_args: Any) -> list[str]:
def create_unique_server_params(
configs: list[dict[str, Any]],
stage_configs_dir: Path,
) -> list[tuple[str, str, str | None, str | None, tuple[str, ...]]]:
"""Return one row per unique server configuration (same 5-tuple shape as upstream).
) -> list[tuple[str, str, str | None, str | None, tuple[str, ...], bool]]:
"""Return one row per unique server configuration.

``(test_name, model, deploy_yaml_path, stage_overrides_json, extra_cli_args)``.
``(test_name, model, deploy_yaml_path, stage_overrides_json, extra_cli_args, use_omni)``.

JSON ``server_params.serve_args`` (dict/list) is expanded via ``_build_serve_args``
and **prepended** to ``extra_cli_args`` so perf / stability ``omni_server`` fixtures
stay identical to main while still honoring ``serve_args`` in benchmark JSON.
"""
unique_params: list[tuple[str, str, str | None, str | None, tuple[str, ...]]] = []
seen: set[tuple[str, str, str | None, str | None, tuple[str, ...]]] = set()
unique_params: list[tuple[str, str, str | None, str | None, tuple[str, ...], bool]] = []
seen: set[tuple[str, str, str | None, str | None, tuple[str, ...], bool]] = set()
for config in configs:
test_name = config["test_name"]
server_params = config["server_params"]
Expand All @@ -104,8 +104,16 @@ def create_unique_server_params(
serve_flat = _build_serve_args(server_params.get("serve_args"))
raw_extra = tuple(server_params.get("extra_cli_args") or ())
extra_cli_args = tuple(serve_flat) + raw_extra

server_param = (test_name, model, stage_config_path, stage_overrides_json, extra_cli_args)
use_omni = bool(server_params.get("use_omni", True))

server_param = (
test_name,
model,
stage_config_path,
stage_overrides_json,
extra_cli_args,
use_omni,
)
Comment on lines +114 to +116
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Keep reliability tuple unpacking compatible

Adding use_omni to server_param makes create_unique_server_params emit 6-tuples, but create_reliability_omni_server_params in the same file still unpacks each entry as 5 values. When reliability tests initialize QWEN_PARAMS/WAN_PARAMS, this now raises ValueError: too many values to unpack, so those suites fail before running any test logic.

Useful? React with 👍 / 👎.

if server_param not in seen:
seen.add(server_param)
unique_params.append(server_param)
Expand Down Expand Up @@ -145,8 +153,9 @@ def create_reliability_omni_server_params(
model=model,
stage_config_path=stage_config_path,
server_args=server_args_by_name.get(test_name),
use_omni=use_omni,
)
for test_name, model, stage_config_path, _stage_overrides_json, _extra_cli_args in unique_params
for test_name, model, stage_config_path, _stage_overrides_json, _extra_cli_args, use_omni in unique_params
]


Expand Down
8 changes: 5 additions & 3 deletions tests/dfx/perf/scripts/run_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,11 +65,13 @@ def omni_server(request):
Multi-stage initialization can take 10-20+ minutes.
"""
with _omni_server_lock:
test_name, model, stage_config_path, stage_overrides, extra_cli_args = request.param
test_name, model, stage_config_path, stage_overrides, extra_cli_args, use_omni = request.param

print(f"Starting OmniServer with test: {test_name}, model: {model}")

server_args = ["--stage-init-timeout", "600", "--init-timeout", "900"]
server_args: list[str] = []
if use_omni:
server_args += ["--stage-init-timeout", "600", "--init-timeout", "900"]
# --deploy-config and --stage-overrides compose at the CLI (see vllm_omni/entrypoints/utils.py):
# deploy-config sets the base; stage-overrides are applied on top. Both can be set.
if stage_config_path:
Expand All @@ -78,7 +80,7 @@ def omni_server(request):
server_args = ["--stage-overrides", stage_overrides] + server_args
if extra_cli_args:
server_args = list(extra_cli_args) + server_args
with OmniServer(model, server_args) as server:
with OmniServer(model, server_args, use_omni=use_omni) as server:
server.test_name = test_name
print("OmniServer started successfully")
yield server
Expand Down
135 changes: 135 additions & 0 deletions tests/dfx/perf/tests/test_qwen_omni.json
Original file line number Diff line number Diff line change
Expand Up @@ -311,5 +311,140 @@
}
}
]
},
{
"test_name": "test_qwen3_omni_vllm_text",
"server_params": {
"model": "Qwen/Qwen3-Omni-30B-A3B-Instruct",
"use_omni": false,
"extra_cli_args": ["--no-enable-prefix-caching"]
},
"benchmark_params": [
{
"dataset_name": "random",
"backend": "vllm",
"model": "Qwen/Qwen3-Omni-30B-A3B-Instruct",
"endpoint": "/v1/completions",
"num_prompts": [4, 16, 32, 64, 128],
"max_concurrency": [1, 4, 8, 16, 32],
"random_input_len": 2500,
"random_output_len": 900,
"temperature": 0.4,
"top_p": 0.9,
"top_k": 1,
"seed": 42,
"repetition_penalty": 1.05,
"ignore_eos": true,
"extra_body": {
"modalities": ["text"]
},
"percentile-metrics": "ttft,tpot,itl,e2el",
"baseline": {
"mean_ttft_ms": [1000, 3000, 5000, 7000, 9000]
}
},
{
"dataset_name": "random-mm",
"backend": "vllm",
"model": "Qwen/Qwen3-Omni-30B-A3B-Instruct",
"endpoint": "/v1/completions",
"num_prompts": [10],
"request_rate": [0.1],
"random_input_len": 100,
"random_output_len": 100,
"random_range_ratio": 0.0,
"temperature": 0.4,
"top_p": 0.9,
"top_k": 1,
"seed": 42,
"repetition_penalty": 1.05,
"ignore_eos": true,
"extra_body": {
"modalities": ["text"]
},
"random_mm_base_items_per_request": 1,
"random_mm_num_mm_items_range_ratio": 0.5,
"random_mm_limit_mm_per_prompt": {
"audio": 1
},
"random_mm_bucket_config": {
"(0, 60, 3)": 1.0
},
"percentile-metrics": "ttft,tpot,itl,e2el",
"baseline": {
"mean_ttft_ms": [2000]
}
},
{
"dataset_name": "random-mm",
"backend": "vllm",
"model": "Qwen/Qwen3-Omni-30B-A3B-Instruct",
"endpoint": "/v1/completions",
"num_prompts": [40],
"request_rate": [0.5],
"random_input_len": 100,
"random_output_len": 100,
"random_range_ratio": 0.0,
"temperature": 0.4,
"top_p": 0.9,
"top_k": 1,
"seed": 42,
"repetition_penalty": 1.05,
"ignore_eos": true,
"extra_body": {
"modalities": ["text"]
},
"random_mm_base_items_per_request": 2,
"random_mm_num_mm_items_range_ratio": 0.5,
"random_mm_limit_mm_per_prompt": {
"image": 1,
"video": 1
},
"random_mm_bucket_config": {
"(256, 256, 1)": 0.5,
"(720, 1280, 2)": 0.5
},
"percentile-metrics": "ttft,tpot,itl,e2el",
"baseline": {
"mean_ttft_ms": [6000]
}
},
{
"dataset_name": "random-mm",
"backend": "vllm",
"model": "Qwen/Qwen3-Omni-30B-A3B-Instruct",
"endpoint": "/v1/completions",
"num_prompts": [100],
"request_rate": [1.0],
"random_input_len": 100,
"random_output_len": 100,
"random_range_ratio": 0.0,
"temperature": 0.4,
"top_p": 0.9,
"top_k": 1,
"seed": 42,
"repetition_penalty": 1.05,
"ignore_eos": true,
"extra_body": {
"modalities": ["text"]
},
"random_mm_base_items_per_request": 3,
"random_mm_num_mm_items_range_ratio": 0.5,
"random_mm_limit_mm_per_prompt": {
"image": 1,
"video": 1,
"audio": 1
},
"random_mm_bucket_config": {
"(256, 256, 1)": 0.34,
"(720, 1280, 2)": 0.33,
"(0, 60, 3)": 0.33
},
"percentile-metrics": "ttft,tpot,itl,e2el",
"baseline": {
"mean_ttft_ms": [6000]
}
}
]
}
]
10 changes: 6 additions & 4 deletions tests/dfx/stability/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,20 +35,22 @@ def omni_server(request: pytest.FixtureRequest):
"""Start OmniServer for stability tests, with per-module timeout override."""
timeout_args = getattr(request.module, "STABILITY_SERVER_TIMEOUT_ARGS", DEFAULT_STABILITY_SERVER_TIMEOUT_ARGS)
with _omni_server_lock:
# Same 5-tuple and CLI composition as ``tests/dfx/perf/scripts/run_benchmark.py`` on main;
# Same tuple and CLI composition as ``tests/dfx/perf/scripts/run_benchmark.py``;
# ``serve_args`` from JSON are folded into ``extra_cli_args`` inside
# ``create_unique_server_params``.
test_name, model, deploy_path, stage_overrides, extra_cli_args = request.param
test_name, model, deploy_path, stage_overrides, extra_cli_args, use_omni = request.param

print(f"Starting OmniServer with test: {test_name}, model: {model}")
server_args = list(timeout_args)
server_args: list[str] = []
if use_omni:
server_args += list(timeout_args)
if deploy_path:
server_args = ["--deploy-config", deploy_path] + server_args
if stage_overrides:
server_args = ["--stage-overrides", stage_overrides] + server_args
if extra_cli_args:
server_args = list(extra_cli_args) + server_args
with OmniServer(model, server_args) as server:
with OmniServer(model, server_args, use_omni=use_omni) as server:
server.test_name = test_name
print("OmniServer started successfully")
yield server
Expand Down
43 changes: 43 additions & 0 deletions tests/worker/test_gpu_ar_model_runner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from types import SimpleNamespace

import pytest

from vllm_omni.worker.gpu_ar_model_runner import GPUARModelRunner

pytestmark = [pytest.mark.core_model, pytest.mark.cpu]


def _make_runner(engine_output_type: str | None, downstream_req_ids: set[str]) -> GPUARModelRunner:
runner = object.__new__(GPUARModelRunner)
runner.vllm_config = SimpleNamespace(
model_config=SimpleNamespace(engine_output_type=engine_output_type),
)
runner._request_needs_downstream_stage_payload = lambda rid: rid in downstream_req_ids
return runner


def test_resolve_pooler_payload_req_ids_audio_terminal_stage_keeps_payload():
runner = _make_runner(engine_output_type="audio", downstream_req_ids=set())

engine_output_type, payload_req_ids = GPUARModelRunner._resolve_pooler_payload_req_ids(runner, ["r1", "r2"])

assert engine_output_type == "audio"
assert payload_req_ids == ["r1", "r2"]


def test_resolve_pooler_payload_req_ids_text_terminal_stage_drops_payload():
runner = _make_runner(engine_output_type="text", downstream_req_ids=set())

engine_output_type, payload_req_ids = GPUARModelRunner._resolve_pooler_payload_req_ids(runner, ["r1", "r2"])

assert engine_output_type == "text"
assert payload_req_ids == []


def test_resolve_pooler_payload_req_ids_downstream_stage_uses_filtered_requests():
runner = _make_runner(engine_output_type="latent", downstream_req_ids={"r2"})

engine_output_type, payload_req_ids = GPUARModelRunner._resolve_pooler_payload_req_ids(runner, ["r1", "r2", "r3"])

assert engine_output_type == "latent"
assert payload_req_ids == ["r2"]
5 changes: 4 additions & 1 deletion vllm_omni/benchmarks/metrics/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,10 @@ def calculate_metrics(
total_input += outputs[i].prompt_len
tpot = 0
if output_len > 1:
latency_minus_ttft = outputs[i].text_latency - outputs[i].ttft
try:
latency_minus_ttft = outputs[i].text_latency - outputs[i].ttft
except Exception:
latency_minus_ttft = outputs[i].latency - outputs[i].ttft
tpot = latency_minus_ttft / (output_len - 1)
tpots.append(tpot)
# Note: if output_len <= 1, we regard tpot as 0 for goodput
Expand Down
2 changes: 1 addition & 1 deletion vllm_omni/core/sched/omni_ar_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
from vllm.distributed.kv_events import KVEventBatch
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats
from vllm.logger import init_logger
from vllm.v1.core.sched.async_scheduler import AsyncScheduler as VLLMScheduler
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.core.sched.scheduler import Scheduler as VLLMScheduler
from vllm.v1.core.sched.utils import remove_all
from vllm.v1.engine import EngineCoreOutput, EngineCoreOutputs
from vllm.v1.metrics.perf import PerfStats
Expand Down
Loading
Loading