Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 46 additions & 1 deletion .buildkite/test-nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,51 @@ steps:
path: /mnt/hf-cache
type: DirectoryOrCreate

# Nightly Qwen3-Omni accuracy step: runs the full_model accuracy tests on two
# H100 GPUs and uploads the JSON result files as build artifacts.
- label: ":full_moon: Omni · Accuracy Test"
  timeout_in_minutes: 180
  commands:
    - export SEED_TTS_WER_EVAL=1
    # NOTE(review): presumably places the Seed-TTS evaluation on the second of the
    # two GPUs requested below — confirm against the evaluation code.
    - export SEED_TTS_EVAL_DEVICE=cuda:1
    # `set +e` keeps the script running when pytest fails so the result JSON is
    # still uploaded; pytest's exit status is saved in EXIT and returned at the end.
    # NOTE(review): `$$` looks like Buildkite's escape so the shell, not the
    # pipeline upload, expands `$?` / `$EXIT` — confirm.
    - |
      set +e
      pytest -s -v tests/e2e/accuracy/qwen3_omni/test_qwen3_omni.py -m "full_model" --run-level full_model
      EXIT=$$?
      buildkite-agent artifact upload "tests/e2e/accuracy/qwen3_omni/results/qwen_omni_acc/*.json"
      exit $$EXIT
  agents:
    queue: "mithril-h100-pool"
  plugins:
    - kubernetes:
        podSpec:
          containers:
            # Test image tagged with the commit under test.
            - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
              resources:
                limits:
                  nvidia.com/gpu: 2
              volumeMounts:
                - name: devshm
                  mountPath: /dev/shm
                - name: hf-cache
                  mountPath: /root/.cache/huggingface
              env:
                # HF_HOME points at the mounted hf-cache volume.
                - name: HF_HOME
                  value: /root/.cache/huggingface
                - name: HF_TOKEN
                  valueFrom:
                    secretKeyRef:
                      name: hf-token-secret
                      key: token
          nodeSelector:
            node.kubernetes.io/instance-type: gpu-h100-sxm
          volumes:
            # Memory-backed emptyDir mounted at /dev/shm.
            - name: devshm
              emptyDir:
                medium: Memory
            # Host directory backing the Hugging Face cache mount.
            - name: hf-cache
              hostPath:
                path: /mnt/hf-cache
                type: DirectoryOrCreate

- label: ":full_moon: Omni · Perf Test"
key: nightly-omni-performance
timeout_in_minutes: 180
Expand Down Expand Up @@ -514,7 +559,7 @@ steps:
- label: ":full_moon: Diffusion X2V · Function Test"
timeout_in_minutes: 90
commands:
- pytest -s -v tests/e2e/online_serving/test_wan22_expansion.py tests/e2e/online_serving/test_wan_2_1_vace_expansion.py tests/e2e/online_serving/test_hunyuan_video_15_expansion.py -m "full_model" --run-level "full_model"
- pytest -s -v tests/e2e/online_serving/test_wan22_expansion.py tests/e2e/online_serving/test_wan_2_1_vace_expansion.py tests/e2e/online_serving/test_hunyuan_video_15_expansion.py --run-level "full_model"
agents:
queue: "mithril-h100-pool"
plugins:
Expand Down
14 changes: 4 additions & 10 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -56,23 +56,17 @@ dev = [
"opencc>=1.2.0",
"mistune>=3.2.0", # for example tests
"torchmetrics>=1.4.0", # for accuracy similarity metrics
]

demo = [
"gradio>=6.7.0",
]

# Seed-TTS serve benchmark WER (BytedanceSpeech/seed-tts-eval run_wer.py protocol).
seed-tts-eval = [
"jiwer>=3.0.0",
"zhon>=2.0.0",
"zhconv>=1.4.2",
"scipy>=1.10.0",
"soundfile>=0.12.0",
"transformers>=4.36.0",
"funasr>=1.0.0",
]

demo = [
"gradio>=6.7.0",
]

docs = [
"mkdocs>=1.5.0",
"mkdocs-api-autonav",
Expand Down
61 changes: 61 additions & 0 deletions tests/benchmarks/test_accuracy_bench_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
# ruff: noqa: E402, I001
import argparse
import math
import os
import sys
import types
from pathlib import Path

import pytest
Expand Down Expand Up @@ -39,6 +42,64 @@
summarize_generated_records as summarize_gebench_generated_records,
summarize_gebench_results,
)
from tests.e2e.accuracy.qwen3_omni.qwen3_omni_acc_bench_core import seed_tts_bench_argv
from tests.e2e.accuracy.qwen3_omni.run_qwen_omni_acc_benchmark import sync_dataset_env_from_ns
from vllm_omni.benchmarks.data_modules.seed_tts_dataset import resolve_seed_tts_root


def test_seed_tts_bench_argv_preserves_hf_repo_id_from_env(monkeypatch):
    """A Hub repo id in ``VLLM_SEED_TTS_DATASET_PATH`` reaches ``--dataset-path`` unchanged."""
    monkeypatch.delenv("VLLM_SEED_TTS_REPO", raising=False)
    monkeypatch.setenv("VLLM_SEED_TTS_DATASET_PATH", "zhaochenyang20/seed-tts-eval")

    cli_args = seed_tts_bench_argv(locale="en")

    # The value is the element that directly follows the flag.
    flag_position = cli_args.index("--dataset-path")
    value_after_flag = cli_args[flag_position + 1]
    assert value_after_flag == "zhaochenyang20/seed-tts-eval"


def test_sync_dataset_env_preserves_seed_tts_hf_repo_id(monkeypatch):
    """``sync_dataset_env_from_ns`` writes a Seed-TTS Hub repo id to the environment verbatim."""
    ns = argparse.Namespace(
        daily_omni_repo=None,
        daily_omni_qa_json=None,
        daily_omni_video_dir=None,
        seed_tts_dataset_path="zhaochenyang20/seed-tts-eval",
        seed_tts_root=None,
    )

    # ``delenv(..., raising=False)`` records nothing to undo when the variable is
    # not set, so whatever the function under test writes into ``os.environ``
    # would survive this test. Setting the key first registers it with
    # monkeypatch; the following delete restores the "unset" starting state.
    monkeypatch.setenv("VLLM_SEED_TTS_DATASET_PATH", "placeholder-for-teardown")
    monkeypatch.delenv("VLLM_SEED_TTS_DATASET_PATH")

    sync_dataset_env_from_ns(ns)

    assert os.environ["VLLM_SEED_TTS_DATASET_PATH"] == "zhaochenyang20/seed-tts-eval"


def test_resolve_seed_tts_root_downloads_only_requested_locale(monkeypatch, tmp_path: Path):
    """Resolving a Hub repo id for one locale limits the snapshot download to that locale."""
    snapshot_dir = tmp_path / "seed_tts_cache"
    locale_dir = snapshot_dir / "zh"
    (locale_dir / "prompt-wavs").mkdir(parents=True)
    (locale_dir / "meta.lst").write_text("", encoding="utf-8")
    seen_kwargs: dict[str, object] = {}

    def fake_snapshot_download(*, repo_id, repo_type, allow_patterns):
        # Record the call and hand back the prepared directory instead of downloading.
        seen_kwargs.update(
            repo_id=repo_id,
            repo_type=repo_type,
            allow_patterns=allow_patterns,
        )
        return str(snapshot_dir)

    # Replace the whole ``huggingface_hub`` module entry with a stub exposing only
    # ``snapshot_download``.
    stub_hub = types.SimpleNamespace(snapshot_download=fake_snapshot_download)
    monkeypatch.setitem(sys.modules, "huggingface_hub", stub_hub)

    resolved = resolve_seed_tts_root("zhaochenyang20/seed-tts-eval", explicit_root=None, locale="zh")

    assert resolved == snapshot_dir.resolve()
    assert seen_kwargs["repo_id"] == "zhaochenyang20/seed-tts-eval"
    assert seen_kwargs["repo_type"] == "dataset"
    assert seen_kwargs["allow_patterns"] == ["zh/**"]


def test_summarize_gebench_generated_records_groups_by_type():
Expand Down
2 changes: 2 additions & 0 deletions tests/e2e/accuracy/qwen3_omni/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# SPDX-License-Identifier: Apache-2.0
"""Qwen3-Omni accuracy benchmarks (Daily-Omni / Seed-TTS ``vllm bench serve --omni``)."""
201 changes: 201 additions & 0 deletions tests/e2e/accuracy/qwen3_omni/qwen3_omni_acc_bench_core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
# SPDX-License-Identifier: Apache-2.0
"""Shared helpers for Qwen3-Omni Daily-Omni / Seed-TTS ``vllm bench serve --omni`` accuracy runs.

Local dataset paths are **optional**. When ``VLLM_DAILY_OMNI_QA_JSON`` + ``VLLM_DAILY_OMNI_VIDEO_DIR``
point to existing files, those are used with inline video. Otherwise the benchmark falls back to
the HuggingFace dataset id (``liarliar/Daily-Omni``); QA loads via ``datasets``, and the first
bench request that needs media downloads ``Videos.tar`` from the Hub when no video dir is set.

Similarly for Seed-TTS: a local directory wins; otherwise ``--dataset-path`` uses the Hub id
and ``huggingface_hub.snapshot_download`` inside ``resolve_seed_tts_root`` pulls files on demand.

Use :func:`build_acc_benchmark_cli_argv` to assemble ``argv`` for a live Omni server (host/port/model
plus ``--num-prompts`` / ``--max-concurrency`` defaults) before ``parse_args`` / ``run_acc_benchmark`` in the accuracy driver.
"""

from __future__ import annotations

import json
import os
import shutil
import subprocess
from pathlib import Path
from typing import Any, Protocol

DEFAULT_DAILY_OMNI_HF_REPO = "liarliar/Daily-Omni"
DEFAULT_SEED_TTS_HF_REPO = "zhaochenyang20/seed-tts-eval"


class OmniBenchServerEndpoint(Protocol):
    """Structural type for a server a benchmark can be pointed at.

    Any object exposing ``host``, ``port`` and ``model`` attributes satisfies it
    (e.g. :class:`tests.conftest.OmniServer`).
    """

    # Forwarded as ``--host`` by build_acc_benchmark_cli_argv.
    host: str
    # Forwarded as ``--port`` (stringified).
    port: int
    # Forwarded as ``--model``.
    model: str


def build_acc_benchmark_cli_argv(
    server: OmniBenchServerEndpoint,
    *,
    skip_seed: bool,
    skip_daily: bool,
    num_prompts: int | None = None,
    max_concurrency: int | None = None,
) -> list[str]:
    """Build the argv prefix for the accuracy driver against a running Omni server.

    The result is intended for :func:`run_qwen_omni_acc_benchmark.parse_acc_benchmark_args`
    followed by :func:`run_acc_benchmark`. When Daily-Omni runs, ``--daily-omni-repo`` is
    added so Hub QA matches :func:`daily_omni_bench_argv` once ``run_acc_benchmark`` mirrors
    ``--daily-omni-repo`` into env.

    Args:
        server: Object exposing ``host`` / ``port`` / ``model`` of the live server.
        skip_seed: Append ``--skip-seed-tts`` when true.
        skip_daily: Append ``--skip-daily-omni`` when true. When false,
            ``--daily-omni-repo`` is added instead, taken from ``VLLM_DAILY_OMNI_REPO``
            or the default Hub repo.
        num_prompts: Explicit ``--num-prompts`` value. ``None`` reads
            ``ACC_BENCH_NUM_PROMPTS`` and falls back to 2000.
        max_concurrency: Explicit ``--max-concurrency`` value. ``None`` reads
            ``ACC_BENCH_MAX_CONCURRENCY`` and falls back to 10.

    Returns:
        The argument list, in the order host, port, model, num-prompts,
        max-concurrency, optional daily-omni repo, optional skip flags.

    Raises:
        ValueError: If an override variable is set to a non-blank, non-integer value.
    """

    def _resolve_count(explicit: int | None, env_name: str, default: int) -> int:
        """Pick the explicit value, else the env override, else the default."""
        if explicit is not None:
            return int(explicit)
        # An exported-but-blank variable counts as unset, matching the
        # ``.strip() or DEFAULT`` handling of the repo variables in this module.
        raw = os.environ.get(env_name, "").strip()
        return int(raw) if raw else default

    n_prompts = _resolve_count(num_prompts, "ACC_BENCH_NUM_PROMPTS", 2000)
    n_conc = _resolve_count(max_concurrency, "ACC_BENCH_MAX_CONCURRENCY", 10)
    argv = [
        "--host",
        server.host,
        "--port",
        str(server.port),
        "--model",
        server.model,
        "--num-prompts",
        str(n_prompts),
        "--max-concurrency",
        str(n_conc),
    ]
    if not skip_daily:
        repo = os.environ.get("VLLM_DAILY_OMNI_REPO", DEFAULT_DAILY_OMNI_HF_REPO).strip() or DEFAULT_DAILY_OMNI_HF_REPO
        argv.extend(["--daily-omni-repo", repo])
    if skip_seed:
        argv.append("--skip-seed-tts")
    if skip_daily:
        argv.append("--skip-daily-omni")
    return argv


def daily_omni_bench_argv() -> list[str]:
    """Return the Daily-Omni dataset arguments to append after ``vllm bench serve --omni``.

    If ``VLLM_DAILY_OMNI_QA_JSON`` names an existing file and
    ``VLLM_DAILY_OMNI_VIDEO_DIR`` names an existing directory, both are passed as
    resolved paths together with ``--daily-omni-inline-local-video``. In every other
    case the Hub repo id (``VLLM_DAILY_OMNI_REPO`` or the default) is passed as
    ``--dataset-path``.
    """
    qa_setting = os.environ.get("VLLM_DAILY_OMNI_QA_JSON", "").strip()
    video_setting = os.environ.get("VLLM_DAILY_OMNI_VIDEO_DIR", "").strip()
    argv = ["--dataset-name", "daily-omni"]

    if qa_setting and video_setting:
        qa_file = Path(qa_setting).expanduser()
        video_dir = Path(video_setting).expanduser()
        # Both must exist; otherwise fall through to the Hub repo.
        if qa_file.is_file() and video_dir.is_dir():
            argv += ["--daily-omni-qa-json", str(qa_file.resolve())]
            argv += ["--daily-omni-video-dir", str(video_dir.resolve())]
            argv.append("--daily-omni-inline-local-video")
            return argv

    hub_repo = os.environ.get("VLLM_DAILY_OMNI_REPO", DEFAULT_DAILY_OMNI_HF_REPO).strip()
    argv += ["--dataset-path", hub_repo or DEFAULT_DAILY_OMNI_HF_REPO]
    return argv


def seed_tts_bench_argv(*, locale: str = "en") -> list[str]:
    """Return the Seed-TTS dataset arguments to append after ``vllm bench serve --omni``.

    ``VLLM_SEED_TTS_DATASET_PATH`` wins when set: an existing local directory is
    passed as an absolute path, anything else (such as a Hub repo id) is passed
    verbatim. Without it, ``VLLM_SEED_TTS_REPO`` or the default Hub repo is used.
    ``SEED_TTS_ROOT``, when set, adds ``--seed-tts-root`` with the resolved path.
    """
    configured = os.environ.get("VLLM_SEED_TTS_DATASET_PATH", "").strip()
    if not configured:
        hub_repo = os.environ.get("VLLM_SEED_TTS_REPO", DEFAULT_SEED_TTS_HF_REPO).strip()
        dataset_path = hub_repo or DEFAULT_SEED_TTS_HF_REPO
    else:
        candidate = Path(configured).expanduser()
        if candidate.is_dir():
            # A real local directory: hand over its canonical absolute path.
            dataset_path = str(candidate.resolve())
        else:
            # Not a local directory, e.g. a Hub repo id: keep the text untouched.
            dataset_path = configured

    argv = ["--dataset-name", "seed-tts", "--dataset-path", dataset_path]
    root_override = os.environ.get("SEED_TTS_ROOT", "").strip()
    if root_override:
        argv += ["--seed-tts-root", str(Path(root_override).expanduser().resolve())]
    argv += ["--seed-tts-locale", locale]
    return argv


def find_vllm_cli() -> str:
    """Return the path of the ``vllm`` executable found on ``PATH``.

    Raises:
        FileNotFoundError: If no ``vllm`` executable can be located.
    """
    located = shutil.which("vllm")
    if located:
        return located
    raise FileNotFoundError("Could not find `vllm` on PATH (install vLLM-Omni with CLI entrypoints).")


def run_vllm_bench_subprocess(vllm: str, argv: list[str], *, extra_env: dict[str, str] | None = None) -> None:
    """Run ``vllm`` with ``argv`` in a child process and wait for it to finish.

    Args:
        vllm: Executable to launch.
        argv: Arguments placed after the executable.
        extra_env: Variables layered on top of a copy of the current environment.

    Raises:
        subprocess.CalledProcessError: If the child exits with a non-zero status.
    """
    # Later entries win, so ``extra_env`` overrides inherited variables.
    child_env = {**os.environ, **(extra_env or {})}
    command = [vllm, *argv]
    subprocess.run(command, env=child_env, check=True)


def load_benchmark_result(path: Path) -> dict[str, Any]:
    """Read the UTF-8 JSON file at ``path`` and return the decoded content."""
    raw_text = path.read_text(encoding="utf-8")
    return json.loads(raw_text)


def build_serve_common_argv(
    *,
    host: str,
    port: int,
    model: str,
    num_prompts: int,
    max_concurrency: int,
    num_warmups: int,
    percentile_metrics: str,
    result_dir: Path,
    result_filename: str,
    ready_check_timeout_sec: int | None = None,
) -> list[str]:
    """Assemble the dataset-independent part of a ``vllm bench serve --omni`` command line.

    The list starts with the ``bench serve --omni`` sub-command and carries the
    server address, endpoint/backend selection, load settings and result-file
    options. ``--ready-check-timeout-sec`` is appended only when
    ``ready_check_timeout_sec`` is not ``None``.
    """
    argv: list[str] = ["bench", "serve", "--omni"]
    # Target server.
    argv += ["--host", host, "--port", str(port), "--model", model]
    # Endpoint and backend are fixed for these accuracy runs.
    argv += ["--endpoint", "/v1/chat/completions", "--backend", "openai-chat-omni"]
    # Load shape.
    argv += ["--request-rate", "inf"]
    argv += ["--num-prompts", str(num_prompts), "--max-concurrency", str(max_concurrency)]
    argv.append("--no-oversample")
    argv += ["--num-warmups", str(num_warmups), "--percentile-metrics", percentile_metrics]
    # Result persistence.
    argv += ["--save-result", "--result-dir", str(result_dir), "--result-filename", result_filename]

    if ready_check_timeout_sec is None:
        return argv
    argv += ["--ready-check-timeout-sec", str(int(ready_check_timeout_sec))]
    return argv


def assert_daily_omni_scored(result: dict[str, Any]) -> None:
    """Assert that a benchmark result carries a Daily-Omni score over at least one row.

    Raises:
        AssertionError: If ``daily_omni_accuracy`` is absent/``None`` or
            ``daily_omni_evaluated_ok`` is missing, falsy or not positive.
    """
    accuracy_present = result.get("daily_omni_accuracy") is not None
    assert accuracy_present, "daily_omni_accuracy missing — wrong dataset or benchmark wiring"
    # A missing or ``None`` counter is treated as zero rows.
    scored_rows = int(result.get("daily_omni_evaluated_ok") or 0)
    assert scored_rows > 0, "no successful MCQ rows (daily_omni_evaluated_ok==0)"


def assert_seed_tts_scored(result: dict[str, Any]) -> None:
    """Assert that a benchmark result shows Seed-TTS content was actually evaluated.

    Raises:
        AssertionError: If ``seed_tts_eval_setup_error`` is truthy, or
            ``seed_tts_content_evaluated`` is missing, falsy or not positive.
    """
    err = result.get("seed_tts_eval_setup_error")
    assert not err, f"Seed-TTS eval deps/setup failed: {err}"
    # A missing or ``None`` counter is treated as zero evaluated items.
    evaluated_items = int(result.get("seed_tts_content_evaluated") or 0)
    assert evaluated_items > 0, (
        "seed_tts_content_evaluated==0 — enable WER eval and check PCM capture / modalities"
    )
Loading
Loading