From f44209b7bb8736b7e5f0fe9cc182f1f7ae41813b Mon Sep 17 00:00:00 2001 From: amy-why-3459 Date: Mon, 20 Apr 2026 20:15:49 +0800 Subject: [PATCH] Add accuracy benchmark L4 test cases Signed-off-by: amy-why-3459 --- .buildkite/test-nightly.yml | 47 +- pyproject.toml | 14 +- tests/benchmarks/test_accuracy_bench_utils.py | 61 +++ tests/e2e/accuracy/qwen3_omni/__init__.py | 2 + .../qwen3_omni/qwen3_omni_acc_bench_core.py | 201 ++++++++ .../qwen3_omni/run_qwen_omni_acc_benchmark.py | 428 ++++++++++++++++++ .../accuracy/qwen3_omni/test_qwen3_omni.py | 137 ++++++ .../data_modules/daily_omni_dataset.py | 148 +++++- .../data_modules/daily_omni_eval.py | 13 +- .../data_modules/seed_tts_dataset.py | 13 +- .../benchmarks/data_modules/seed_tts_eval.py | 6 +- vllm_omni/entrypoints/cli/benchmark/serve.py | 4 +- 12 files changed, 1044 insertions(+), 30 deletions(-) create mode 100644 tests/e2e/accuracy/qwen3_omni/__init__.py create mode 100644 tests/e2e/accuracy/qwen3_omni/qwen3_omni_acc_bench_core.py create mode 100644 tests/e2e/accuracy/qwen3_omni/run_qwen_omni_acc_benchmark.py create mode 100644 tests/e2e/accuracy/qwen3_omni/test_qwen3_omni.py diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml index e6912bd49c7..82d86266197 100644 --- a/.buildkite/test-nightly.yml +++ b/.buildkite/test-nightly.yml @@ -105,6 +105,51 @@ steps: path: /mnt/hf-cache type: DirectoryOrCreate + - label: ":full_moon: Omni · Accuracy Test" + timeout_in_minutes: 180 + commands: + - export SEED_TTS_WER_EVAL=1 + - export SEED_TTS_EVAL_DEVICE=cuda:1 + - | + set +e + pytest -s -v tests/e2e/accuracy/qwen3_omni/test_qwen3_omni.py -m "full_model" --run-level full_model + EXIT=$$? 
+ buildkite-agent artifact upload "tests/e2e/accuracy/qwen3_omni/results/qwen_omni_acc/*.json" + exit $$EXIT + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 2 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + - label: ":full_moon: Omni · Perf Test" key: nightly-omni-performance timeout_in_minutes: 180 @@ -514,7 +559,7 @@ steps: - label: ":full_moon: Diffusion X2V · Function Test" timeout_in_minutes: 90 commands: - - pytest -s -v tests/e2e/online_serving/test_wan22_expansion.py tests/e2e/online_serving/test_wan_2_1_vace_expansion.py tests/e2e/online_serving/test_hunyuan_video_15_expansion.py -m "full_model" --run-level "full_model" + - pytest -s -v tests/e2e/online_serving/test_wan22_expansion.py tests/e2e/online_serving/test_wan_2_1_vace_expansion.py tests/e2e/online_serving/test_hunyuan_video_15_expansion.py --run-level "full_model" agents: queue: "mithril-h100-pool" plugins: diff --git a/pyproject.toml b/pyproject.toml index c7b2be20e77..8346693f129 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,23 +56,17 @@ dev = [ "opencc>=1.2.0", "mistune>=3.2.0", # for example tests "torchmetrics>=1.4.0", # for accuracy similarity metrics -] - -demo = [ - "gradio>=6.7.0", -] - -# Seed-TTS serve benchmark WER (BytedanceSpeech/seed-tts-eval run_wer.py protocol). 
-seed-tts-eval = [ "jiwer>=3.0.0", "zhon>=2.0.0", "zhconv>=1.4.2", "scipy>=1.10.0", - "soundfile>=0.12.0", - "transformers>=4.36.0", "funasr>=1.0.0", ] +demo = [ + "gradio>=6.7.0", +] + docs = [ "mkdocs>=1.5.0", "mkdocs-api-autonav", diff --git a/tests/benchmarks/test_accuracy_bench_utils.py b/tests/benchmarks/test_accuracy_bench_utils.py index 8425804aeaa..6ceebb11b79 100644 --- a/tests/benchmarks/test_accuracy_bench_utils.py +++ b/tests/benchmarks/test_accuracy_bench_utils.py @@ -1,6 +1,9 @@ # ruff: noqa: E402, I001 +import argparse import math +import os import sys +import types from pathlib import Path import pytest @@ -39,6 +42,64 @@ summarize_generated_records as summarize_gebench_generated_records, summarize_gebench_results, ) +from tests.e2e.accuracy.qwen3_omni.qwen3_omni_acc_bench_core import seed_tts_bench_argv +from tests.e2e.accuracy.qwen3_omni.run_qwen_omni_acc_benchmark import sync_dataset_env_from_ns +from vllm_omni.benchmarks.data_modules.seed_tts_dataset import resolve_seed_tts_root + + +def test_seed_tts_bench_argv_preserves_hf_repo_id_from_env(monkeypatch): + monkeypatch.setenv("VLLM_SEED_TTS_DATASET_PATH", "zhaochenyang20/seed-tts-eval") + monkeypatch.delenv("VLLM_SEED_TTS_REPO", raising=False) + + argv = seed_tts_bench_argv(locale="en") + + dataset_idx = argv.index("--dataset-path") + assert argv[dataset_idx + 1] == "zhaochenyang20/seed-tts-eval" + + +def test_sync_dataset_env_preserves_seed_tts_hf_repo_id(monkeypatch): + ns = argparse.Namespace( + daily_omni_repo=None, + daily_omni_qa_json=None, + daily_omni_video_dir=None, + seed_tts_dataset_path="zhaochenyang20/seed-tts-eval", + seed_tts_root=None, + ) + + monkeypatch.delenv("VLLM_SEED_TTS_DATASET_PATH", raising=False) + sync_dataset_env_from_ns(ns) + + assert os.environ["VLLM_SEED_TTS_DATASET_PATH"] == "zhaochenyang20/seed-tts-eval" + + +def test_resolve_seed_tts_root_downloads_only_requested_locale(monkeypatch, tmp_path: Path): + downloaded_root = tmp_path / "seed_tts_cache" + 
(downloaded_root / "zh" / "prompt-wavs").mkdir(parents=True) + (downloaded_root / "zh" / "meta.lst").write_text("", encoding="utf-8") + captured: dict[str, object] = {} + + def fake_snapshot_download(*, repo_id, repo_type, allow_patterns): + captured["repo_id"] = repo_id + captured["repo_type"] = repo_type + captured["allow_patterns"] = allow_patterns + return str(downloaded_root) + + monkeypatch.setitem( + sys.modules, + "huggingface_hub", + types.SimpleNamespace(snapshot_download=fake_snapshot_download), + ) + + resolved = resolve_seed_tts_root( + "zhaochenyang20/seed-tts-eval", + explicit_root=None, + locale="zh", + ) + + assert resolved == downloaded_root.resolve() + assert captured["repo_id"] == "zhaochenyang20/seed-tts-eval" + assert captured["repo_type"] == "dataset" + assert captured["allow_patterns"] == ["zh/**"] def test_summarize_gebench_generated_records_groups_by_type(): diff --git a/tests/e2e/accuracy/qwen3_omni/__init__.py b/tests/e2e/accuracy/qwen3_omni/__init__.py new file mode 100644 index 00000000000..79a31c4f100 --- /dev/null +++ b/tests/e2e/accuracy/qwen3_omni/__init__.py @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Qwen3-Omni accuracy benchmarks (Daily-Omni / Seed-TTS ``vllm bench serve --omni``).""" diff --git a/tests/e2e/accuracy/qwen3_omni/qwen3_omni_acc_bench_core.py b/tests/e2e/accuracy/qwen3_omni/qwen3_omni_acc_bench_core.py new file mode 100644 index 00000000000..2ce86d504f0 --- /dev/null +++ b/tests/e2e/accuracy/qwen3_omni/qwen3_omni_acc_bench_core.py @@ -0,0 +1,201 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Shared helpers for Qwen3-Omni Daily-Omni / Seed-TTS ``vllm bench serve --omni`` accuracy runs. + +Local dataset paths are **optional**. When ``VLLM_DAILY_OMNI_QA_JSON`` + ``VLLM_DAILY_OMNI_VIDEO_DIR`` +point to existing files, those are used with inline video. 
Otherwise the benchmark falls back to +the HuggingFace dataset id (``liarliar/Daily-Omni``); QA loads via ``datasets``, and the first +bench request that needs media downloads ``Videos.tar`` from the Hub when no video dir is set. + +Similarly for Seed-TTS: a local directory wins; otherwise ``--dataset-path`` uses the Hub id +and ``huggingface_hub.snapshot_download`` inside ``resolve_seed_tts_root`` pulls files on demand. + +Use :func:`build_acc_benchmark_cli_argv` to assemble ``argv`` for a live Omni server (host/port/model +and small bench defaults) before ``parse_args`` / ``run_acc_benchmark`` in the accuracy driver. +""" + +from __future__ import annotations + +import json +import os +import shutil +import subprocess +from pathlib import Path +from typing import Any, Protocol + +DEFAULT_DAILY_OMNI_HF_REPO = "liarliar/Daily-Omni" +DEFAULT_SEED_TTS_HF_REPO = "zhaochenyang20/seed-tts-eval" + + +class OmniBenchServerEndpoint(Protocol): + """Anything with ``host`` / ``port`` / ``model`` (e.g. :class:`tests.conftest.OmniServer`).""" + + host: str + port: int + model: str + + +def build_acc_benchmark_cli_argv( + server: OmniBenchServerEndpoint, + *, + skip_seed: bool, + skip_daily: bool, + num_prompts: int | None = None, + max_concurrency: int | None = None, +) -> list[str]: + """Prefix argv for :func:`run_qwen_omni_acc_benchmark.parse_acc_benchmark_args` + :func:`run_acc_benchmark`. + + Wires ``--host`` / ``--port`` / ``--model`` to a running Omni OpenAI server, sets small + ``--num-prompts`` / ``--max-concurrency`` defaults (overridable via ``ACC_BENCH_NUM_PROMPTS`` / + ``ACC_BENCH_MAX_CONCURRENCY``), and when Daily-Omni runs adds ``--daily-omni-repo`` so Hub QA + matches :func:`daily_omni_bench_argv` once ``run_acc_benchmark`` mirrors ``--daily-omni-repo`` into env. 
+ """ + n_prompts = int(os.environ.get("ACC_BENCH_NUM_PROMPTS", "2000")) if num_prompts is None else int(num_prompts) + n_conc = int(os.environ.get("ACC_BENCH_MAX_CONCURRENCY", "10")) if max_concurrency is None else int(max_concurrency) + argv = [ + "--host", + server.host, + "--port", + str(server.port), + "--model", + server.model, + "--num-prompts", + str(n_prompts), + "--max-concurrency", + str(n_conc), + ] + if not skip_daily: + repo = os.environ.get("VLLM_DAILY_OMNI_REPO", DEFAULT_DAILY_OMNI_HF_REPO).strip() or DEFAULT_DAILY_OMNI_HF_REPO + argv.extend(["--daily-omni-repo", repo]) + if skip_seed: + argv.append("--skip-seed-tts") + if skip_daily: + argv.append("--skip-daily-omni") + return argv + + +def daily_omni_bench_argv() -> list[str]: + """CLI args for Daily-Omni (after ``vllm bench serve --omni``).""" + qa = os.environ.get("VLLM_DAILY_OMNI_QA_JSON", "").strip() + vd = os.environ.get("VLLM_DAILY_OMNI_VIDEO_DIR", "").strip() + if qa and vd: + qap = Path(qa).expanduser() + vdp = Path(vd).expanduser() + if qap.is_file() and vdp.is_dir(): + return [ + "--dataset-name", + "daily-omni", + "--daily-omni-qa-json", + str(qap.resolve()), + "--daily-omni-video-dir", + str(vdp.resolve()), + "--daily-omni-inline-local-video", + ] + repo = os.environ.get("VLLM_DAILY_OMNI_REPO", DEFAULT_DAILY_OMNI_HF_REPO).strip() or DEFAULT_DAILY_OMNI_HF_REPO + return [ + "--dataset-name", + "daily-omni", + "--dataset-path", + repo, + ] + + +def seed_tts_bench_argv(*, locale: str = "en") -> list[str]: + """CLI args for Seed-TTS (after ``vllm bench serve --omni``).""" + dp = os.environ.get("VLLM_SEED_TTS_DATASET_PATH", "").strip() + if dp: + p = Path(dp).expanduser() + # Preserve Hugging Face repo ids verbatim. Only canonicalize to an + # absolute path when the value actually exists as a local directory. 
+ dataset_path = str(p.resolve()) if p.exists() and p.is_dir() else dp + else: + dataset_path = ( + os.environ.get("VLLM_SEED_TTS_REPO", DEFAULT_SEED_TTS_HF_REPO).strip() or DEFAULT_SEED_TTS_HF_REPO + ) + out = ["--dataset-name", "seed-tts", "--dataset-path", dataset_path] + root = os.environ.get("SEED_TTS_ROOT", "").strip() + if root: + out.extend(["--seed-tts-root", str(Path(root).expanduser().resolve())]) + out.extend(["--seed-tts-locale", locale]) + return out + + +def find_vllm_cli() -> str: + exe = shutil.which("vllm") + if not exe: + raise FileNotFoundError("Could not find `vllm` on PATH (install vLLM-Omni with CLI entrypoints).") + return exe + + +def run_vllm_bench_subprocess(vllm: str, argv: list[str], *, extra_env: dict[str, str] | None = None) -> None: + env = os.environ.copy() + if extra_env: + env.update(extra_env) + subprocess.run([vllm, *argv], env=env, check=True) + + +def load_benchmark_result(path: Path) -> dict[str, Any]: + with path.open(encoding="utf-8") as f: + return json.load(f) + + +def build_serve_common_argv( + *, + host: str, + port: int, + model: str, + num_prompts: int, + max_concurrency: int, + num_warmups: int, + percentile_metrics: str, + result_dir: Path, + result_filename: str, + ready_check_timeout_sec: int | None = None, +) -> list[str]: + out = [ + "bench", + "serve", + "--omni", + "--host", + host, + "--port", + str(port), + "--model", + model, + "--endpoint", + "/v1/chat/completions", + "--backend", + "openai-chat-omni", + "--request-rate", + "inf", + "--num-prompts", + str(num_prompts), + "--max-concurrency", + str(max_concurrency), + "--no-oversample", + "--num-warmups", + str(num_warmups), + "--percentile-metrics", + percentile_metrics, + "--save-result", + "--result-dir", + str(result_dir), + "--result-filename", + result_filename, + ] + if ready_check_timeout_sec is not None: + out.extend(["--ready-check-timeout-sec", str(int(ready_check_timeout_sec))]) + return out + + +def assert_daily_omni_scored(result: dict[str, 
Any]) -> None: + acc = result.get("daily_omni_accuracy") + assert acc is not None, "daily_omni_accuracy missing — wrong dataset or benchmark wiring" + assert int(result.get("daily_omni_evaluated_ok", 0) or 0) > 0, "no successful MCQ rows (daily_omni_evaluated_ok==0)" + + +def assert_seed_tts_scored(result: dict[str, Any]) -> None: + err = result.get("seed_tts_eval_setup_error") + assert not err, f"Seed-TTS eval deps/setup failed: {err}" + assert int(result.get("seed_tts_content_evaluated", 0) or 0) > 0, ( + "seed_tts_content_evaluated==0 — enable WER eval and check PCM capture / modalities" + ) diff --git a/tests/e2e/accuracy/qwen3_omni/run_qwen_omni_acc_benchmark.py b/tests/e2e/accuracy/qwen3_omni/run_qwen_omni_acc_benchmark.py new file mode 100644 index 00000000000..d30457dcd28 --- /dev/null +++ b/tests/e2e/accuracy/qwen3_omni/run_qwen_omni_acc_benchmark.py @@ -0,0 +1,428 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 +"""Accuracy (and light perf) checks for Qwen3-Omni via ``vllm bench serve --omni``. + +The standalone CLI defaults to ``--num-prompts 2000`` / ``--max-concurrency 10``; lower them for +L4-style smoke runs against an already-running server. The pytest wrappers in +``tests/e2e/accuracy/qwen3_omni/test_qwen3_omni.py`` may still require larger GPUs (currently +H100 / MI325) because they launch the live Omni server inside the test. + +1. **Daily-Omni** — MCQ accuracy fields in the saved JSON (``daily_omni_accuracy``, …); by default the + run **fails** if accuracy is strictly below **0.69** (``--min-daily-omni-accuracy`` / ``ACC_BENCH_MIN_DAILY_OMNI_ACCURACY``). +2. **Seed-TTS** — ``seed-tts-eval``-style metrics when ``--seed-tts-wer-eval`` is used + (WER / SIM / UTMOS keys from :func:`compute_seed_tts_wer_metrics`). 
+ +Prerequisites +------------- +* A running Omni OpenAI-compatible server (same machine or reachable host), e.g.:: + + vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8000 + + On L4 you may need a smaller checkpoint, quantization, or tighter engine flags; this script + only drives the **client** benchmark. + +* ``vllm`` CLI from **vLLM-Omni** (so ``bench serve`` registers ``daily-omni`` / ``seed-tts``). + +* **Daily-Omni** — if local ``qa.json`` + ``Videos/`` are not both provided (CLI or matching env), + the client passes ``--dataset-path`` with a Hub id (default ``liarliar/Daily-Omni``). The **child** + ``vllm bench serve`` process then loads QA via ``datasets.load_dataset`` (needs ``pip install datasets``, + network or HF cache). Without ``--daily-omni-video-dir``, the benchmark **lazily** downloads and + extracts ``Videos.tar`` from the Hub (``huggingface_hub``) on first multimodal request. Override + the dataset repo with ``--daily-omni-repo`` or ``VLLM_DAILY_OMNI_REPO``; override the tar repo + with ``VLLM_DAILY_OMNI_MEDIA_REPO`` if needed. 
+ +* **Seed-TTS** WER/SIM/UTMOS eval dependencies (now part of the ``dev`` extra):: + + pip install 'vllm-omni[dev]' + +Examples +-------- +Pytest (same checks; the test module launches its own server):: + + pytest -sv tests/e2e/accuracy/qwen3_omni/test_qwen3_omni.py + +Smoke on localhost (server already up):: + + python tests/e2e/accuracy/qwen3_omni/run_qwen_omni_acc_benchmark.py \\ + --model Qwen/Qwen3-Omni-30B-A3B-Instruct \\ + --daily-omni-qa-json ./qa.json \\ + --daily-omni-video-dir ./Videos \\ + --seed-tts-dataset-path ./seed-tts-eval + +Skip one suite, tighten gates:: + + python tests/e2e/accuracy/qwen3_omni/run_qwen_omni_acc_benchmark.py \\ + --skip-daily-omni \\ + --max-seed-tts-mean-wer 0.35 \\ + --min-seed-tts-mean-sim 0.75 +""" + +from __future__ import annotations + +import argparse +import contextlib +import json +import os +import sys +from datetime import datetime +from pathlib import Path +from typing import Any + +from tests.e2e.accuracy.qwen3_omni.qwen3_omni_acc_bench_core import ( + build_serve_common_argv, + daily_omni_bench_argv, + find_vllm_cli, + load_benchmark_result, + run_vllm_bench_subprocess, + seed_tts_bench_argv, +) + +_REPO_ROOT = Path(__file__).resolve().parents[4] + + +def _repo_root() -> Path: + return _REPO_ROOT + + +def _default_result_dir() -> Path: + return Path(__file__).resolve().parent / "results" / "qwen_omni_acc" + + +def _validate_daily_omni(result: dict[str, Any], *, min_accuracy: float | None) -> list[str]: + errs: list[str] = [] + acc = result.get("daily_omni_accuracy") + if acc is None: + errs.append("Missing daily_omni_accuracy (wrong dataset or no gold-evaluated rows).") + return errs + ev = int(result.get("daily_omni_evaluated_ok", 0) or 0) + if ev <= 0: + errs.append("daily_omni_evaluated_ok is 0; no successful MCQ rows to score.") + if min_accuracy is not None and float(acc) + 1e-12 < float(min_accuracy): + errs.append(f"daily_omni_accuracy={acc:.6f} < --min-daily-omni-accuracy={min_accuracy}") + return errs + + +def _validate_seed_tts( + result: 
dict[str, Any], + *, + max_mean_wer: float | None, + min_mean_sim: float | None, + min_mean_utmos: float | None, +) -> list[str]: + errs: list[str] = [] + setup = result.get("seed_tts_eval_setup_error") + if setup: + errs.append(f"Seed-TTS eval setup failed: {setup}") + return errs + n = int(result.get("seed_tts_content_evaluated", 0) or 0) + if n <= 0: + errs.append("seed_tts_content_evaluated is 0 (enable --seed-tts-wer-eval and check PCM capture).") + mean_wer = result.get("seed_tts_content_error_mean") + if mean_wer is not None and max_mean_wer is not None and float(mean_wer) > float(max_mean_wer) + 1e-12: + errs.append(f"seed_tts_content_error_mean (WER)={mean_wer:.6f} > --max-seed-tts-mean-wer={max_mean_wer}") + sim_m = result.get("seed_tts_sim_mean") + if sim_m is not None and min_mean_sim is not None and float(sim_m) + 1e-12 < float(min_mean_sim): + errs.append(f"seed_tts_sim_mean={sim_m:.6f} < --min-seed-tts-mean-sim={min_mean_sim}") + ut_m = result.get("seed_tts_utmos_mean") + if ut_m is not None and min_mean_utmos is not None and float(ut_m) + 1e-12 < float(min_mean_utmos): + errs.append(f"seed_tts_utmos_mean={ut_m:.6f} < --min-seed-tts-mean-utmos={min_mean_utmos}") + return errs + + +def sync_dataset_env_from_ns(ns: argparse.Namespace) -> None: + """Mirror CLI path flags into env vars read by ``daily_omni_bench_argv`` / ``seed_tts_bench_argv``.""" + repo = getattr(ns, "daily_omni_repo", None) + if repo is not None and str(repo).strip(): + os.environ["VLLM_DAILY_OMNI_REPO"] = str(repo).strip() + if ns.daily_omni_qa_json is not None: + os.environ["VLLM_DAILY_OMNI_QA_JSON"] = str(Path(ns.daily_omni_qa_json).expanduser().resolve()) + if ns.daily_omni_video_dir is not None: + os.environ["VLLM_DAILY_OMNI_VIDEO_DIR"] = str(Path(ns.daily_omni_video_dir).expanduser().resolve()) + if ns.seed_tts_dataset_path is not None: + # ``--seed-tts-dataset-path`` accepts either a local directory or a + # Hugging Face repo id. 
Only resolve to an absolute filesystem path + # when the value actually exists locally; otherwise preserve the repo + # string verbatim so downstream code can pass it to snapshot_download. + raw = str(ns.seed_tts_dataset_path).strip() + p = Path(raw).expanduser() + os.environ["VLLM_SEED_TTS_DATASET_PATH"] = str(p.resolve()) if p.exists() and p.is_dir() else raw + if ns.seed_tts_root is not None: + os.environ["SEED_TTS_ROOT"] = str(Path(ns.seed_tts_root).expanduser().resolve()) + + +@contextlib.contextmanager +def _preserve_benchmark_dataset_env() -> Any: + """Save/restore dataset-related env vars so benchmark tests don't leak state.""" + keys = ( + "VLLM_DAILY_OMNI_REPO", + "VLLM_DAILY_OMNI_QA_JSON", + "VLLM_DAILY_OMNI_VIDEO_DIR", + "VLLM_SEED_TTS_DATASET_PATH", + "SEED_TTS_ROOT", + ) + original = {k: os.environ.get(k) for k in keys} + try: + yield + finally: + for key, value in original.items(): + if value is None: + os.environ.pop(key, None) + else: + os.environ[key] = value + + +def _build_common_args(ns: argparse.Namespace, *, result_filename: str) -> list[str]: + return build_serve_common_argv( + host=ns.host, + port=ns.port, + model=ns.model, + num_prompts=ns.num_prompts, + max_concurrency=ns.max_concurrency, + num_warmups=ns.num_warmups, + percentile_metrics=ns.percentile_metrics, + result_dir=ns.result_dir, + result_filename=result_filename, + ready_check_timeout_sec=ns.ready_check_timeout_sec, + ) + + +def run_daily_omni(ns: argparse.Namespace, vllm: str) -> Path: + ns.result_dir.mkdir(parents=True, exist_ok=True) + tag = datetime.now().strftime("%Y%m%d-%H%M%S") + result_filename = f"qwen_omni_acc_daily_omni_{tag}.json" + extra = json.loads(ns.daily_extra_body_json) + argv = ( + _build_common_args(ns, result_filename=result_filename) + + daily_omni_bench_argv() + + [ + "--daily-omni-input-mode", + ns.daily_omni_input_mode, + "--extra-body", + json.dumps(extra, ensure_ascii=False, separators=(",", ":")), + ] + ) + if ns.daily_omni_save_eval_items: + 
argv.append("--daily-omni-save-eval-items") + print("\n$", vllm, *argv, "\n", flush=True) + run_vllm_bench_subprocess(vllm, argv) + out = Path(ns.result_dir) / result_filename + if not out.is_file(): + raise FileNotFoundError(f"Expected result JSON at {out}") + return out + + +def run_seed_tts(ns: argparse.Namespace, vllm: str) -> Path: + ns.result_dir.mkdir(parents=True, exist_ok=True) + tag = datetime.now().strftime("%Y%m%d-%H%M%S") + result_filename = f"qwen_omni_acc_seed_tts_{tag}.json" + extra = json.loads(ns.seed_extra_body_json) + argv = ( + _build_common_args(ns, result_filename=result_filename) + + seed_tts_bench_argv(locale=ns.seed_tts_locale) + + [ + "--seed-tts-wer-eval", + "--extra-body", + json.dumps(extra, ensure_ascii=False, separators=(",", ":")), + ] + ) + if ns.seed_tts_wer_save_items: + argv.append("--seed-tts-wer-save-items") + if ns.seed_tts_file_ref_audio: + argv.append("--seed-tts-file-ref-audio") + extra_env: dict[str, str] = {"SEED_TTS_WER_EVAL": "1"} + if ns.seed_tts_eval_device: + extra_env["SEED_TTS_EVAL_DEVICE"] = ns.seed_tts_eval_device + print("\n$", vllm, *argv, "\n", flush=True) + run_vllm_bench_subprocess(vllm, argv, extra_env=extra_env) + out = Path(ns.result_dir) / result_filename + if not out.is_file(): + raise FileNotFoundError(f"Expected result JSON at {out}") + return out + + +def build_arg_parser() -> argparse.ArgumentParser: + p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + p.add_argument("--host", default=os.environ.get("ACC_BENCH_HOST", "127.0.0.1")) + p.add_argument("--port", type=int, default=int(os.environ.get("ACC_BENCH_PORT", "8000"))) + p.add_argument( + "--model", + default=os.environ.get( + "ACC_BENCH_MODEL", + "Qwen/Qwen3-Omni-30B-A3B-Instruct", + ), + help="Model id passed to ``vllm bench serve`` (must match the running server).", + ) + p.add_argument("--num-prompts", type=int, default=int(os.environ.get("ACC_BENCH_NUM_PROMPTS", "2000"))) + 
p.add_argument("--max-concurrency", type=int, default=int(os.environ.get("ACC_BENCH_MAX_CONCURRENCY", "10"))) + p.add_argument("--num-warmups", type=int, default=int(os.environ.get("ACC_BENCH_NUM_WARMUPS", "0"))) + p.add_argument( + "--percentile-metrics", + default=os.environ.get("ACC_BENCH_PERCENTILE_METRICS", "ttft,tpot,itl,e2el,audio_ttfp,audio_rtf"), + ) + p.add_argument( + "--ready-check-timeout-sec", + type=int, + default=None, + help="If set, forwarded to ``vllm bench serve`` (probe first request until success). " + "Omit to use upstream default (typically skip).", + ) + p.add_argument( + "--result-dir", + type=Path, + default=Path(os.environ.get("ACC_BENCH_RESULT_DIR", str(_default_result_dir()))), + ) + + p.add_argument("--skip-daily-omni", action="store_true") + p.add_argument("--skip-seed-tts", action="store_true") + + p.add_argument( + "--daily-omni-repo", + type=str, + default=None, + help="Hugging Face dataset id for Daily-Omni Hub mode (sets VLLM_DAILY_OMNI_REPO). " + "Ignored when local qa.json + video dir are used.", + ) + p.add_argument( + "--daily-omni-qa-json", + type=Path, + default=None, + help="Optional local qa.json; if omitted with no env, uses Hub liarliar/Daily-Omni.", + ) + p.add_argument( + "--daily-omni-video-dir", + type=Path, + default=None, + help="Optional local Videos root; if omitted, media is fetched lazily from Hub Videos.tar.", + ) + p.add_argument("--daily-omni-input-mode", choices=("all", "visual", "audio"), default="all") + p.add_argument( + "--daily-extra-body-json", + default='{"modalities":["text"]}', + help="JSON merged into each chat request for Daily-Omni (default matches common L4 / text-output runs).", + ) + p.add_argument( + "--daily-omni-save-eval-items", + action="store_true", + help="Sets env via CLI flag so per-item rows are stored in the result JSON.", + ) + p.add_argument( + "--min-daily-omni-accuracy", + type=float, + default=float((os.environ.get("ACC_BENCH_MIN_DAILY_OMNI_ACCURACY") or "0.69").strip() or 
"0.69"), + help="Fail when daily_omni_accuracy is strictly below this threshold (0–1). " + "Default baseline 0.69; override with env ACC_BENCH_MIN_DAILY_OMNI_ACCURACY or pass 0 to disable the floor.", + ) + + p.add_argument( + "--seed-tts-dataset-path", + type=str, + default=None, + help="Optional local root or Hub id; if omitted, uses zhaochenyang20/seed-tts-eval.", + ) + p.add_argument("--seed-tts-root", type=Path, default=None, help="Optional override for Seed-TTS filesystem root.") + p.add_argument("--seed-tts-locale", choices=("en", "zh"), default="en") + p.add_argument( + "--seed-extra-body-json", + default='{"modalities":["text","audio"]}', + help="JSON for Seed-TTS chat requests (must include audio for synthesis + PCM capture).", + ) + p.add_argument("--seed-tts-wer-save-items", action="store_true") + p.add_argument( + "--seed-tts-file-ref-audio", + action="store_true", + help="Use file:// ref_audio; server must allow local media paths.", + ) + p.add_argument( + "--seed-tts-eval-device", + default=os.environ.get("SEED_TTS_EVAL_DEVICE"), + help="Sets SEED_TTS_EVAL_DEVICE for Whisper / WavLM / UTMOS (e.g. 
cuda:0).", + ) + p.add_argument( + "--max-seed-tts-mean-wer", + type=float, + default=0.02, + help="If set, fail when seed_tts_content_error_mean is strictly above this value.", + ) + p.add_argument( + "--min-seed-tts-mean-sim", + type=float, + default=None, + help="If set, fail when seed_tts_sim_mean is strictly below this value.", + ) + p.add_argument( + "--min-seed-tts-mean-utmos", + type=float, + default=None, + help="If set, fail when seed_tts_utmos_mean is strictly below this value.", + ) + return p + + +def parse_acc_benchmark_args(argv: list[str] | None = None) -> argparse.Namespace: + """Parse CLI args; when ``argv`` is ``None``, use ``sys.argv[1:]`` (standalone script).""" + if argv is None: + argv = sys.argv[1:] + return build_arg_parser().parse_args(argv) + + +def run_acc_benchmark(ns: argparse.Namespace) -> int: + """Run Daily-Omni and/or Seed-TTS client benches against a running server; return 0 on success.""" + failed: list[str] = [] + + with _preserve_benchmark_dataset_env(): + sync_dataset_env_from_ns(ns) + + vllm = find_vllm_cli() + print(f"Using vLLM CLI: {vllm}", flush=True) + print(f"Repo root (for cwd reference): {_repo_root()}", flush=True) + + if not ns.skip_daily_omni: + path = run_daily_omni(ns, vllm) + print(f"\n[Daily-Omni] result JSON: {path}", flush=True) + data = load_benchmark_result(path) + errs = _validate_daily_omni(data, min_accuracy=ns.min_daily_omni_accuracy) + if errs: + failed.extend([f"[Daily-Omni] {e}" for e in errs]) + else: + print( + f"[Daily-Omni] daily_omni_accuracy={data.get('daily_omni_accuracy')} " + f"evaluated_ok={data.get('daily_omni_evaluated_ok')}", + flush=True, + ) + + if not ns.skip_seed_tts: + path = run_seed_tts(ns, vllm) + print(f"\n[Seed-TTS] result JSON: {path}", flush=True) + data = load_benchmark_result(path) + errs = _validate_seed_tts( + data, + max_mean_wer=ns.max_seed_tts_mean_wer, + min_mean_sim=ns.min_seed_tts_mean_sim, + min_mean_utmos=ns.min_seed_tts_mean_utmos, + ) + if errs: + 
failed.extend([f"[Seed-TTS] {e}" for e in errs]) + else: + print( + f"[Seed-TTS] mean_wer={data.get('seed_tts_content_error_mean')} " + f"mean_sim={data.get('seed_tts_sim_mean')} mean_utmos={data.get('seed_tts_utmos_mean')} " + f"evaluated={data.get('seed_tts_content_evaluated')}", + flush=True, + ) + + if failed: + print("\nACCURACY CHECK FAILED:", file=sys.stderr) + for line in failed: + print(f" - {line}", file=sys.stderr) + return 1 + + print("\nAll configured accuracy checks passed.", flush=True) + return 0 + + +def main() -> int: + return run_acc_benchmark(parse_acc_benchmark_args()) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/e2e/accuracy/qwen3_omni/test_qwen3_omni.py b/tests/e2e/accuracy/qwen3_omni/test_qwen3_omni.py new file mode 100644 index 00000000000..773f7c1108c --- /dev/null +++ b/tests/e2e/accuracy/qwen3_omni/test_qwen3_omni.py @@ -0,0 +1,137 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Qwen3-Omni accuracy benchmarks (Daily-Omni MCQ + Seed-TTS WER) via ``vllm bench serve --omni``. + +Starts a **module-scoped** Omni OpenAI-compatible server (same pattern as ``tests/dfx/perf`` and +``tests/e2e/online_serving/test_qwen3_omni.py``), then runs the client benches against +``omni_server.host`` / ``omni_server.port`` / ``omni_server.model``. + +**Daily-Omni from Hugging Face:** unless ``VLLM_DAILY_OMNI_QA_JSON`` and ``VLLM_DAILY_OMNI_VIDEO_DIR`` +point at a full local tree, the bench uses ``--dataset-path`` (default ``liarliar/Daily-Omni`` via +``VLLM_DAILY_OMNI_REPO`` / ``--daily-omni-repo``). QA loads through ``datasets``; ``Videos.tar`` is +downloaded and extracted under ``HF_HOME`` on demand. The tests patch in +``--daily-omni-inline-local-video`` so multimodal payloads use data URLs (no +``--allowed-local-media-path`` on the server). 
Use small ``--num-prompts`` defaults suitable for CI +(override with ``ACC_BENCH_NUM_PROMPTS`` / ``ACC_BENCH_MAX_CONCURRENCY``; see +:func:`tests.e2e.accuracy.qwen3_omni.qwen3_omni_acc_bench_core.build_acc_benchmark_cli_argv`). + +This package lives under ``tests/e2e/accuracy/qwen3_omni/``, so pytest still loads +``tests/e2e/accuracy/conftest.py``, which imports ``tests.conftest`` (heavy deps: ``vllm``, ``torch``, …). +A broken or partial install can therefore **fail during collection** before these tests run. + +If ``vllm`` is not on ``PATH``, the tests **skip** instead of erroring. Without +``VLLM_SKIP_ACC_BENCH=1``, a failed bench still yields a **failed** run (non-zero subprocess exit). + +Run:: + + pytest -sv tests/e2e/accuracy/qwen3_omni/test_qwen3_omni.py + +Only the subprocess accuracy marker:: + + pytest -sv tests/e2e/accuracy/qwen3_omni/test_qwen3_omni.py -m qwen3_omni_acc + +Skip when you do not have GPUs, a server, or datasets (CI opt-out):: + + VLLM_SKIP_ACC_BENCH=1 pytest -sv tests/e2e/accuracy/qwen3_omni/test_qwen3_omni.py + +Standalone CLI (expects a server already up; uses ``ACC_BENCH_*`` env defaults):: + + python tests/e2e/accuracy/qwen3_omni/run_qwen_omni_acc_benchmark.py --help +""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from tests.e2e.accuracy.qwen3_omni import run_qwen_omni_acc_benchmark as _acc_bench +from tests.e2e.accuracy.qwen3_omni.qwen3_omni_acc_bench_core import ( + build_acc_benchmark_cli_argv, + find_vllm_cli, +) +from tests.helpers.mark import hardware_test +from tests.helpers.runtime import OmniServerParams +from tests.helpers.stage_config import get_deploy_config_path, modify_stage_config +from vllm_omni.platforms import current_omni_platform + +_E2E_ROOT = Path(__file__).resolve().parent.parent.parent + +models = ["Qwen/Qwen3-Omni-30B-A3B-Instruct"] + +pytestmark = [pytest.mark.full_model, pytest.mark.omni] + +_CI_DEPLOY = get_deploy_config_path("ci/qwen3_omni_moe.yaml") + + +def 
get_chunk_config(config_path: str | None = None): + """Load the qwen3_omni CI deploy yaml with async_chunk modifications for streaming mode.""" + if config_path is None: + config_path = _CI_DEPLOY + # TODO: remove this workaround once legacy `stage_args` path is deleted. + # The pipeline (qwen3_omni/pipeline.py) already wires + # thinker2talker_async_chunk / talker2code2wav_async_chunk on stage 0/1, + # so only async_chunk needs flipping. Writing nested `engine_args:` into + # the new-schema overlay trips _parse_stage_deploy's legacy branch and + # drops flat fields (load_format, max_num_seqs, ...). + return modify_stage_config(config_path, updates={"async_chunk": True}) + + +if current_omni_platform.is_xpu(): + stage_configs = [_CI_DEPLOY] +else: # CUDA + ROCm MI325 share the same deploy config + stage_configs = [get_chunk_config()] + +test_params = [ + OmniServerParams(model=model, stage_config_path=stage_config) for model in models for stage_config in stage_configs +] + + +def _require_vllm_cli() -> None: + try: + find_vllm_cli() + except FileNotFoundError as exc: + pytest.skip(str(exc)) + + +@pytest.fixture(autouse=True) +def _daily_omni_hub_inline_media(monkeypatch: pytest.MonkeyPatch) -> None: + """Hub / lazy-cache mode uses local files → default ``file://`` needs server allowlist. + + ``run_qwen_omni_acc_benchmark`` binds ``daily_omni_bench_argv`` at import time; patch that copy + so we append ``--daily-omni-inline-local-video`` whenever the core helper did not already set it + (local qa.json + video-dir mode already passes the flag). 
+ """ + orig = _acc_bench.daily_omni_bench_argv + + def _wrapped() -> list[str]: + out = list(orig()) + if "--daily-omni-inline-local-video" not in out: + out.append("--daily-omni-inline-local-video") + return out + + monkeypatch.setattr(_acc_bench, "daily_omni_bench_argv", _wrapped) + monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") + monkeypatch.setenv("VLLM_TEST_CLEAN_GPU_MEMORY", "0") + + +@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) +@pytest.mark.parametrize("omni_server", test_params, indirect=True) +def test_qwen3_omni_daily_omni_accuracy_bench(omni_server) -> None: + _require_vllm_cli() + pytest.importorskip("datasets") + pytest.importorskip("huggingface_hub") + ns = _acc_bench.parse_acc_benchmark_args( + build_acc_benchmark_cli_argv(omni_server, skip_seed=True, skip_daily=False) + ) + assert _acc_bench.run_acc_benchmark(ns) == 0 + + +@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) +@pytest.mark.parametrize("omni_server", test_params, indirect=True) +def test_qwen3_omni_seed_tts_wer_bench(omni_server) -> None: + _require_vllm_cli() + pytest.importorskip("huggingface_hub") + ns = _acc_bench.parse_acc_benchmark_args( + build_acc_benchmark_cli_argv(omni_server, skip_seed=False, skip_daily=True) + ) + assert _acc_bench.run_acc_benchmark(ns) == 0 diff --git a/vllm_omni/benchmarks/data_modules/daily_omni_dataset.py b/vllm_omni/benchmarks/data_modules/daily_omni_dataset.py index 01b86d0fd1e..65918414f45 100644 --- a/vllm_omni/benchmarks/data_modules/daily_omni_dataset.py +++ b/vllm_omni/benchmarks/data_modules/daily_omni_dataset.py @@ -9,7 +9,9 @@ - Local JSON file (``qa_json_path``): recommended for offline/air-gapped environments - HuggingFace datasets (``dataset_path``): legacy online mode -The videos must be separately downloaded and extracted from Videos.tar. +Video/audio files normally come from extracted ``Videos.tar``. 
When ``--daily-omni-video-dir`` +is not set, the first request that needs on-disk media downloads that archive from the Hugging Face +dataset repo (``huggingface_hub``) and caches it under ``HF_HOME``. Why ``BenchmarkDataset`` instead of ``HuggingFaceDataset``? vLLM's ``HuggingFaceDataset`` is a thin wrapper whose ``__init__`` always ends by calling @@ -52,6 +54,9 @@ import base64 import json import logging +import os +import shutil +import tarfile from dataclasses import dataclass from pathlib import Path from typing import Any, Literal @@ -73,6 +78,103 @@ logger = logging.getLogger(__name__) +def _daily_omni_hf_cache_root() -> Path: + return Path(os.environ.get("HF_HOME", Path.home() / ".cache" / "huggingface")).expanduser().resolve() + + +def _daily_omni_tar_fingerprint(tar_path: Path) -> str: + st = tar_path.stat() + return f"v1:{st.st_size}:{int(st.st_mtime_ns)}" + + +def _daily_omni_find_videos_root_in_extract(tmp: Path) -> Path: + """Return directory whose children are ``video_id`` folders with ``*_video.mp4``.""" + videos = tmp / "Videos" + if videos.is_dir(): + return videos + for child in sorted(tmp.iterdir()): + if child.is_dir() and not child.name.startswith("."): + probe = child / f"{child.name}_video.mp4" + if probe.is_file(): + return tmp + raise RuntimeError( + f"Unrecognized layout after extracting Daily-Omni Videos.tar under {tmp} " + "(expected top-level 'Videos/' or per-video_id subdirs)." + ) + + +def ensure_daily_omni_hub_videos_dir(repo_id: str) -> Path: + """Download ``Videos.tar`` from the Hugging Face dataset repo and return the ``Videos`` root. + + The returned path matches ``--daily-omni-video-dir`` (directory containing ``{video_id}/``). + + Cached under ``HF_HOME`` / ``vllm_omni/daily_omni_media/``. Reuses extraction when the + tarball fingerprint matches. + + Raises: + ImportError: if ``huggingface_hub`` is not installed. + FileNotFoundError / RuntimeError: if the archive is missing or malformed.
+ """ + rid = (repo_id or "").strip() + if not rid: + raise ValueError("repo_id is required to download Daily-Omni Videos.tar") + + try: + from huggingface_hub import hf_hub_download + except ImportError as e: + raise ImportError( + "Daily-Omni Hub media download requires huggingface_hub. " + "Install it (e.g. with vLLM) or provide --daily-omni-video-dir with a local extract." + ) from e + + safe = rid.replace("/", "__").replace("\\", "_") + staging_root = _daily_omni_hf_cache_root() / "vllm_omni" / "daily_omni_media" / safe + videos_dir = staging_root / "Videos" + marker = staging_root / ".videos_extracted" + + tar_path: Path | None = None + for fname in ("Videos.tar", "videos.tar"): + try: + tar_path = Path(hf_hub_download(repo_id=rid, filename=fname, repo_type="dataset")) + break + except Exception: + continue + if tar_path is None or not tar_path.is_file(): + raise FileNotFoundError( + f"Could not download Videos.tar from Hugging Face dataset {rid!r} (tried Videos.tar / videos.tar)." 
+ ) + + fp = _daily_omni_tar_fingerprint(tar_path) + if marker.is_file() and videos_dir.is_dir(): + try: + if marker.read_text(encoding="utf-8").strip() == fp: + next(videos_dir.iterdir()) + logger.info("Reusing cached Daily-Omni Videos extract at %s", videos_dir) + return videos_dir + except (OSError, StopIteration): + shutil.rmtree(videos_dir, ignore_errors=True) + marker.unlink(missing_ok=True) + + staging_root.mkdir(parents=True, exist_ok=True) + work = staging_root / "_extract_work" + shutil.rmtree(work, ignore_errors=True) + work.mkdir(parents=True) + try: + logger.info("Extracting Daily-Omni Videos.tar from %s (repo=%s)", tar_path, rid) + with tarfile.open(tar_path, "r:*") as tf: + tf.extractall(path=work, filter="data") + found = _daily_omni_find_videos_root_in_extract(work) + if videos_dir.exists(): + shutil.rmtree(videos_dir, ignore_errors=True) + shutil.move(str(found), str(videos_dir)) + finally: + shutil.rmtree(work, ignore_errors=True) + + marker.write_text(fp, encoding="utf-8") + logger.info("Daily-Omni Hub media ready at %s", videos_dir) + return videos_dir + + class _ListDatasetIterator: """Simple iterator wrapper around a list to mimic HuggingFace streaming dataset behavior.""" @@ -142,7 +244,10 @@ class DailyOmniDataset(BenchmarkDataset): - Local JSON file (``qa_json_path``): recommended for offline/air-gapped environments - HuggingFace datasets (``dataset_path``): legacy online mode - The videos must be separately downloaded and extracted from Videos.tar. + Video/audio files normally come from extracted ``Videos.tar``. When ``video_dir`` is not set, + the first sample that needs on-disk media downloads that archive from the Hugging Face dataset + repo (env ``VLLM_DAILY_OMNI_MEDIA_REPO`` overrides the repo id; else ``dataset_path`` or + :data:`DEFAULT_HF_DATASET_ID`). Args: qa_json_path: Path to local qa.json file (offline mode, preferred). 
When provided, @@ -151,7 +256,8 @@ class DailyOmniDataset(BenchmarkDataset): ``qa_json_path`` is not provided (legacy online mode). dataset_split: Dataset split to use (default: "train"). Used only in online mode. random_seed: Random seed for shuffling - video_dir: Directory containing extracted video files (default: None) + video_dir: Directory containing extracted video files (default: None; may be filled lazily + from Hub — see above). input_mode: Which modalities to send, matching upstream Daily-Omni ``--input_mode``: ``all`` — video + WAV (default; official audio-visual protocol); ``visual`` — video only; @@ -308,13 +414,14 @@ def _load_from_huggingface(self) -> None: "Install with: pip install datasets, or use local JSON mode instead." ) - ds = load_dataset( - self.dataset_path, - name=self.dataset_subset, - split=self.dataset_split, - streaming=self._hf_streaming, - trust_remote_code=self.trust_remote_code, - ) + load_kw: dict[str, Any] = { + "split": self.dataset_split, + "streaming": self._hf_streaming, + "trust_remote_code": self.trust_remote_code, + } + if self.dataset_subset is not None: + load_kw["name"] = self.dataset_subset + ds = load_dataset(self.dataset_path, **load_kw) if not getattr(self, "disable_shuffle", False): ds = ds.shuffle(seed=self.random_seed) self.data = ds @@ -569,6 +676,17 @@ def _local_file_to_audio_url_payload(self, audio_path: Path) -> dict[str, Any]: "audio_url": {"url": path.as_uri()}, } + def _lazy_ensure_hub_media_dir(self) -> None: + """If ``video_dir`` was not configured, download and extract ``Videos.tar`` once from HF.""" + if self.video_dir is not None: + return + repo = os.environ.get("VLLM_DAILY_OMNI_MEDIA_REPO", "").strip() + if not repo: + repo = (self.dataset_path or "").strip() + if not repo: + repo = self.DEFAULT_HF_DATASET_ID + self.video_dir = ensure_daily_omni_hub_videos_dir(repo) + def _get_video_content( self, video_id: str, @@ -589,6 +707,8 @@ def _get_video_content( url = f"https://{url.lstrip('/')}" return 
{"type": "video_url", "video_url": {"url": url}} + self._lazy_ensure_hub_media_dir() + if self.video_dir and video_id: video_path = self._resolve_local_video_path(video_id) if video_path is not None: @@ -615,7 +735,13 @@ def _get_video_content( return None def _get_audio_content(self, video_id: str) -> dict[str, Any] | None: - """Resolve extracted WAV for OpenAI-style ``audio_url`` (local files only).""" + """Resolve extracted WAV for OpenAI-style ``audio_url`` (local files under ``video_dir``). + + Uses the same tree as video (``{video_id}/{video_id}_audio.wav``), including after lazy + Hub ``Videos.tar`` extraction when ``video_dir`` was unset. + """ + self._lazy_ensure_hub_media_dir() + if not self.video_dir or not video_id: logger.warning( "Daily-Omni input_mode %r requires --daily-omni-video-dir with %s", diff --git a/vllm_omni/benchmarks/data_modules/daily_omni_eval.py b/vllm_omni/benchmarks/data_modules/daily_omni_eval.py index ecc9edc8445..f191cf2febc 100644 --- a/vllm_omni/benchmarks/data_modules/daily_omni_eval.py +++ b/vllm_omni/benchmarks/data_modules/daily_omni_eval.py @@ -5,7 +5,8 @@ **Alignment with open-source** (`Lliar-liar/Daily-Omni` ``test_model/.../testmodel.py``): - Answer extraction defaults to the same rules as ``extract_choice_letter`` (strip after an - ``assistant`` marker, then leading ``A``–``D``, else first ``\\b[A-D]\\b``). Set env + ``assistant`` marker, then leading ``A``–``D``, else ``\\b[A-D]\\b``, else a CJK-safe + non-letter-boundary pass). Set env ``DAILY_OMNI_EXTRACT_MODE=relaxed`` to use the older vLLM-Omni heuristics (last ``answer:``, tail scan, etc.). - Overall accuracy comparable to the official script uses **successful HTTP responses only** as @@ -52,6 +53,16 @@ def extract_choice_letter_official(text: str | None) -> str | None: fallback = re.search(r"\b([A-D])\b", candidate.upper()) if fallback: return fallback.group(1) + # Unicode-aware ``\b`` treats CJK as word chars, so "选B" / "答案是B" has no boundary before B.
+ loose = list( + re.finditer( + r"(?:[^A-Za-z]|^)([A-D])(?:[^A-Za-z]|$)", + candidate, + flags=re.IGNORECASE, + ) + ) + if loose: + return loose[-1].group(1).upper() return None diff --git a/vllm_omni/benchmarks/data_modules/seed_tts_dataset.py b/vllm_omni/benchmarks/data_modules/seed_tts_dataset.py index ca6de4cb202..4f072e1a2eb 100644 --- a/vllm_omni/benchmarks/data_modules/seed_tts_dataset.py +++ b/vllm_omni/benchmarks/data_modules/seed_tts_dataset.py @@ -94,7 +94,7 @@ def _load_meta_rows(meta_file: Path) -> list[_SeedTTSRow]: return rows -def resolve_seed_tts_root(dataset_path: str | None, *, explicit_root: str | None) -> Path: +def resolve_seed_tts_root(dataset_path: str | None, *, explicit_root: str | None, locale: str = "en") -> Path: """Return directory containing ``{locale}/meta.lst`` and ``{locale}/prompt-wavs/``.""" if explicit_root: root = Path(explicit_root).expanduser().resolve() @@ -117,7 +117,14 @@ def resolve_seed_tts_root(dataset_path: str | None, *, explicit_root: str | None "Install huggingface_hub to download Seed-TTS from the Hub, or clone the dataset " "locally and pass --dataset-path / --seed-tts-root to that directory." ) from e - cache = snapshot_download(repo_id=repo_id, repo_type="dataset") + # Download only the requested locale subtree instead of the whole dataset + # repo. This avoids large, flaky nightly downloads when we only need e.g. + # ``en/meta.lst`` + ``en/prompt-wavs/**``. 
+ cache = snapshot_download( + repo_id=repo_id, + repo_type="dataset", + allow_patterns=[f"{locale}/**"], + ) return Path(cache).resolve() @@ -172,7 +179,7 @@ def __init__( disable_shuffle=disable_shuffle, **kwargs, ) - self._root = resolve_seed_tts_root(self.dataset_path, explicit_root=self._explicit_root) + self._root = resolve_seed_tts_root(self.dataset_path, explicit_root=self._explicit_root, locale=self.locale) self._rows: list[_SeedTTSRow] = [] self.load_data() diff --git a/vllm_omni/benchmarks/data_modules/seed_tts_eval.py b/vllm_omni/benchmarks/data_modules/seed_tts_eval.py index d5f1b64709f..d8c37af1300 100644 --- a/vllm_omni/benchmarks/data_modules/seed_tts_eval.py +++ b/vllm_omni/benchmarks/data_modules/seed_tts_eval.py @@ -1,7 +1,7 @@ """Seed-TTS WER aligned with Bytedance ``seed-tts-eval`` / ``run_wer.py``. Matches the published protocol (see Hugging Face dataset card and -https://github.com/BytedanceSpeech/seed-tts-eval): +https://github.com/zhaochenyang20/seed-tts-eval): - **EN**: ``openai/whisper-large-v3`` via ``transformers``, audio resampled to **16 kHz** (same as ``run_wer.py``). 
@@ -489,7 +489,7 @@ def compute_seed_tts_wer_metrics( sim_failed = 0 sim_skipped_no_ref = 0 utmos_failed = 0 - utmos_on = _eval_submetric_enabled("SEED_TTS_UTMOS_EVAL", default=True) + utmos_on = _eval_submetric_enabled("SEED_TTS_UTMOS_EVAL", default=False) for req, out in zip(input_requests, outputs, strict=True): assert isinstance(req, SeedTTSSampleRequest) @@ -622,7 +622,7 @@ def compute_seed_tts_wer_metrics( errs.append(wer) sim_v: float | None = None - if _eval_submetric_enabled("SEED_TTS_SIM_EVAL", default=True): + if _eval_submetric_enabled("SEED_TTS_SIM_EVAL", default=False): ref_path = getattr(req, "seed_tts_ref_wav_path", "") or "" if ref_path and os.path.isfile(ref_path): try: diff --git a/vllm_omni/entrypoints/cli/benchmark/serve.py b/vllm_omni/entrypoints/cli/benchmark/serve.py index d281432e59b..44d38303fce 100644 --- a/vllm_omni/entrypoints/cli/benchmark/serve.py +++ b/vllm_omni/entrypoints/cli/benchmark/serve.py @@ -33,6 +33,8 @@ def add_daily_omni_cli_args(parser: argparse.ArgumentParser) -> None: default=None, help="Root directory of extracted Daily-Omni videos (contents of Videos.tar: " "each video_id in its own subdir with {video_id}_video.mp4). " + "If omitted, Videos.tar is downloaded from the Hugging Face dataset repo on first multimodal " + "request. " "When using file URLs, you MUST start the vLLM server with " "--allowed-local-media-path set to this same directory (or a parent), " "otherwise requests fail with 'Cannot load local files without " @@ -116,7 +118,7 @@ def add_seed_tts_cli_args(parser: argparse.ArgumentParser) -> None: default=False, help="Keep synthesized audio as 24 kHz mono PCM for WER (works with " "--backend openai-audio-speech or openai-chat-omni). Scoring follows " - "BytedanceSpeech/seed-tts-eval (Whisper-large-v3 / Paraformer-zh + jiwer). " + "zhaochenyang20/seed-tts-eval (Whisper-large-v3 / Paraformer-zh + jiwer). " "Sets SEED_TTS_WER_EVAL=1. Install: pip install 'vllm-omni[seed-tts-eval]'. 
" "Optional: SEED_TTS_EVAL_DEVICE, SEED_TTS_HF_WHISPER_MODEL.", )